2023-06-26 12:41:03 +03:00
|
|
|
|
from ...symbols import NORM, ORTH
|
2020-07-22 23:18:46 +03:00
|
|
|
|
from ...util import update_exc
|
2023-06-26 12:41:03 +03:00
|
|
|
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
|
2018-07-10 14:48:38 +03:00
|
|
|
|
# Elided / contracted surface forms, grouped by the NORM they are mapped to.
# Each form becomes a single-token exception whose ORTH is the form itself.
# Keep exactly one NORM per surface form: a form listed under two NORMs would
# silently end up with whichever entry comes last.
_contractions = [
    ("από", ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]),
    ("αλλά", ["Αλλ'", "αλλ'"]),
    ("παρά", ["παρ'", "Παρ'", "ΠΑΡ'"]),
    ("κάθε", ["καθ'", "Καθ'"]),
    ("κατά", ["κατ'", "Κατ'"]),
    ("είμαι", ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]),
    ("επί", ["Επ'", "επ'", "εφ'", "Εφ'"]),
    ("δια", ["Δι'", "δι'"]),
    ("έχω", ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]),
    ("υπό", ["υπ'", "Υπ'"]),
    ("μετά", ["Μετ'", "ΜΕΤ'", "'μετ"]),
    ("με", ["Μ'", "μ'"]),
    ("για", ["Γι'", "ΓΙ'", "γι'"]),
    ("σε", ["Σ'", "σ'"]),
    ("θα", ["Θ'", "θ'"]),
    ("να", ["Ν'", "ν'"]),
    ("το", ["Τ'", "τ'"]),
    # NOTE(review): "'σένα" looks like an elided "εσένα" rather than a form
    # of "εγώ" -- confirm the intended NORM.
    ("εγώ", ["'γω", "'σένα", "'μεις"]),
    ("φέρνω", ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]),
    ("έρχομαι", ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]),
    ("λέγω", ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]),
    ("παίρνω", ["Πάρ'", "πάρ'"]),
    ("μέσα", ["μέσ'", "Μέσ'", "μεσ'"]),
    ("δένω", ["Δέσ'", "Δεσ'", "δεσ'"]),
    ("κάνω", ["'κανε", "Κάν'"]),
]

# Maps an exception string to the list of token attribute dicts it expands to.
_exc = {}

for norm, tokens in _contractions:
    for token in tokens:
        _exc[token] = [{ORTH: token, NORM: norm}]
|
2018-07-10 14:48:38 +03:00
|
|
|
|
|
|
|
|
|
# One-off elisions that each have their own NORM, given as
# (surface form, NORM) pairs and expanded into single-token exceptions.
_other_exc = {
    form: [{ORTH: form, NORM: norm}]
    for form, norm in (
        ("κι", "και"),
        ("Παίξ'", "παίζω"),
        ("Αντ'", "αντί"),
        ("ολ'", "όλος"),
        ("ύστερ'", "ύστερα"),
        ("'πρεπε", "πρέπει"),
        ("Δύσκολ'", "δύσκολος"),
        ("'θελα", "θέλω"),
        ("'γραφα", "γράφω"),
        ("'παιρνα", "παίρνω"),
        ("'δειξε", "δείχνω"),
        ("όμουρφ'", "όμορφος"),
        ("κ'τσή", "κουτσός"),
        ("μηδ'", "μήδε"),
        ("'ξομολογήθηκε", "εξομολογούμαι"),
        ("'μας", "εμάς"),
        ("'ξερες", "ξέρω"),
        ("έφθασ'", "φθάνω"),
        ("εξ'", "εκ"),
        ("δώσ'", "δίνω"),
        ("τίποτ'", "τίποτα"),
        ("Λήξ'", "λήγω"),
        ("άσ'", "αφήνω"),
        ("Στ'", "στο"),
        ("Δωσ'", "δίνω"),
        ("Βάψ'", "βάφω"),
        # Also registered, with the same NORM, by the contraction entries
        # earlier in this module.
        ("Αλλ'", "αλλά"),
        ("Αμ'", "άμα"),
        ("Αγόρασ'", "αγοράζω"),
        ("'φύγε", "φεύγω"),
        ("'φερε", "φέρνω"),
        ("'φαγε", "τρώω"),
        ("'σπαγαν", "σπάω"),
        ("'σκασε", "σκάω"),
        ("'σβηνε", "σβήνω"),
        ("'ριξε", "ρίχνω"),
        ("'κλεβε", "κλέβω"),
        ("'κει", "εκεί"),
        ("'βλεπε", "βλέπω"),
        ("'βγαινε", "βγαίνω"),
    )
}

_exc.update(_other_exc)
|
|
|
|
|
|
|
|
|
|
# Clock times written with the period marker glued to the hour (hours 1-12):
# split into the hour and the marker, normalising the marker to its dotted
# spelling.
for h in range(1, 13):
    hour = str(h)
    for norm, spellings in (
        ("π.μ.", ("π.μ.", "πμ")),
        ("μ.μ.", ("μ.μ.", "μμ")),
    ):
        for period in spellings:
            _exc[hour + period] = [
                {ORTH: hour},
                {ORTH: period, NORM: norm},
            ]
|
2018-07-10 14:48:38 +03:00
|
|
|
|
|
|
|
|
|
# Abbreviations kept as a single token and given an expanded NORM,
# listed as (abbreviation, expansion) pairs.
for abbreviation, expansion in (
    ("ΑΓΡ.", "Αγροτικός"),
    ("Αγ. Γρ.", "Αγία Γραφή"),
    ("Αθ.", "Αθανάσιος"),
    ("Αλεξ.", "Αλέξανδρος"),
    ("Απρ.", "Απρίλιος"),
    ("Αύγ.", "Αύγουστος"),
    ("Δεκ.", "Δεκέμβριος"),
    ("Δημ.", "Δήμος"),
    ("Ιαν.", "Ιανουάριος"),
    ("Ιούλ.", "Ιούλιος"),
    ("Ιούν.", "Ιούνιος"),
    ("Ιωαν.", "Ιωάννης"),
    ("Μ. Ασία", "Μικρά Ασία"),
    ("Μάρτ.", "Μάρτιος"),
    ("Μάρτ'", "Μάρτιος"),
    ("Νοέμβρ.", "Νοέμβριος"),
    ("Οκτ.", "Οκτώβριος"),
    ("Σεπτ.", "Σεπτέμβριος"),
    ("Φεβρ.", "Φεβρουάριος"),
):
    _exc[abbreviation] = [{ORTH: abbreviation, NORM: expansion}]
|
|
|
|
|
|
|
|
|
|
# Abbreviations, acronyms and other strings that must stay a single token
# and need no NORM. Each string must appear only once.
# fmt: off
for orth in [
    "$ΗΠΑ",
    "Α'", "Α.Ε.", "Α.Ε.Β.Ε.", "Α.Ε.Ι.", "Α.Ε.Π.", "Α.Μ.Α.", "Α.Π.Θ.", "Α.Τ.",
    "Α.Χ.", "ΑΝ.", "Αγ.", "Αλ.", "Αν.", "Αντ.", "Απ.",
    "Β'", "Β)", "Β.Ζ.", "Β.Ι.Ο.", "Β.Κ.", "Β.Μ.Α.", "Βασ.",
    "Γ'", "Γ)", "Γ.Γ.", "Γ.Δ.", "Γκ.",
    "Δ.Ε.Η.", "Δ.Ε.Σ.Ε.", "Δ.Ν.", "Δ.Ο.Υ.", "Δ.Σ.", "Δ.Υ.", "ΔΙ.ΚΑ.Τ.Σ.Α.",
    "Δηλ.", "Διον.",
    "Ε.Α.", "Ε.Α.Κ.", "Ε.Α.Π.", "Ε.Ε.", "Ε.Κ.", "Ε.ΚΕ.ΠΙΣ.", "Ε.Λ.Α.",
    "Ε.Λ.Ι.Α.", "Ε.Π.Σ.", "Ε.Π.Τ.Α.", "Ε.Σ.Ε.Ε.Κ.", "Ε.Υ.Κ.", "ΕΕ.", "ΕΚ.",
    "ΕΛ.", "ΕΛ.ΑΣ.", "Εθν.", "Ελ.", "Εμ.", "Επ.", "Ευ.",
    "Η'", "Η.Π.Α.",
    "ΘΕ.", "Θεμ.", "Θεοδ.", "Θρ.",
    "Ι.Ε.Κ.", "Ι.Κ.Α.", "Ι.Κ.Υ.", "Ι.Σ.Θ.", "Ι.Χ.", "ΙΖ'", "ΙΧ.",
    "Κ.Α.Α.", "Κ.Α.Ε.", "Κ.Β.Σ.", "Κ.Δ.", "Κ.Ε.", "Κ.Ε.Κ.", "Κ.Ι.", "Κ.Κ.",
    "Κ.Ι.Θ.", "Κ.ΚΕΚ.", "Κ.Ο.", "Κ.Π.Ρ.", "ΚΑΤ.", "ΚΚ.", "Καν.", "Καρ.",
    "Κατ.", "Κυρ.", "Κων.",
    "Λ.Α.", "Λ.χ.", "Λ.Χ.", "Λεωφ.", "Λι.",
    "Μ.Δ.Ε.", "Μ.Ε.Ο.", "Μ.Ζ.", "Μ.Μ.Ε.", "Μ.Ο.", "Μεγ.", "Μιλτ.", "Μιχ.",
    "Ν.Δ.", "Ν.Ε.Α.", "Ν.Κ.", "Ν.Ο.", "Ν.Ο.Θ.", "Ν.Π.Δ.Δ.", "Ν.Υ.", "ΝΔ.",
    "Νικ.", "Ντ'", "Ντ.",
    "Ο'", "Ο.Α.", "Ο.Α.Ε.Δ.", "Ο.Δ.", "Ο.Ε.Ε.", "Ο.Ε.Ε.Κ.", "Ο.Η.Ε.", "Ο.Κ.",
    "Π.Δ.", "Π.Ε.Κ.Δ.Υ.", "Π.Ε.Π.", "Π.Μ.Σ.", "ΠΟΛ.", "Π.Χ.", "Παρ.", "Πλ.",
    "Πρ.",
    "Σ.Δ.Ο.Ε.", "Σ.Ε.", "Σ.Ε.Κ.", "Σ.Π.Δ.Ω.Β.", "Σ.Τ.", "Σαβ.", "Στ.", "ΣτΕ.",
    "Στρ.",
    "Τ.Α.", "Τ.Ε.Ε.", "Τ.Ε.Ι.", "ΤΡ.", "Τζ.", "Τηλ.",
    "Υ.Γ.", "ΥΓ.", "ΥΠ.Ε.Π.Θ.",
    "Φ.Α.Β.Ε.", "Φ.Κ.", "Φ.Σ.", "Φ.Χ.", "Φ.Π.Α.", "Φιλ.",
    "Χ.Α.Α.", "ΧΡ.", "Χ.Χ.", "Χαρ.", "Χιλ.", "Χρ.",
    "άγ.", "άρθρ.", "αι.", "αν.", "απ.", "αρ.", "αριθ.", "αριθμ.",
    "β'", "βλ.",
    "γ.γ.", "γεν.", "γραμμ.",
    "δ.δ.", "δ.σ.", "δηλ.", "δισ.", "δολ.", "δρχ.",
    "εκ.", "εκατ.", "ελ.",
    "θιν'",
    "κ.", "κ.ά.", "κ.α.", "κ.κ.", "κ.λπ.", "κ.ο.κ.", "κ.τ.λ.", "κλπ.", "κτλ.",
    "κυβ.",
    "λ.χ.",
    "μ.", "μ.Χ.", "μ.μ.", "μιλ.",
    "ντ'",
    "π.Χ.", "π.β.", "π.δ.", "π.μ.", "π.χ.",
    "σ.", "σ.α.λ.", "σ.σ.", "σελ.", "στρ.",
    "τ'ς", "τ.μ.", "τετ.", "τετρ.", "τηλ.", "τρισ.", "τόν.",
    "υπ.",
    "χ.μ.", "χγρ.", "χιλ.", "χλμ.",
]:
    _exc[orth] = [{ORTH: orth}]
# fmt: on
|
|
|
|
|
|
2020-07-22 23:18:46 +03:00
|
|
|
|
# Exception table exported by this module: the shared BASE_EXCEPTIONS
# combined with the Greek-specific entries collected in `_exc`.
# NOTE(review): presumably `update_exc` lets the entries in `_exc` take
# precedence over the base ones -- confirm against its definition in util.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|