2023-06-26 12:41:03 +03:00
|
|
|
from ...symbols import NORM, ORTH
|
2020-07-22 23:18:46 +03:00
|
|
|
from ...util import update_exc
|
2023-06-26 12:41:03 +03:00
|
|
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
2016-12-18 17:40:22 +03:00
|
|
|
|
2017-05-08 16:44:26 +03:00
|
|
|
# Table of tokenizer special cases, keyed by the exact surface string.
# Each value is the list of per-token attribute dicts that the string is split
# into; the ORTH values of one entry concatenate back to its key
# (e.g. "auf" + "'m" == "auf'm").
# NOTE(review): NORM presumably carries the normalized form of the clitic
# ("'m" -> "dem", "'s" -> "es") -- confirm against the symbols module.
_exc = {
    "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
    "du's": [{ORTH: "du"}, {ORTH: "'s", NORM: "es"}],
    "er's": [{ORTH: "er"}, {ORTH: "'s", NORM: "es"}],
    "hinter'm": [{ORTH: "hinter"}, {ORTH: "'m", NORM: "dem"}],
    "ich's": [{ORTH: "ich"}, {ORTH: "'s", NORM: "es"}],
    "ihr's": [{ORTH: "ihr"}, {ORTH: "'s", NORM: "es"}],
    "sie's": [{ORTH: "sie"}, {ORTH: "'s", NORM: "es"}],
    "unter'm": [{ORTH: "unter"}, {ORTH: "'m", NORM: "dem"}],
    "vor'm": [{ORTH: "vor"}, {ORTH: "'m", NORM: "dem"}],
    "wir's": [{ORTH: "wir"}, {ORTH: "'s", NORM: "es"}],
    "über'm": [{ORTH: "über"}, {ORTH: "'m", NORM: "dem"}],
}
# Single-token special cases. Each dict below is registered in _exc under its
# own ORTH string as a one-element token list, so the string is kept as one
# token. Entries that also set NORM attach that value to the token; entries
# with only ORTH (e.g. "d.h.", "z.B.") set no NORM here.
for exc_data in [
    {ORTH: "'S", NORM: "'s"},
    {ORTH: "'s", NORM: "'s"},
    {ORTH: "S'", NORM: "'s"},
    {ORTH: "s'", NORM: "'s"},
    {ORTH: "'n", NORM: "ein"},
    {ORTH: "'ne", NORM: "eine"},
    {ORTH: "'nen", NORM: "einen"},
    {ORTH: "'nem", NORM: "einem"},
    {ORTH: "Abb.", NORM: "Abbildung"},
    {ORTH: "Abk.", NORM: "Abkürzung"},
    {ORTH: "Abt.", NORM: "Abteilung"},
    {ORTH: "Apr.", NORM: "April"},
    {ORTH: "Aug.", NORM: "August"},
    {ORTH: "Bd.", NORM: "Band"},
    {ORTH: "Betr.", NORM: "Betreff"},
    {ORTH: "Bf.", NORM: "Bahnhof"},
    {ORTH: "Bhf.", NORM: "Bahnhof"},
    {ORTH: "Bsp.", NORM: "Beispiel"},
    {ORTH: "Dez.", NORM: "Dezember"},
    {ORTH: "Di.", NORM: "Dienstag"},
    {ORTH: "Do.", NORM: "Donnerstag"},
    {ORTH: "Fa.", NORM: "Firma"},
    {ORTH: "Fam.", NORM: "Familie"},
    {ORTH: "Feb.", NORM: "Februar"},
    {ORTH: "Fr.", NORM: "Frau"},
    {ORTH: "Frl.", NORM: "Fräulein"},
    {ORTH: "Hbf.", NORM: "Hauptbahnhof"},
    {ORTH: "Hr.", NORM: "Herr"},
    {ORTH: "Hrn.", NORM: "Herrn"},
    {ORTH: "Jan.", NORM: "Januar"},
    {ORTH: "Jh.", NORM: "Jahrhundert"},
    {ORTH: "Jhd.", NORM: "Jahrhundert"},
    {ORTH: "Jul.", NORM: "Juli"},
    {ORTH: "Jun.", NORM: "Juni"},
    {ORTH: "Mi.", NORM: "Mittwoch"},
    {ORTH: "Mio.", NORM: "Million"},
    {ORTH: "Mo.", NORM: "Montag"},
    {ORTH: "Mrd.", NORM: "Milliarde"},
    {ORTH: "Mrz.", NORM: "März"},
    {ORTH: "MwSt.", NORM: "Mehrwertsteuer"},
    {ORTH: "Mär.", NORM: "März"},
    {ORTH: "Nov.", NORM: "November"},
    {ORTH: "Nr.", NORM: "Nummer"},
    {ORTH: "Okt.", NORM: "Oktober"},
    {ORTH: "Orig.", NORM: "Original"},
    {ORTH: "Pkt.", NORM: "Punkt"},
    {ORTH: "Prof.", NORM: "Professor"},
    {ORTH: "Red.", NORM: "Redaktion"},
    {ORTH: "Sa.", NORM: "Samstag"},
    {ORTH: "Sep.", NORM: "September"},
    {ORTH: "Sept.", NORM: "September"},
    {ORTH: "So.", NORM: "Sonntag"},
    {ORTH: "Std.", NORM: "Stunde"},
    {ORTH: "Str.", NORM: "Straße"},
    {ORTH: "Tel.", NORM: "Telefon"},
    {ORTH: "Tsd.", NORM: "Tausend"},
    {ORTH: "Univ.", NORM: "Universität"},
    {ORTH: "abzgl.", NORM: "abzüglich"},
    {ORTH: "allg.", NORM: "allgemein"},
    {ORTH: "bspw.", NORM: "beispielsweise"},
    {ORTH: "bzgl.", NORM: "bezüglich"},
    {ORTH: "bzw.", NORM: "beziehungsweise"},
    {ORTH: "d.h."},
    {ORTH: "dgl.", NORM: "dergleichen"},
    {ORTH: "ebd.", NORM: "ebenda"},
    {ORTH: "eigtl.", NORM: "eigentlich"},
    {ORTH: "engl.", NORM: "englisch"},
    {ORTH: "evtl.", NORM: "eventuell"},
    {ORTH: "frz.", NORM: "französisch"},
    {ORTH: "gegr.", NORM: "gegründet"},
    {ORTH: "ggf.", NORM: "gegebenenfalls"},
    {ORTH: "ggfs.", NORM: "gegebenenfalls"},
    {ORTH: "ggü.", NORM: "gegenüber"},
    {ORTH: "i.O."},
    {ORTH: "i.d.R."},
    {ORTH: "incl.", NORM: "inklusive"},
    {ORTH: "inkl.", NORM: "inklusive"},
    {ORTH: "insb.", NORM: "insbesondere"},
    {ORTH: "kath.", NORM: "katholisch"},
    {ORTH: "lt.", NORM: "laut"},
    {ORTH: "max.", NORM: "maximal"},
    {ORTH: "min.", NORM: "minimal"},
    {ORTH: "mind.", NORM: "mindestens"},
    {ORTH: "mtl.", NORM: "monatlich"},
    {ORTH: "n.Chr."},
    {ORTH: "orig.", NORM: "original"},
    {ORTH: "röm.", NORM: "römisch"},
    {ORTH: "s.o."},
    {ORTH: "sog."},
    {ORTH: "stellv."},
    {ORTH: "tägl.", NORM: "täglich"},
    {ORTH: "u.U."},
    {ORTH: "u.s.w."},
    {ORTH: "u.v.m."},
    {ORTH: "usf."},
    {ORTH: "usw."},
    {ORTH: "uvm."},
    {ORTH: "v.Chr."},
    {ORTH: "v.a."},
    {ORTH: "v.l.n.r."},
    {ORTH: "vgl.", NORM: "vergleiche"},
    {ORTH: "vllt.", NORM: "vielleicht"},
    {ORTH: "vlt.", NORM: "vielleicht"},
    {ORTH: "z.B."},
    {ORTH: "z.Bsp."},
    {ORTH: "z.T."},
    {ORTH: "z.Z."},
    {ORTH: "z.Zt."},
    {ORTH: "z.b."},
    {ORTH: "zzgl."},
    {ORTH: "österr.", NORM: "österreichisch"},
]:
    # Key the entry by its own surface form; the value is a one-token list.
    _exc[exc_data[ORTH]] = [exc_data]
# Strings that are registered as a single token with no attribute other than
# ORTH: each one is added to _exc under itself as a one-element token list.
for orth in [
    "``",
    "''",
    "A.C.",
    "a.D.",
    "A.D.",
    "A.G.",
    "a.M.",
    "a.Z.",
    "Abs.",
    "adv.",
    "al.",
    "B.A.",
    "B.Sc.",
    "betr.",
    "biol.",
    "Biol.",
    "ca.",
    "CDU/CSU",
    "Chr.",
    "Cie.",
    "c/o",
    "co.",
    "Co.",
    "d'",
    "D.C.",
    "Dipl.-Ing.",
    "Dipl.",
    "Dr.",
    "e.g.",
    "e.V.",
    "ehem.",
    "entspr.",
    "erm.",
    "etc.",
    "ev.",
    "G.m.b.H.",
    "geb.",
    "Gebr.",
    "gem.",
    "h.c.",
    "Hg.",
    "hrsg.",
    "Hrsg.",
    "i.A.",
    "i.e.",
    "i.G.",
    "i.Tr.",
    "i.V.",
    "I.",
    "II.",
    "III.",
    "IV.",
    "Inc.",
    "Ing.",
    "jr.",
    "Jr.",
    "jun.",
    "jur.",
    "K.O.",
    "L'",
    "L.A.",
    "lat.",
    "M.A.",
    "m.E.",
    "m.M.",
    "M.Sc.",
    "Mr.",
    "N.Y.",
    "N.Y.C.",
    "nat.",
    "o.a.",
    "o.ä.",
    "o.g.",
    "o.k.",
    "O.K.",
    "p.a.",
    "p.s.",
    "P.S.",
    "pers.",
    "phil.",
    "q.e.d.",
    "R.I.P.",
    "rer.",
    "sen.",
    "St.",
    "std.",
    "u.a.",
    "U.S.",
    "U.S.A.",
    "U.S.S.",
    "Vol.",
    "vs.",
    "wiss.",
]:
    # The surface form is both the lookup key and the token's ORTH value.
    _exc[orth] = [{ORTH: orth}]
# Exported table: the result of update_exc applied to the shared
# BASE_EXCEPTIONS and the language-specific _exc entries built in this module.
# NOTE(review): update_exc presumably merges the two tables, with _exc taking
# precedence on key collisions -- confirm against the util module.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)