# coding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, NORM
from ..punctuation import TOKENIZER_PREFIXES

# TODO:
# - tokenize the cliticised definite article "d'" as a token of its own:
#   d'Kanner > [d'] [Kanner]
# - treat other apostrophes within words as part of the word: [op d'mannst],
#   [fir d'éischt] (handled as exceptions below)

# How to write the tokenisation exception for the articles d' / D'?
# This approach (removing the apostrophe prefixes so the prefix rules do not
# split them off) is not working yet.
_prefixes = [
    prefix
    for prefix in TOKENIZER_PREFIXES
    if prefix not in ["d'", "D'", "d’", "D’", r"\' "]
]

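# A sketch of what the filter above does, assuming TOKENIZER_PREFIXES behaves
# like a plain list of prefix strings (the `demo` list is illustrative only):
#
#     >>> demo = ["(", "d'", "D'", "["]
#     >>> [p for p in demo if p not in ["d'", "D'", "d’", "D’", r"\' "]]
#     ['(', '[']
#
# With "d'" gone from the prefixes, the tokenizer should fall back to the
# exceptions below to split the article off as its own token.
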
_exc = {
    "d'mannst": [
        {ORTH: "d'", LEMMA: "d'"},
        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"}],
    "d'éischt": [
        {ORTH: "d'", LEMMA: "d'"},
        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"}]
}
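# Each key is the exact surface string and its value is the list of subtokens
# it should be split into. A sketch of the intended behaviour, assuming a
# hypothetical Luxembourgish pipeline `nlp` (not yet verified, per the note
# above):
#
#     >>> [t.text for t in nlp("op d'mannst")]
#     ['op', "d'", 'mannst']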

# TODO: translate / delete whatever is not necessary for Luxembourgish.
# Note: PRON_LEMMA (spaCy's "-PRON-" placeholder lemma for pronouns) is not
# needed for any of these entries.
for exc_data in [
    {ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelift"},
    {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
    {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
    {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
    {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
    {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
    {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
    {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}]:
    _exc[exc_data[ORTH]] = [exc_data]
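# The loop above turns each dict into a one-token exception keyed by its
# surface form, e.g.:
#
#     _exc["wgl."] == [{ORTH: "wgl.", LEMMA: "wann ech gelift",
#                       NORM: "wann ech gelift"}]
#
# so "wgl." stays a single token instead of being split at the period.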


# To be extended: abbreviations that keep their surface form
for orth in [
    "z.B.", "Dipl.", "Dr.", "etc.", "i.e.", "o.k.", "O.K.", "p.a.", "p.s.",
    "P.S.", "phil.", "q.e.d.", "R.I.P.", "rer.", "sen.", "ë.a.", "U.S.",
    "U.S.A."]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_PREFIXES = _prefixes
TOKENIZER_EXCEPTIONS = _exc
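
# Minimal usage sketch (assumes spaCy 2.x, where this module would live under
# spacy/lang/lb and the language class is called Luxembourgish; the input
# text is illustrative):
#
#     from spacy.lang.lb import Luxembourgish
#     nlp = Luxembourgish()
#     doc = nlp("wgl. op d'mannst")
#     print([t.text for t in doc])  # expected: ['wgl.', 'op', "d'", 'mannst']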