From 1eb7cc3017a6def34fb448781578888764d1e659 Mon Sep 17 00:00:00 2001
From: Jim O'Regan
Date: Mon, 26 Jun 2017 21:24:55 +0100
Subject: [PATCH] attempt a port from #1147

---
 spacy/lang/ga/__init__.py             |  24 ++++++
 spacy/lang/ga/stop_words.py           |  45 ++++++++++
 spacy/lang/ga/tokenizer_exceptions.py | 115 ++++++++++++++++++++++++++
 3 files changed, 184 insertions(+)
 create mode 100644 spacy/lang/ga/__init__.py
 create mode 100644 spacy/lang/ga/stop_words.py
 create mode 100644 spacy/lang/ga/tokenizer_exceptions.py

diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
new file mode 100644
index 000000000..8231cc925
--- /dev/null
+++ b/spacy/lang/ga/__init__.py
@@ -0,0 +1,24 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class Irish(Language):
+    lang = 'ga'
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'ga'
+
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        stop_words = set(STOP_WORDS)
+
+
+__all__ = ['Irish']
diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py
new file mode 100644
index 000000000..816c00b13
--- /dev/null
+++ b/spacy/lang/ga/stop_words.py
@@ -0,0 +1,45 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+STOP_WORDS = set("""
+a ach ag agus an aon ar arna as
+
+ba beirt bhúr
+
+caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear
+
+daichead dar de deich deichniúr den dhá do don dtí dá dár dó
+
+faoi faoin faoina faoinár fara fiche
+
+gach gan go gur
+
+haon hocht
+
+i iad idir in ina ins inár is
+
+le leis lena lenár
+
+mar mo muid mé
+
+na nach naoi naonúr ná ní níor nó nócha
+
+ocht ochtar ochtó os
+
+roimh
+
+sa seacht seachtar seachtó seasca seisear siad sibh sinn sna sé sí
+
+tar thar thú triúr trí trína trínár tríocha tú
+
+um
+
+ár
+
+é éis
+
+í
+
+ó ón óna ónár
+""".split())
diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
new file mode 100644
index 000000000..ce280a3a2
--- /dev/null
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -0,0 +1,115 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import ORTH, LEMMA, NORM
+
+
+_exc = {
+    "'acha'n": [
+        {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
+        {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}],
+
+    "dem'": [
+        {ORTH: "de", LEMMA: "de", NORM: "de"},
+        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
+
+    "ded'": [
+        {ORTH: "de", LEMMA: "de", NORM: "de"},
+        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
+
+    "lem'": [
+        {ORTH: "le", LEMMA: "le", NORM: "le"},
+        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
+
+    "led'": [
+        {ORTH: "le", LEMMA: "le", NORM: "le"},
+        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
+
+    "a.C.n.": [
+        {ORTH: "a.", LEMMA: "ante"},
+        {ORTH: "C.", LEMMA: "Christum"},
+        {ORTH: "n.", LEMMA: "natum"}],
+
+    "m.sh.": [
+        {ORTH: "m.", LEMMA: "mar"},
+        {ORTH: "sh.", LEMMA: "sampla"}],
+
+    "M.F.": [
+        {ORTH: "M.", LEMMA: "Meán"},
+        {ORTH: "F.", LEMMA: "Fómhar"}],
+
+    "M.Fómh.": [
+        {ORTH: "M.", LEMMA: "Meán"},
+        {ORTH: "Fómh.", LEMMA: "Fómhar"}],
+
+    "R.C.": [
+        {ORTH: "R.", LEMMA: "roimh"},
+        {ORTH: "C.", LEMMA: "Críost"}],
+
+    "r.Ch.": [
+        {ORTH: "r.", LEMMA: "roimh"},
+        {ORTH: "Ch.", LEMMA: "Críost"}],
+
+    "r.Chr.": [
+        {ORTH: "r.", LEMMA: "roimh"},
{ORTH: "Chr.", LEMMA: "Críost"}], + + "R.Ch.": [ + {ORTH: "R.", LEMMA: "roimh"}, + {ORTH: "Ch.", LEMMA: "Críost"}], + + "R.Chr.": [ + {ORTH: "R.", LEMMA: "roimh"}, + {ORTH: "Chr.", LEMMA: "Críost"}], + + "⁊rl.": [ + {ORTH: "⁊", LEMMA: "agus"}, + {ORTH: "rl.", LEMMA: "araile"}], + + "srl.": [ + {ORTH: "s", LEMMA: "agus"}, + {ORTH: "rl.", LEMMA: "araile"}], + +} + +for exc_data in [ + {ORTH: "'gus", LEMMA: "agus", NORM: "agus"}, + {ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, + {ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, + {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"}, + {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"}, + {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"}, + {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"}, + {ORTH: "m'", LEMMA: "mo"},, + {ORTH: "Aib.", LEMMA: "Aibreán"}, + {ORTH: "Ath.", LEMMA: "athair"}, + {ORTH: "Beal.", LEMMA: "Bealtaine"}, + {ORTH: "Co.", LEMMA: "contae"}, + {ORTH: "Ean.", LEMMA: "Eanáir"}, + {ORTH: "Feab.", LEMMA: "Feabhra"}, + {ORTH: "gCo.", LEMMA: "contae"}, + {ORTH: ".i.", LEMMA: "eadhon"}, + {ORTH: "lch.", LEMMA: "leathanach"}, + {ORTH: "Lch.", LEMMA: "leathanach"}, + {ORTH: "lgh.", LEMMA: "leathanach"}, + {ORTH: "Lgh.", LEMMA: "leathanach"}, + {ORTH: "Lún.", LEMMA: "Lúnasa"}, + {ORTH: "Már.", LEMMA: "Márta"}, + {ORTH: "Meith.", LEMMA: "Meitheamh"}, + {ORTH: "Noll.", LEMMA: "Nollaig"}, + {ORTH: "Samh.", LEMMA: "Samhain"}, + {ORTH: "tAth.", LEMMA: "athair"}, + {ORTH: "tUas.", LEMMA: "Uasal"}, + {ORTH: "teo.", LEMMA: "teoranta"}, + {ORTH: "Teo.", LEMMA: "teoranta"}, + {ORTH: "Uas.", LEMMA: "Uasal"}, + {ORTH: "uimh.", LEMMA: "uimhir"}, + {ORTH: "Uimh.", LEMMA: "uimhir"}]: + _exc[exc_data[ORTH]] = [dict(exc_data)], + +for orth in [ + "d'"]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = dict(_exc)