attempt a port from #1147

Jim O'Regan 2017-06-26 21:24:55 +01:00
parent 593361ee3c
commit 1eb7cc3017
3 changed files with 184 additions and 0 deletions

24
spacy/lang/ga/__init__.py Normal file

@@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
class Irish(Language):
    lang = 'ga'

    class Defaults(Language.Defaults):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'ga'

        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
        stop_words = set(STOP_WORDS)


__all__ = ['Irish']
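For orientation (not part of the diff), a minimal usage sketch: assuming the module is importable as spacy.lang.ga and that the Language subclass can be instantiated directly, as with other spaCy 2.x-era language classes, the new defaults drive tokenization roughly like this:

from spacy.lang.ga import Irish

nlp = Irish()  # builds a tokenizer from the Defaults above
doc = nlp("Tá an freagra ar lch. 7 den leabhar.")
print([t.text for t in doc])
# the "lch." entry in tokenizer_exceptions.py keeps the abbreviation together
# with its full stop as one token and supplies the lemma "leathanach"

The exact loading call may differ by spaCy version; the point is only that Defaults wires the stop words and tokenizer exceptions below into the tokenizer.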

45
spacy/lang/ga/stop_words.py Normal file

@@ -0,0 +1,45 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
a ach ag agus an aon ar arna as
ba beirt bhúr
caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear
daichead dar de deich deichniúr den dhá do don dtí dár
faoi faoin faoina faoinár fara fiche
gach gan go gur
haon hocht
i iad idir in ina ins inár is
le leis lena lenár
mar mo muid
na nach naoi naonúr níor nócha
ocht ochtar ochtó os
roimh
sa seacht seachtar seachtó seasca seisear siad sibh sinn sna
tar thar thú triúr trí trína trínár tríocha
um
ár
é éis
í
ó ón óna ónár
""".split())

115
spacy/lang/ga/tokenizer_exceptions.py Normal file

@@ -0,0 +1,115 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import ORTH, LEMMA, NORM
# contractions and abbreviations that split into more than one token
_exc = {
    "'acha'n": [
        {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
        {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}],
    "dem'": [
        {ORTH: "de", LEMMA: "de", NORM: "de"},
        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
    "ded'": [
        {ORTH: "de", LEMMA: "de", NORM: "de"},
        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
    "lem'": [
        {ORTH: "le", LEMMA: "le", NORM: "le"},
        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
    "led'": [
        {ORTH: "le", LEMMA: "le", NORM: "le"},
        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
    "a.C.n.": [
        {ORTH: "a.", LEMMA: "ante"},
        {ORTH: "C.", LEMMA: "Christum"},
        {ORTH: "n.", LEMMA: "natum"}],
    "m.sh.": [
        {ORTH: "m.", LEMMA: "mar"},
        {ORTH: "sh.", LEMMA: "sampla"}],
    "M.F.": [
        {ORTH: "M.", LEMMA: "Meán"},
        {ORTH: "F.", LEMMA: "Fómhar"}],
    "M.Fómh.": [
        {ORTH: "M.", LEMMA: "Meán"},
        {ORTH: "Fómh.", LEMMA: "Fómhar"}],
    "R.C.": [
        {ORTH: "R.", LEMMA: "roimh"},
        {ORTH: "C.", LEMMA: "Críost"}],
    "r.Ch.": [
        {ORTH: "r.", LEMMA: "roimh"},
        {ORTH: "Ch.", LEMMA: "Críost"}],
    "r.Chr.": [
        {ORTH: "r.", LEMMA: "roimh"},
        {ORTH: "Chr.", LEMMA: "Críost"}],
    "R.Ch.": [
        {ORTH: "R.", LEMMA: "roimh"},
        {ORTH: "Ch.", LEMMA: "Críost"}],
    "R.Chr.": [
        {ORTH: "R.", LEMMA: "roimh"},
        {ORTH: "Chr.", LEMMA: "Críost"}],
    "⁊rl.": [
        {ORTH: "⁊", LEMMA: "agus"},
        {ORTH: "rl.", LEMMA: "araile"}],
    "srl.": [
        {ORTH: "s", LEMMA: "agus"},
        {ORTH: "rl.", LEMMA: "araile"}],
}
# single-token exceptions: kept whole, with a lemma (and NORM where given)
for exc_data in [
    {ORTH: "'gus", LEMMA: "agus", NORM: "agus"},
    {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
    {ORTH: "ao'", LEMMA: "aon", NORM: "aon"},
    {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"},
    {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"},
    {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"},
    {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"},
    {ORTH: "m'", LEMMA: "mo"},
    {ORTH: "Aib.", LEMMA: "Aibreán"},
    {ORTH: "Ath.", LEMMA: "athair"},
    {ORTH: "Beal.", LEMMA: "Bealtaine"},
    {ORTH: "Co.", LEMMA: "contae"},
    {ORTH: "Ean.", LEMMA: "Eanáir"},
    {ORTH: "Feab.", LEMMA: "Feabhra"},
    {ORTH: "gCo.", LEMMA: "contae"},
    {ORTH: ".i.", LEMMA: "eadhon"},
    {ORTH: "lch.", LEMMA: "leathanach"},
    {ORTH: "Lch.", LEMMA: "leathanach"},
    {ORTH: "lgh.", LEMMA: "leathanach"},
    {ORTH: "Lgh.", LEMMA: "leathanach"},
    {ORTH: "Lún.", LEMMA: "Lúnasa"},
    {ORTH: "Már.", LEMMA: "Márta"},
    {ORTH: "Meith.", LEMMA: "Meitheamh"},
    {ORTH: "Noll.", LEMMA: "Nollaig"},
    {ORTH: "Samh.", LEMMA: "Samhain"},
    {ORTH: "tAth.", LEMMA: "athair"},
    {ORTH: "tUas.", LEMMA: "Uasal"},
    {ORTH: "teo.", LEMMA: "teoranta"},
    {ORTH: "Teo.", LEMMA: "teoranta"},
    {ORTH: "Uas.", LEMMA: "Uasal"},
    {ORTH: "uimh.", LEMMA: "uimhir"},
    {ORTH: "Uimh.", LEMMA: "uimhir"}]:
    _exc[exc_data[ORTH]] = [dict(exc_data)]

# tokens kept whole without any extra attributes
for orth in [
    "d'"]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = dict(_exc)
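One invariant worth keeping in mind (illustration only, not part of the commit): spaCy's update_exc expects the ORTH values of each exception to concatenate back to the dictionary key, so the tokenizer can reproduce the original text exactly. A standalone sanity check over the table above, assuming the module path spacy.lang.ga.tokenizer_exceptions:

from spacy.symbols import ORTH
from spacy.lang.ga.tokenizer_exceptions import TOKENIZER_EXCEPTIONS

for key, pieces in TOKENIZER_EXCEPTIONS.items():
    # the sub-token ORTH values, joined, must equal the original string
    assert "".join(piece[ORTH] for piece in pieces) == key, key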