From 1eb7cc3017a6def34fb448781578888764d1e659 Mon Sep 17 00:00:00 2001
From: Jim O'Regan
Date: Mon, 26 Jun 2017 21:24:55 +0100
Subject: [PATCH] attempt a port from #1147

---
 spacy/lang/ga/__init__.py             |  24 ++++++
 spacy/lang/ga/stop_words.py           |  45 ++++++++++
 spacy/lang/ga/tokenizer_exceptions.py | 115 ++++++++++++++++++++++++++
 3 files changed, 184 insertions(+)
 create mode 100644 spacy/lang/ga/__init__.py
 create mode 100644 spacy/lang/ga/stop_words.py
 create mode 100644 spacy/lang/ga/tokenizer_exceptions.py

diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
new file mode 100644
index 000000000..8231cc925
--- /dev/null
+++ b/spacy/lang/ga/__init__.py
@@ -0,0 +1,24 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class Irish(Language):
+    lang = 'ga'
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'ga'
+
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        stop_words = set(STOP_WORDS)
+
+
+__all__ = ['Irish']
diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py
new file mode 100644
index 000000000..816c00b13
--- /dev/null
+++ b/spacy/lang/ga/stop_words.py
@@ -0,0 +1,45 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+STOP_WORDS = set("""
+a ach ag agus an aon ar arna as
+
+ba beirt bhúr
+
+caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear
+
+daichead dar de deich deichniúr den dhá do don dtí dá dár dó
+
+faoi faoin faoina faoinár fara fiche
+
+gach gan go gur
+
+haon hocht
+
+i iad idir in ina ins inár is
+
+le leis lena lenár
+
+mar mo muid mé
+
+na nach naoi naonúr ná ní níor nó nócha
+
+ocht ochtar ochtó os
+
+roimh
+
+sa seacht seachtar seachtó seasca seisear siad sibh sinn sna sé sí
+
+tar thar thú triúr trí trína trínár tríocha tú
+
+um
+
+ár
+
+é éis
+
+í
+
+ó ón óna ónár
+""".split())
diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
new file mode 100644
index 000000000..ce280a3a2
--- /dev/null
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -0,0 +1,115 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import ORTH, LEMMA, NORM
+
+
+_exc = {
+    "'acha'n": [
+        {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
+        {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}],
+
+    "dem'": [
+        {ORTH: "de", LEMMA: "de", NORM: "de"},
+        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
+
+    "ded'": [
+        {ORTH: "de", LEMMA: "de", NORM: "de"},
+        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
+
+    "lem'": [
+        {ORTH: "le", LEMMA: "le", NORM: "le"},
+        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
+
+    "led'": [
+        {ORTH: "le", LEMMA: "le", NORM: "le"},
+        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
+
+    "a.C.n.": [
+        {ORTH: "a.", LEMMA: "ante"},
+        {ORTH: "C.", LEMMA: "Christum"},
+        {ORTH: "n.", LEMMA: "natum"}],
+
+    "m.sh.": [
+        {ORTH: "m.", LEMMA: "mar"},
+        {ORTH: "sh.", LEMMA: "sampla"}],
+
+    "M.F.": [
+        {ORTH: "M.", LEMMA: "Meán"},
+        {ORTH: "F.", LEMMA: "Fómhar"}],
+
+    "M.Fómh.": [
+        {ORTH: "M.", LEMMA: "Meán"},
+        {ORTH: "Fómh.", LEMMA: "Fómhar"}],
+
+    "R.C.": [
+        {ORTH: "R.", LEMMA: "roimh"},
+        {ORTH: "C.", LEMMA: "Críost"}],
+
+    "r.Ch.": [
+        {ORTH: "r.", LEMMA: "roimh"},
+        {ORTH: "Ch.", LEMMA: "Críost"}],
+
+    "r.Chr.": [
+        {ORTH: "r.", LEMMA: "roimh"},
{ORTH: "Chr.", LEMMA: "Críost"}], + + "R.Ch.": [ + {ORTH: "R.", LEMMA: "roimh"}, + {ORTH: "Ch.", LEMMA: "Críost"}], + + "R.Chr.": [ + {ORTH: "R.", LEMMA: "roimh"}, + {ORTH: "Chr.", LEMMA: "Críost"}], + + "⁊rl.": [ + {ORTH: "⁊", LEMMA: "agus"}, + {ORTH: "rl.", LEMMA: "araile"}], + + "srl.": [ + {ORTH: "s", LEMMA: "agus"}, + {ORTH: "rl.", LEMMA: "araile"}], + +} + +for exc_data in [ + {ORTH: "'gus", LEMMA: "agus", NORM: "agus"}, + {ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, + {ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, + {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"}, + {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"}, + {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"}, + {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"}, + {ORTH: "m'", LEMMA: "mo"},, + {ORTH: "Aib.", LEMMA: "Aibreán"}, + {ORTH: "Ath.", LEMMA: "athair"}, + {ORTH: "Beal.", LEMMA: "Bealtaine"}, + {ORTH: "Co.", LEMMA: "contae"}, + {ORTH: "Ean.", LEMMA: "Eanáir"}, + {ORTH: "Feab.", LEMMA: "Feabhra"}, + {ORTH: "gCo.", LEMMA: "contae"}, + {ORTH: ".i.", LEMMA: "eadhon"}, + {ORTH: "lch.", LEMMA: "leathanach"}, + {ORTH: "Lch.", LEMMA: "leathanach"}, + {ORTH: "lgh.", LEMMA: "leathanach"}, + {ORTH: "Lgh.", LEMMA: "leathanach"}, + {ORTH: "Lún.", LEMMA: "Lúnasa"}, + {ORTH: "Már.", LEMMA: "Márta"}, + {ORTH: "Meith.", LEMMA: "Meitheamh"}, + {ORTH: "Noll.", LEMMA: "Nollaig"}, + {ORTH: "Samh.", LEMMA: "Samhain"}, + {ORTH: "tAth.", LEMMA: "athair"}, + {ORTH: "tUas.", LEMMA: "Uasal"}, + {ORTH: "teo.", LEMMA: "teoranta"}, + {ORTH: "Teo.", LEMMA: "teoranta"}, + {ORTH: "Uas.", LEMMA: "Uasal"}, + {ORTH: "uimh.", LEMMA: "uimhir"}, + {ORTH: "Uimh.", LEMMA: "uimhir"}]: + _exc[exc_data[ORTH]] = [dict(exc_data)], + +for orth in [ + "d'"]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = dict(_exc)