attempt a port from #1147

Jim O'Regan 2017-06-26 21:24:55 +01:00
parent 593361ee3c
commit 1eb7cc3017
3 changed files with 184 additions and 0 deletions

24
spacy/lang/ga/__init__.py Normal file

@@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
class Irish(Language):
    lang = 'ga'

    class Defaults(Language.Defaults):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'ga'

        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
        stop_words = set(STOP_WORDS)


__all__ = ['Irish']
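For orientation (not part of the diff), a minimal usage sketch: assuming the module is importable as spacy.lang.ga and that the Language subclass can be instantiated directly, as with other spaCy 2.x-era language classes, the new defaults drive tokenization roughly like this:

from spacy.lang.ga import Irish

nlp = Irish()  # builds a tokenizer from the Defaults above
doc = nlp("Tá an freagra ar lch. 7 den leabhar.")
print([t.text for t in doc])
# the "lch." entry in tokenizer_exceptions.py keeps the abbreviation together
# with its full stop as one token and supplies the lemma "leathanach"

The exact loading call may differ by spaCy version; the point is only that Defaults wires the stop words and tokenizer exceptions below into the tokenizer.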

45
spacy/lang/ga/stop_words.py Normal file

@@ -0,0 +1,45 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
a ach ag agus an aon ar arna as
ba beirt bhúr
caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear
daichead dar de deich deichniúr den dhá do don dtí dár
faoi faoin faoina faoinár fara fiche
gach gan go gur
haon hocht
i iad idir in ina ins inár is
le leis lena lenár
mar mo muid
na nach naoi naonúr níor nócha
ocht ochtar ochtó os
roimh
sa seacht seachtar seachtó seasca seisear siad sibh sinn sna
tar thar thú triúr trí trína trínár tríocha
um
ár
é éis
í
ó ón óna ónár
""".split())

115
spacy/lang/ga/tokenizer_exceptions.py Normal file

@@ -0,0 +1,115 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import ORTH, LEMMA, NORM
# contractions and abbreviations that split into more than one token
_exc = {
    "'acha'n": [
        {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
        {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}],
    "dem'": [
        {ORTH: "de", LEMMA: "de", NORM: "de"},
        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
    "ded'": [
        {ORTH: "de", LEMMA: "de", NORM: "de"},
        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
    "lem'": [
        {ORTH: "le", LEMMA: "le", NORM: "le"},
        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
    "led'": [
        {ORTH: "le", LEMMA: "le", NORM: "le"},
        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
    "a.C.n.": [
        {ORTH: "a.", LEMMA: "ante"},
        {ORTH: "C.", LEMMA: "Christum"},
        {ORTH: "n.", LEMMA: "natum"}],
    "m.sh.": [
        {ORTH: "m.", LEMMA: "mar"},
        {ORTH: "sh.", LEMMA: "sampla"}],
    "M.F.": [
        {ORTH: "M.", LEMMA: "Meán"},
        {ORTH: "F.", LEMMA: "Fómhar"}],
    "M.Fómh.": [
        {ORTH: "M.", LEMMA: "Meán"},
        {ORTH: "Fómh.", LEMMA: "Fómhar"}],
    "R.C.": [
        {ORTH: "R.", LEMMA: "roimh"},
        {ORTH: "C.", LEMMA: "Críost"}],
    "r.Ch.": [
        {ORTH: "r.", LEMMA: "roimh"},
        {ORTH: "Ch.", LEMMA: "Críost"}],
    "r.Chr.": [
        {ORTH: "r.", LEMMA: "roimh"},
        {ORTH: "Chr.", LEMMA: "Críost"}],
    "R.Ch.": [
        {ORTH: "R.", LEMMA: "roimh"},
        {ORTH: "Ch.", LEMMA: "Críost"}],
    "R.Chr.": [
        {ORTH: "R.", LEMMA: "roimh"},
        {ORTH: "Chr.", LEMMA: "Críost"}],
    "⁊rl.": [
        {ORTH: "⁊", LEMMA: "agus"},
        {ORTH: "rl.", LEMMA: "araile"}],
    "srl.": [
        {ORTH: "s", LEMMA: "agus"},
        {ORTH: "rl.", LEMMA: "araile"}],
}
# single-token exceptions: kept whole, with a lemma (and NORM where given)
for exc_data in [
    {ORTH: "'gus", LEMMA: "agus", NORM: "agus"},
    {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
    {ORTH: "ao'", LEMMA: "aon", NORM: "aon"},
    {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"},
    {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"},
    {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"},
    {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"},
    {ORTH: "m'", LEMMA: "mo"},
    {ORTH: "Aib.", LEMMA: "Aibreán"},
    {ORTH: "Ath.", LEMMA: "athair"},
    {ORTH: "Beal.", LEMMA: "Bealtaine"},
    {ORTH: "Co.", LEMMA: "contae"},
    {ORTH: "Ean.", LEMMA: "Eanáir"},
    {ORTH: "Feab.", LEMMA: "Feabhra"},
    {ORTH: "gCo.", LEMMA: "contae"},
    {ORTH: ".i.", LEMMA: "eadhon"},
    {ORTH: "lch.", LEMMA: "leathanach"},
    {ORTH: "Lch.", LEMMA: "leathanach"},
    {ORTH: "lgh.", LEMMA: "leathanach"},
    {ORTH: "Lgh.", LEMMA: "leathanach"},
    {ORTH: "Lún.", LEMMA: "Lúnasa"},
    {ORTH: "Már.", LEMMA: "Márta"},
    {ORTH: "Meith.", LEMMA: "Meitheamh"},
    {ORTH: "Noll.", LEMMA: "Nollaig"},
    {ORTH: "Samh.", LEMMA: "Samhain"},
    {ORTH: "tAth.", LEMMA: "athair"},
    {ORTH: "tUas.", LEMMA: "Uasal"},
    {ORTH: "teo.", LEMMA: "teoranta"},
    {ORTH: "Teo.", LEMMA: "teoranta"},
    {ORTH: "Uas.", LEMMA: "Uasal"},
    {ORTH: "uimh.", LEMMA: "uimhir"},
    {ORTH: "Uimh.", LEMMA: "uimhir"}]:
    _exc[exc_data[ORTH]] = [dict(exc_data)]

# tokens kept whole without any extra attributes
for orth in [
    "d'"]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = dict(_exc)
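One invariant worth keeping in mind (illustration only, not part of the commit): spaCy's update_exc expects the ORTH values of each exception to concatenate back to the dictionary key, so the tokenizer can reproduce the original text exactly. A standalone sanity check over the table above, assuming the module path spacy.lang.ga.tokenizer_exceptions:

from spacy.symbols import ORTH
from spacy.lang.ga.tokenizer_exceptions import TOKENIZER_EXCEPTIONS

for key, pieces in TOKENIZER_EXCEPTIONS.items():
    # the sub-token ORTH values, joined, must equal the original string
    assert "".join(piece[ORTH] for piece in pieces) == key, key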