From 1eb7cc3017a6def34fb448781578888764d1e659 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 21:24:55 +0100 Subject: [PATCH 01/25] attempt a port from #1147 --- spacy/lang/ga/__init__.py | 24 ++++++ spacy/lang/ga/stop_words.py | 45 ++++++++++ spacy/lang/ga/tokenizer_exceptions.py | 115 ++++++++++++++++++++++++++ 3 files changed, 184 insertions(+) create mode 100644 spacy/lang/ga/__init__.py create mode 100644 spacy/lang/ga/stop_words.py create mode 100644 spacy/lang/ga/tokenizer_exceptions.py diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py new file mode 100644 index 000000000..8231cc925 --- /dev/null +++ b/spacy/lang/ga/__init__.py @@ -0,0 +1,24 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG +from ...util import update_exc + + +class Irish(Language): + lang = 'nb' + + class Defaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ga' + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = set(STOP_WORDS) + + +__all__ = ['Irish'] diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py new file mode 100644 index 000000000..816c00b13 --- /dev/null +++ b/spacy/lang/ga/stop_words.py @@ -0,0 +1,45 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +a ach ag agus an aon ar arna as + +ba beirt bhúr + +caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear + +daichead dar de deich deichniúr den dhá do don dtí dá dár dó + +faoi faoin faoina faoinár fara fiche + +gach gan go gur + +haon hocht + +i iad idir in ina ins inár is + +le leis lena lenár + +mar mo muid mé + +na nach naoi naonúr ná ní níor nó nócha + +ocht ochtar ochtó os + +roimh + +sa seacht 
seachtar seachtó seasca seisear siad sibh sinn sna sé sí + +tar thar thú triúr trí trína trínár tríocha tú + +um + +ár + +é éis + +í + +ó ón óna ónár +""".split()) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py new file mode 100644 index 000000000..ce280a3a2 --- /dev/null +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -0,0 +1,115 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import ORTH, LEMMA, NORM + + +_exc = { + "'acha'n": [ + {ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, + {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}], + + "dem'": [ + {ORTH: "de", LEMMA: "de", NORM: "de"}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo"}], + + "ded'": [ + {ORTH: "de", LEMMA: "de", NORM: "de"}, + {ORTH: "d'", LEMMA: "do", NORM: "do"}], + + "lem'": [ + {ORTH: "le", LEMMA: "le", NORM: "le"}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo"}], + + "led'": [ + {ORTH: "le", LEMMA: "le", NORM: "le"}, + {ORTH: "d'", LEMMA: "mo", NORM: "do"}], + + "a.C.n.": [ + {ORTH: "a.", LEMMA: "ante"}, + {ORTH: "C.", LEMMA: "Christum"}, + {ORTH: "n.", LEMMA: "natum"}], + + "m.sh.": [ + {ORTH: "m.", LEMMA: "mar"}, + {ORTH: "sh.", LEMMA: "sampla"}], + + "M.F.": [ + {ORTH: "M.", LEMMA: "Meán"}, + {ORTH: "F.", LEMMA: "Fómhar"}], + + "M.Fómh.": [ + {ORTH: "M.", LEMMA: "Meán"}, + {ORTH: "Fómh.", LEMMA: "Fómhar"}], + + "R.C.": [ + {ORTH: "Rr.", LEMMA: "roimh"}, + {ORTH: "C.", LEMMA: "Críost"}], + + "r.Ch.": [ + {ORTH: "r.", LEMMA: "roimh"}, + {ORTH: "Ch.", LEMMA: "Críost"}], + + "r.Chr.": [ + {ORTH: "r.", LEMMA: "roimh"}, + {ORTH: "Chr.", LEMMA: "Críost"}], + + "R.Ch.": [ + {ORTH: "R.", LEMMA: "roimh"}, + {ORTH: "Ch.", LEMMA: "Críost"}], + + "R.Chr.": [ + {ORTH: "R.", LEMMA: "roimh"}, + {ORTH: "Chr.", LEMMA: "Críost"}], + + "⁊rl.": [ + {ORTH: "⁊", LEMMA: "agus"}, + {ORTH: "rl.", LEMMA: "araile"}], + + "srl.": [ + {ORTH: "s", LEMMA: "agus"}, + {ORTH: "rl.", LEMMA: "araile"}], + +} + +for exc_data in [ + {ORTH: "'gus", LEMMA: "agus", NORM: "agus"}, + 
{ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, + {ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, + {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"}, + {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"}, + {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"}, + {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"}, + {ORTH: "m'", LEMMA: "mo"},, + {ORTH: "Aib.", LEMMA: "Aibreán"}, + {ORTH: "Ath.", LEMMA: "athair"}, + {ORTH: "Beal.", LEMMA: "Bealtaine"}, + {ORTH: "Co.", LEMMA: "contae"}, + {ORTH: "Ean.", LEMMA: "Eanáir"}, + {ORTH: "Feab.", LEMMA: "Feabhra"}, + {ORTH: "gCo.", LEMMA: "contae"}, + {ORTH: ".i.", LEMMA: "eadhon"}, + {ORTH: "lch.", LEMMA: "leathanach"}, + {ORTH: "Lch.", LEMMA: "leathanach"}, + {ORTH: "lgh.", LEMMA: "leathanach"}, + {ORTH: "Lgh.", LEMMA: "leathanach"}, + {ORTH: "Lún.", LEMMA: "Lúnasa"}, + {ORTH: "Már.", LEMMA: "Márta"}, + {ORTH: "Meith.", LEMMA: "Meitheamh"}, + {ORTH: "Noll.", LEMMA: "Nollaig"}, + {ORTH: "Samh.", LEMMA: "Samhain"}, + {ORTH: "tAth.", LEMMA: "athair"}, + {ORTH: "tUas.", LEMMA: "Uasal"}, + {ORTH: "teo.", LEMMA: "teoranta"}, + {ORTH: "Teo.", LEMMA: "teoranta"}, + {ORTH: "Uas.", LEMMA: "Uasal"}, + {ORTH: "uimh.", LEMMA: "uimhir"}, + {ORTH: "Uimh.", LEMMA: "uimhir"}]: + _exc[exc_data[ORTH]] = [dict(exc_data)], + +for orth in [ + "d'"]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = dict(_exc) From e9213f54deece142fff6c4ff0a2ae4106288f417 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 21:29:21 +0100 Subject: [PATCH 02/25] missed one --- spacy/lang/ga/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 8231cc925..7b72a8a91 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -11,7 +11,7 @@ from ...util import update_exc class Irish(Language): - lang = 'nb' + lang = 'ga' class Defaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) From 3c4d83aa6e634b19889338bdf3c0dfd593f9fdc6 Mon Sep 17 
00:00:00 2001 From: Jim O'Regan Date: Sat, 24 Jun 2017 22:29:02 +0100 Subject: [PATCH 03/25] CLA --- .github/contributors/jimregan.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/jimregan.md diff --git a/.github/contributors/jimregan.md b/.github/contributors/jimregan.md new file mode 100644 index 000000000..dd8fe3d64 --- /dev/null +++ b/.github/contributors/jimregan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jim O'Regan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-06-24 | +| GitHub username | jimregan | +| Website (optional) | | From a8dff9133e84671a3111390a8f4e8965ec744519 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 21:53:41 +0100 Subject: [PATCH 04/25] add POS --- spacy/lang/ga/tokenizer_exceptions.py | 130 +++++++++++++------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index ce280a3a2..3dca1c3d7 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -1,110 +1,110 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import ORTH, LEMMA, NORM +from ..symbols import ORTH, LEMMA, NORM, POS _exc = { "'acha'n": [ - {ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, - {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}], + {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, + {ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET}], "dem'": [ - {ORTH: "de", LEMMA: "de", NORM: "de"}, - {ORTH: "m'", LEMMA: "mo", NORM: "mo"}], + {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}], "ded'": [ - {ORTH: "de", LEMMA: "de", NORM: "de"}, - {ORTH: "d'", LEMMA: "do", NORM: "do"}], + {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP}, + {ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET}], "lem'": [ - {ORTH: "le", LEMMA: "le", NORM: "le"}, - {ORTH: "m'", LEMMA: "mo", NORM: "mo"}], + {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}], "led'": [ - {ORTH: "le", LEMMA: "le", NORM: "le"}, - {ORTH: "d'", LEMMA: "mo", NORM: "do"}], + {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP}, + {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}], "a.C.n.": [ - {ORTH: "a.", LEMMA: "ante"}, - 
{ORTH: "C.", LEMMA: "Christum"}, - {ORTH: "n.", LEMMA: "natum"}], + {ORTH: "a.", LEMMA: "ante", POS: X}, + {ORTH: "C.", LEMMA: "Christum", POS: X}, + {ORTH: "n.", LEMMA: "natum", POS: X}], "m.sh.": [ - {ORTH: "m.", LEMMA: "mar"}, - {ORTH: "sh.", LEMMA: "sampla"}], + {ORTH: "m.", LEMMA: "mar", POS: ADP}, + {ORTH: "sh.", LEMMA: "sampla", POS: NOUN}], "M.F.": [ - {ORTH: "M.", LEMMA: "Meán"}, - {ORTH: "F.", LEMMA: "Fómhar"}], + {ORTH: "M.", LEMMA: "Meán", POS: NOUN}, + {ORTH: "F.", LEMMA: "Fómhar", POS: NOUN}], "M.Fómh.": [ - {ORTH: "M.", LEMMA: "Meán"}, - {ORTH: "Fómh.", LEMMA: "Fómhar"}], + {ORTH: "M.", LEMMA: "Meán", POS: NOUN}, + {ORTH: "Fómh.", LEMMA: "Fómhar", POS: NOUN}], "R.C.": [ - {ORTH: "Rr.", LEMMA: "roimh"}, - {ORTH: "C.", LEMMA: "Críost"}], + {ORTH: "Rr.", LEMMA: "roimh", POS: ADP}, + {ORTH: "C.", LEMMA: "Críost", POS: NOUN}], "r.Ch.": [ - {ORTH: "r.", LEMMA: "roimh"}, - {ORTH: "Ch.", LEMMA: "Críost"}], + {ORTH: "r.", LEMMA: "roimh", POS: ADP}, + {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}], "r.Chr.": [ - {ORTH: "r.", LEMMA: "roimh"}, - {ORTH: "Chr.", LEMMA: "Críost"}], + {ORTH: "r.", LEMMA: "roimh", POS: ADP}, + {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}], "R.Ch.": [ - {ORTH: "R.", LEMMA: "roimh"}, - {ORTH: "Ch.", LEMMA: "Críost"}], + {ORTH: "R.", LEMMA: "roimh", POS: ADP}, + {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}], "R.Chr.": [ - {ORTH: "R.", LEMMA: "roimh"}, - {ORTH: "Chr.", LEMMA: "Críost"}], + {ORTH: "R.", LEMMA: "roimh", POS: ADP}, + {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}], "⁊rl.": [ - {ORTH: "⁊", LEMMA: "agus"}, - {ORTH: "rl.", LEMMA: "araile"}], + {ORTH: "⁊", LEMMA: "agus", POS: CCONJ}, + {ORTH: "rl.", LEMMA: "araile", POS: ADJ}], "srl.": [ - {ORTH: "s", LEMMA: "agus"}, - {ORTH: "rl.", LEMMA: "araile"}], + {ORTH: "s", LEMMA: "agus", POS: CCONJ}, + {ORTH: "rl.", LEMMA: "araile", POS: ADJ}], } for exc_data in [ - {ORTH: "'gus", LEMMA: "agus", NORM: "agus"}, - {ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, + {ORTH: "'gus", LEMMA: "agus", NORM: 
"agus", POS: CCONJ}, + {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, {ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, - {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"}, - {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"}, - {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"}, - {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"}, - {ORTH: "m'", LEMMA: "mo"},, - {ORTH: "Aib.", LEMMA: "Aibreán"}, - {ORTH: "Ath.", LEMMA: "athair"}, - {ORTH: "Beal.", LEMMA: "Bealtaine"}, - {ORTH: "Co.", LEMMA: "contae"}, - {ORTH: "Ean.", LEMMA: "Eanáir"}, - {ORTH: "Feab.", LEMMA: "Feabhra"}, - {ORTH: "gCo.", LEMMA: "contae"}, - {ORTH: ".i.", LEMMA: "eadhon"}, - {ORTH: "lch.", LEMMA: "leathanach"}, - {ORTH: "Lch.", LEMMA: "leathanach"}, - {ORTH: "lgh.", LEMMA: "leathanach"}, - {ORTH: "Lgh.", LEMMA: "leathanach"}, - {ORTH: "Lún.", LEMMA: "Lúnasa"}, - {ORTH: "Már.", LEMMA: "Márta"}, - {ORTH: "Meith.", LEMMA: "Meitheamh"}, - {ORTH: "Noll.", LEMMA: "Nollaig"}, - {ORTH: "Samh.", LEMMA: "Samhain"}, - {ORTH: "tAth.", LEMMA: "athair"}, - {ORTH: "tUas.", LEMMA: "Uasal"}, - {ORTH: "teo.", LEMMA: "teoranta"}, - {ORTH: "Teo.", LEMMA: "teoranta"}, - {ORTH: "Uas.", LEMMA: "Uasal"}, - {ORTH: "uimh.", LEMMA: "uimhir"}, - {ORTH: "Uimh.", LEMMA: "uimhir"}]: + {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV}, + {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV}, + {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV}, + {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV}, + {ORTH: "m'", LEMMA: "mo", POS: DET}, + {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN}, + {ORTH: "Ath.", LEMMA: "athair", POS: NOUN}, + {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN}, + {ORTH: "Co.", LEMMA: "contae", POS: NOUN}, + {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN}, + {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, + {ORTH: "gCo.", LEMMA: "contae", POS: NOUN}, + {ORTH: ".i.", LEMMA: "eadhon", POS: ADV}, + {ORTH: "lch.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "lgh.", 
LEMMA: "leathanach", POS: NOUN}, + {ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN}, + {ORTH: "Már.", LEMMA: "Márta", POS: NOUN}, + {ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN}, + {ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN}, + {ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN}, + {ORTH: "tAth.", LEMMA: "athair", POS: NOUN}, + {ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN}, + {ORTH: "teo.", LEMMA: "teoranta", POS: NOUN}, + {ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN}, + {ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN}, + {ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN}, + {ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN}]: _exc[exc_data[ORTH]] = [dict(exc_data)], for orth in [ From 5e5f94c1c0939da81dc939ed10c639f50557522c Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 21:57:00 +0100 Subject: [PATCH 05/25] fix dup --- spacy/lang/ga/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index 3dca1c3d7..fad51a2fb 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -43,7 +43,7 @@ _exc = { {ORTH: "Fómh.", LEMMA: "Fómhar", POS: NOUN}], "R.C.": [ - {ORTH: "Rr.", LEMMA: "roimh", POS: ADP}, + {ORTH: "R.", LEMMA: "roimh", POS: ADP}, {ORTH: "C.", LEMMA: "Críost", POS: NOUN}], "r.Ch.": [ From c1e4e0f3bf355eb7771759fcab58229f630e98e5 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 22:19:39 +0100 Subject: [PATCH 06/25] just now discovered that you can do multiwords --- spacy/lang/ga/tokenizer_exceptions.py | 56 ++++++--------------------- 1 file changed, 11 insertions(+), 45 deletions(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index fad51a2fb..afd901e33 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -25,51 +25,6 @@ _exc = { {ORTH: "le", LEMMA: "le", NORM: "le", POS: 
ADP}, {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}], - "a.C.n.": [ - {ORTH: "a.", LEMMA: "ante", POS: X}, - {ORTH: "C.", LEMMA: "Christum", POS: X}, - {ORTH: "n.", LEMMA: "natum", POS: X}], - - "m.sh.": [ - {ORTH: "m.", LEMMA: "mar", POS: ADP}, - {ORTH: "sh.", LEMMA: "sampla", POS: NOUN}], - - "M.F.": [ - {ORTH: "M.", LEMMA: "Meán", POS: NOUN}, - {ORTH: "F.", LEMMA: "Fómhar", POS: NOUN}], - - "M.Fómh.": [ - {ORTH: "M.", LEMMA: "Meán", POS: NOUN}, - {ORTH: "Fómh.", LEMMA: "Fómhar", POS: NOUN}], - - "R.C.": [ - {ORTH: "R.", LEMMA: "roimh", POS: ADP}, - {ORTH: "C.", LEMMA: "Críost", POS: NOUN}], - - "r.Ch.": [ - {ORTH: "r.", LEMMA: "roimh", POS: ADP}, - {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}], - - "r.Chr.": [ - {ORTH: "r.", LEMMA: "roimh", POS: ADP}, - {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}], - - "R.Ch.": [ - {ORTH: "R.", LEMMA: "roimh", POS: ADP}, - {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}], - - "R.Chr.": [ - {ORTH: "R.", LEMMA: "roimh", POS: ADP}, - {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}], - - "⁊rl.": [ - {ORTH: "⁊", LEMMA: "agus", POS: CCONJ}, - {ORTH: "rl.", LEMMA: "araile", POS: ADJ}], - - "srl.": [ - {ORTH: "s", LEMMA: "agus", POS: CCONJ}, - {ORTH: "rl.", LEMMA: "araile", POS: ADJ}], - } for exc_data in [ @@ -84,6 +39,17 @@ for exc_data in [ {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN}, {ORTH: "Ath.", LEMMA: "athair", POS: NOUN}, {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN}, + {ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X}, + {ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV}, + {ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN}, + {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN}, + {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN}, + {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN}, + {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV}, + {ORTH: "srl.", LEMMA: "agus araile", 
POS: ADV}, {ORTH: "Co.", LEMMA: "contae", POS: NOUN}, {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN}, {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, From e12defdd9ca8748f553f3360ffc0242ee234fd25 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 22:24:14 +0100 Subject: [PATCH 07/25] missed a couple --- spacy/lang/ga/tokenizer_exceptions.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index afd901e33..2f6d1ebdf 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -45,9 +45,12 @@ for exc_data in [ {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN}, {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN}, {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN}, + {ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV}, {ORTH: "srl.", LEMMA: "agus araile", POS: ADV}, {ORTH: "Co.", LEMMA: "contae", POS: NOUN}, From 559e03605a52d2c68ba5e565ff69d0a09690f4f5 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 27 Jun 2017 22:42:16 +0100 Subject: [PATCH 08/25] b' --- spacy/lang/ga/tokenizer_exceptions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index 2f6d1ebdf..7d29f4bcc 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -58,6 +58,8 @@ for exc_data in [ {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, {ORTH: "gCo.", LEMMA: "contae", POS: NOUN}, {ORTH: ".i.", LEMMA: "eadhon", POS: ADV}, + {ORTH: "B'", LEMMA: "ba", POS: AUX}, + {ORTH: "b'", LEMMA: "ba", POS: AUX}, {ORTH: "lch.", LEMMA: 
"leathanach", POS: NOUN}, {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN}, @@ -77,7 +79,7 @@ for exc_data in [ _exc[exc_data[ORTH]] = [dict(exc_data)], for orth in [ - "d'"]: + "d'", "D'"]: _exc[orth] = [{ORTH: orth}] From 1ba38b2036e69ea0ff400e14e217d887f09f7165 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Wed, 28 Jun 2017 00:42:00 +0100 Subject: [PATCH 09/25] some helpers; the Irish part of UD only has 2500 sentences so this will need source of morphology --- spacy/lang/ga/irish_morphology_helpers.py | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 spacy/lang/ga/irish_morphology_helpers.py diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py new file mode 100644 index 000000000..2b008f295 --- /dev/null +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -0,0 +1,33 @@ +# coding: utf8 +from __future__ import unicode_literals + +class IrishMorph: + consonants = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z'] + broad_vowels = ['a', 'á', 'o', 'ó', 'u', 'ú'] + slender_vowels = ['e', 'é', 'i', 'í'] + vowels = broad_vowels + slender_vowels + + def ends_dentals(word): + if word[-1:] in ['d', 'n', 't', 's']: + return True + else: + return False + + def devoice(word): + if word[-2] == 's' and word[-1] == 'd': + return word[:-1] + 't' + else: + return word + + def ends_with_vowel(word): + return word[-1] in vowels + + def starts_with_vowel(word): + return word[0] in vowels + + def deduplicate(word): + if word[-2] == word[-1] and word[-1] in consonants: + return word[:-1] + else: + return word + From 70f4d26c108dbb9b2dcfbf4a1c90d9fdfcea2a7d Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Wed, 28 Jun 2017 10:59:46 +0100 Subject: [PATCH 10/25] bounds checks --- spacy/lang/ga/irish_morphology_helpers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py index 2b008f295..383e24efc 100644 --- a/spacy/lang/ga/irish_morphology_helpers.py +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -8,25 +8,25 @@ class IrishMorph: vowels = broad_vowels + slender_vowels def ends_dentals(word): - if word[-1:] in ['d', 'n', 't', 's']: + if word != "" and word[-1] in ['d', 'n', 't', 's']: return True else: return False def devoice(word): - if word[-2] == 's' and word[-1] == 'd': + if len(word) > 2 and word[-2] == 's' and word[-1] == 'd': return word[:-1] + 't' else: return word def ends_with_vowel(word): - return word[-1] in vowels + return word != "" and word[-1] in vowels def starts_with_vowel(word): - return word[0] in vowels + return word != "" and word[0] in vowels def deduplicate(word): - if word[-2] == word[-1] and word[-1] in consonants: + if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants: return word[:-1] else: return word From 76c22dec4dba150fd848072472d0e4bb65fc4a65 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 8 Aug 2017 19:04:52 +0100 Subject: [PATCH 11/25] UD Irish tag mapping --- spacy/lang/ga/tag_map.py | 366 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 366 insertions(+) create mode 100644 spacy/lang/ga/tag_map.py diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py new file mode 100644 index 000000000..598d368bb --- /dev/null +++ b/spacy/lang/ga/tag_map.py @@ -0,0 +1,366 @@ +# coding: utf8 +from __future__ import unicode_literals + + +TAG_MAP = { + "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=Gen|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "fem", "Number": "sing"}, + "ADJ__Case=Gen|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing"}, + "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", 
"Number": "plur", "Other": {"NounType": "strong"}}, + "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "weak"}}, + "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, + "ADJ__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, + "ADJ__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, + "ADJ__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, + "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "notslender"}}, + "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "slender"}}, + "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Other": {"Form": "len"}}, + "ADJ__Degree=Cmp,Sup": {"pos": "ADJ", "Degree": "cmp|sup"}, + "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "ecl"}}, + "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "hpref"}}, + "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "len"}}, + "ADJ__Degree=Pos": {"pos": "ADJ", "Degree": "pos"}, + "ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"}, + "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}}, + "ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, + 
"ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, + "ADJ__Number=Plur": {"pos": "ADJ", "Number": "plur"}, + "ADJ___": {"pos": "ADJ"}, + "ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"}, + "ADP__Foreign=Yes": {"pos": "ADP", "Foreign": "yes"}, + "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Other": {"Form": "len"}}, + "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, + "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, "Other": {"Form": "len"}}, + "ADP__Gender=Fem|Number=Sing|Person=3": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3}, + "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, + "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"}, + "ADP__Gender=Masc|Number=Sing|Person=3": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3}, + "ADP__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"}, + "ADP__Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"}, + "ADP__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"}, + "ADP__Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1}, + "ADP__Number=Plur|Person=1|Poss=Yes": {"pos": "ADP", "Number": "plur", "Person": 1, "Poss": "yes"}, + "ADP__Number=Plur|Person=1|PronType=Emp": {"pos": "ADP", "Number": "plur", "Person": 1, "PronType": "emp"}, + "ADP__Number=Plur|Person=2": {"pos": "ADP", "Number": "plur", "Person": 2}, + "ADP__Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", 
"Person": 3}, + "ADP__Number=Plur|Person=3|Poss=Yes": {"pos": "ADP", "Number": "plur", "Person": 3, "Poss": "yes"}, + "ADP__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Number": "plur", "Person": 3, "Poss": "yes", "PronType": "prs"}, + "ADP__Number=Plur|Person=3|PronType=Emp": {"pos": "ADP", "Number": "plur", "Person": 3, "PronType": "emp"}, + "ADP__Number=Plur|PronType=Art": {"pos": "ADP", "Number": "plur", "PronType": "art"}, + "ADP__Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1}, + "ADP__Number=Sing|Person=1|Poss=Yes": {"pos": "ADP", "Number": "sing", "Person": 1, "Poss": "yes"}, + "ADP__Number=Sing|Person=1|PronType=Emp": {"pos": "ADP", "Number": "sing", "Person": 1, "PronType": "emp"}, + "ADP__Number=Sing|Person=2": {"pos": "ADP", "Number": "sing", "Person": 2}, + "ADP__Number=Sing|Person=3": {"pos": "ADP", "Number": "sing", "Person": 3}, + "ADP__Number=Sing|PronType=Art": {"pos": "ADP", "Number": "sing", "PronType": "art"}, + "ADP__Person=3|Poss=Yes": {"pos": "ADP", "Person": 3, "Poss": "yes"}, + "ADP___": {"pos": "ADP"}, + "ADP__Poss=Yes": {"pos": "ADP", "Poss": "yes"}, + "ADP__PrepForm=Cmpd": {"pos": "ADP", "Other": {"PrepForm": "cmpd"}}, + "ADP__PronType=Art": {"pos": "ADP", "PronType": "art"}, + "ADV__Form=Len": {"pos": "ADV", "Other": {"Form": "len"}}, + "ADV___": {"pos": "ADV"}, + "ADV__PronType=Int": {"pos": "ADV", "PronType": "int"}, + "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"Form": "vf", 
"VerbForm": "cop"}}, + "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"VerbForm": "cop"}}, + "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"PartType": "comp", "VerbForm": "cop"}}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX___": {"pos": "AUX"}, + "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "Other": {"VerbForm": "cop"}}, + "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + 
"AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX__VerbForm=Cop": {"pos": "AUX", "Other": {"VerbForm": "cop"}}, + "CCONJ___": {"pos": "CCONJ"}, + "DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, + "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Other": {"Form": "ecl"}}, + "DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, + "DET__Definite=Def|Number=Plur|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "plur", "PronType": "art"}, + "DET__Definite=Def|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "sing", "PronType": "art"}, + "DET__Definite=Def": {"pos": "DET", "Definite": "def"}, + "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Other": {"Form": "hpref"}}, + "DET__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, + "DET__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"}, + "DET__Number=Plur|Person=1|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 1, "Poss": "yes"}, + "DET__Number=Plur|Person=3|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 3, "Poss": "yes"}, + "DET__Number=Sing|Person=1|Poss=Yes": {"pos": "DET", "Number": "sing", "Person": 1, "Poss": "yes"}, + "DET__Number=Sing|Person=2|Poss=Yes": {"pos": "DET", "Number": "sing", "Person": 2, "Poss": "yes"}, + "DET__Number=Sing|PronType=Int": {"pos": "DET", "Number": "sing", "PronType": "int"}, + "DET___": {"pos": "DET"}, + "DET__PronType=Dem": {"pos": "DET", "PronType": "dem"}, + "DET__PronType=Ind": {"pos": "DET", "PronType": 
"ind"}, + "NOUN__Case=Dat|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Definite": "ind", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Dat|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Gen|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "ind", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + 
"NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "weak"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, + "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "strong"}}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "weak"}}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Other": {"Form": "len"}}, + "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur"}, + "NOUN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing"}, + 
"NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Gen|Number=Sing": {"pos": "NOUN", "Case": "gen", "Number": "sing"}, + "NOUN__Case=Gen|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "plur"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Fem": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=NomAcc|Definite=Ind|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "ind", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": 
"plur", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "emp"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, + "NOUN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "NOUN", 
"Case": "nom|acc", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Voc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Definite": "def", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing"}, + "NOUN__Degree=Pos": {"pos": "NOUN", "Degree": "pos"}, + "NOUN__Foreign=Yes": {"pos": "NOUN", "Foreign": "yes"}, + "NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Form=Ecl|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "ecl"}}, + "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Other": {"Form": "ecl"}}, + "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "hpref"}}, + "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "len"}}, + "NOUN__Gender=Fem|Number=Sing": {"pos": "NOUN", "Gender": "fem", "Number": "sing"}, + "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "Other": {"PartType": "comp"}}, + "NOUN__Number=Sing": {"pos": "NOUN", "Number": "sing"}, + "NOUN___": {"pos": "NOUN"}, + "NOUN__Reflex=Yes": {"pos": "NOUN", "Reflex": "yes"}, + "NOUN__VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf"}, + "NOUN__VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun"}, + "NUM__Definite=Def|NumType=Card": {"pos": "NUM", "Definite": "def", "NumType": "card"}, + 
"NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "ecl"}}, + "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "ecl"}}, + "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "hpref"}}, + "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "len"}}, + "NUM__Form=Len|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "len"}}, + "NUM__NumType=Card": {"pos": "NUM", "NumType": "card"}, + "NUM__NumType=Ord": {"pos": "NUM", "NumType": "ord"}, + "NUM___": {"pos": "NUM"}, + "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"Form": "ecl", "PartType": "vb"}}, + "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "Other": {"PartType": "vb"}}, + "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "Other": {"PartType": "vb"}}, + "PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"Mood": "int", "PartType": "vb"}}, + "PART__PartType=Ad": {"pos": "PART", "Other": {"PartType": "ad"}}, + "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "cmpl"}}, + "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "cmpl"}}, + "PART__PartType=Cmpl": {"pos": "PART", "Other": {"PartType": "cmpl"}}, + "PART__PartType=Comp": {"pos": "PART", "Other": {"PartType": "comp"}}, + "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "cop"}}, + "PART__PartType=Deg": {"pos": "PART", "Other": {"PartType": "deg"}}, + "PART__PartType=Inf": {"pos": "PART", "PartType": "inf"}, + "PART__PartType=Num": {"pos": "PART", "Other": {"PartType": "num"}}, + "PART__PartType=Pat": {"pos": "PART", "Other": {"PartType": "pat"}}, + "PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": 
"vb"}}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb": {"pos": "PART", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Voc": {"pos": "PART", "Other": {"PartType": "voc"}}, + "PART___": {"pos": "PART"}, + "PART__PronType=Rel": {"pos": "PART", "PronType": "rel"}, + "PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, + "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Other": {"Form": "len"}}, + "PRON__Gender=Fem|Number=Sing|Person=3": {"pos": "PRON", "Gender": "fem", "Number": "sing", "Person": 3}, + "PRON__Gender=Masc|Number=Sing|Person=3": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3}, + "PRON__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"}, + "PRON__Gender=Masc|Person=3": {"pos": "PRON", "Gender": "masc", "Person": 3}, + "PRON__Number=Plur|Person=1": {"pos": "PRON", "Number": "plur", "Person": 1}, + "PRON__Number=Plur|Person=1|PronType=Emp": {"pos": "PRON", "Number": "plur", "Person": 1, "PronType": "emp"}, + "PRON__Number=Plur|Person=2": {"pos": "PRON", "Number": "plur", "Person": 2}, + "PRON__Number=Plur|Person=3": {"pos": "PRON", "Number": "plur", "Person": 3}, + 
"PRON__Number=Plur|Person=3|PronType=Emp": {"pos": "PRON", "Number": "plur", "Person": 3, "PronType": "emp"}, + "PRON__Number=Sing|Person=1": {"pos": "PRON", "Number": "sing", "Person": 1}, + "PRON__Number=Sing|Person=1|PronType=Emp": {"pos": "PRON", "Number": "sing", "Person": 1, "PronType": "emp"}, + "PRON__Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2}, + "PRON__Number=Sing|Person=2|PronType=Emp": {"pos": "PRON", "Number": "sing", "Person": 2, "PronType": "emp"}, + "PRON__Number=Sing|Person=3": {"pos": "PRON", "Number": "sing", "Person": 3}, + "PRON__Number=Sing|PronType=Int": {"pos": "PRON", "Number": "sing", "PronType": "int"}, + "PRON__PronType=Dem": {"pos": "PRON", "PronType": "dem"}, + "PRON__PronType=Ind": {"pos": "PRON", "PronType": "ind"}, + "PRON__PronType=Int": {"pos": "PRON", "PronType": "int"}, + "PRON__Reflex=Yes": {"pos": "PRON", "Reflex": "yes"}, + "PROPN__Abbr=Yes": {"pos": "PROPN", "Other": {"Abbr": "yes"}}, + "PROPN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "dat", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, + "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}}, + "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, + "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": 
"PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=Gen|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem"}, + "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "PROPN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing"}, + "PROPN__Case=Gen|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc"}, + "PROPN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"}, + "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, + "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, + "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + 
"PROPN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, + "PROPN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, + "PROPN__Case=NomAcc|Gender=Masc": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc"}, + "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Other": {"Form": "len"}}, + "PROPN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "voc", "Gender": "masc", "Number": "sing"}, + "PROPN__Gender=Masc|Number=Sing": {"pos": "PROPN", "Gender": "masc", "Number": "sing"}, + "PROPN___": {"pos": "PROPN"}, + "PUNCT___": {"pos": "PUNCT"}, + "SCONJ___": {"pos": "SCONJ"}, + "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "Other": {"VerbForm": "cop"}}, + "SYM__Abbr=Yes": {"pos": "SYM", "Other": {"Abbr": "yes"}}, + "VERB__Case=NomAcc|Gender=Masc|Mood=Ind|Number=Sing|Tense=Pres": {"pos": "VERB", "Case": "nom|acc", "Gender": "masc", "Mood": "ind", "Number": "sing", "Tense": "pres"}, + "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, + "VERB__Foreign=Yes": {"pos": "VERB", "Foreign": "yes"}, + "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl", "Voice": "auto"}}, + 
"VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": 
"ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl": {"pos": "VERB", "Other": {"Form": "ecl"}}, + "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Other": {"Form": "emp"}}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}}, + "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Other": {"Form": "len", 
"Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": 
"pres", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Other": {"Form": "len"}}, + "VERB__Form=Len": {"pos": "VERB", "Other": {"Form": "len"}}, + "VERB__Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3}, + "VERB__Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1}, + "VERB__Mood=Cnd": {"pos": "VERB", "Mood": "cnd"}, + "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Voice": "auto"}}, + "VERB__Mood=Imp|Number=Plur|Person=1|Polarity=Neg": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1, "Polarity": "neg"}, + "VERB__Mood=Imp|Number=Plur|Person=1": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1}, + "VERB__Mood=Imp|Number=Plur|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 2}, + "VERB__Mood=Imp|Number=Sing|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 2}, + "VERB__Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": 
"past"}, + "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past"}, + "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres"}, + "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past"}, + "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres"}, + "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres"}, + "VERB__Mood=Ind|PronType=Rel|Tense=Fut": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "fut"}, + "VERB__Mood=Ind|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "pres"}, + "VERB__Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut"}, + "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past"}, + "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres"}, + "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Voice": "auto"}}, + "VERB___": {"pos": "VERB"}, + "X__Abbr=Yes": {"pos": "X", "Other": {"Abbr": "yes"}}, + "X__Case=NomAcc|Foreign=Yes|Gender=Fem|Number=Sing": {"pos": "X", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Foreign": "yes"}, + "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Other": {"Dialect": "ulster"}}, + 
"X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, + "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", "Number": "sing", "Person": 2, "Polarity": "neg", "Other": {"Dialect": "munster"}}, + "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Voice": "auto"}}, + "X__Dialect=Munster": {"pos": "X", "Other": {"Dialect": "munster"}}, + "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Other": {"Dialect": "munster"}}, + "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"Dialect": "ulster"}}, + "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Other": {"Dialect": "ulster", "PartType": "vb"}}, + "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}}, + "X__Foreign=Yes": {"pos": "X", "Foreign": "yes"}, + "X___": {"pos": "X"} +} \ No newline at end of file From c069b4acb5317098d95d753a30160e3b52bbb209 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 8 Aug 2017 19:22:14 +0100 Subject: [PATCH 12/25] fix in UD submitted; map either way --- spacy/lang/ga/tag_map.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py index 598d368bb..22a6bacd0 100644 --- a/spacy/lang/ga/tag_map.py +++ b/spacy/lang/ga/tag_map.py @@ -25,7 +25,9 @@ TAG_MAP = { "ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"}, "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}}, "ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, + "ADJ__Gender=Masc|Number=Sing|Case=Voc": {"pos": "ADJ", "Gender": "masc", "Number": 
"sing", "Case": "voc"}, "ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, + "ADJ__Number=Plur|Case=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, "ADJ__Number=Plur": {"pos": "ADJ", "Number": "plur"}, "ADJ___": {"pos": "ADJ"}, "ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"}, @@ -363,4 +365,4 @@ TAG_MAP = { "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}}, "X__Foreign=Yes": {"pos": "X", "Foreign": "yes"}, "X___": {"pos": "X"} -} \ No newline at end of file +} From c283e9edfe9618e5b48193dad4b0b1844ffee72a Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 08:57:48 +0100 Subject: [PATCH 13/25] first stab at test --- spacy/tests/lang/ga/__init__.py | 0 spacy/tests/lang/ga/test_tokenizer.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 spacy/tests/lang/ga/__init__.py create mode 100644 spacy/tests/lang/ga/test_tokenizer.py diff --git a/spacy/tests/lang/ga/__init__.py b/spacy/tests/lang/ga/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py new file mode 100644 index 000000000..fe5cb0b2f --- /dev/null +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +SV_TOKEN_EXCEPTION_TESTS = [ + ('B\'fhearr fanacht as amharc', ['B\'', 'fhearr', 'fanacht', 'as', 'amharc']), + ('Daoine a bhfuil Gaeilge acu, m.sh. 
tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) +] + + +@pytest.mark.parametrize('text,expected_tokens', GA_TOKEN_EXCEPTION_TESTS) +def test_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens): + tokens = ga_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list + From 187be6d372c8ef86c77483a6558f7592c3d0a2dc Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 09:33:17 +0100 Subject: [PATCH 14/25] copy/paste error --- spacy/tests/lang/ga/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index fe5cb0b2f..5b45dddc1 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -SV_TOKEN_EXCEPTION_TESTS = [ +GA_TOKEN_EXCEPTION_TESTS = [ ('B\'fhearr fanacht as amharc', ['B\'', 'fhearr', 'fanacht', 'as', 'amharc']), ('Daoine a bhfuil Gaeilge acu, m.sh. 
tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) ] From 9dfd30196289536bf0bbc029d1b0d36c0adbc190 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 10:14:18 +0100 Subject: [PATCH 15/25] rearrange --- spacy/lang/ga/__init__.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 7b72a8a91..38b73468f 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -10,15 +10,16 @@ from ...attrs import LANG from ...util import update_exc +class IrishDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ga' + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = set(STOP_WORDS) + class Irish(Language): lang = 'ga' - - class Defaults(Language.Defaults): - lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: 'ga' - - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + Defaults = IrishDefaults __all__ = ['Irish'] From b1b6123867209d18cfd5ab958731aac997f4f0d6 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 10:31:41 +0100 Subject: [PATCH 16/25] add ga_tokenizer --- spacy/tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index f5d65803a..1e9838d41 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -99,6 +99,10 @@ def sv_tokenizer(): def bn_tokenizer(): return util.get_lang_class('bn').Defaults.create_tokenizer() +@pytest.fixture +def ga_tokenizer(): + return util.get_lang_class('ga').Defaults.create_tokenizer() + @pytest.fixture def he_tokenizer(): From 7de709483bd9df2890672f2d17d8277d684d07d2 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 10:51:21 +0100 Subject: [PATCH 17/25] 
missed adding here --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 1e9838d41..4da1ae301 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -12,7 +12,7 @@ from .. import util _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', - 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] + 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'ga', 'xx'] _models = {'en': ['en_core_web_sm'], 'de': ['de_core_news_md'], 'fr': ['fr_depvec_web_lg'], From 34ca59691b6db947679552e899ca7209e41451db Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 31 Oct 2017 14:50:13 +0000 Subject: [PATCH 18/25] no idea what is wrong here --- spacy/tests/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 5fa0c0cb7..ee4093db3 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -14,9 +14,8 @@ from .. 
import util # These languages are used for generic tokenizer tests – only add a language # here if it's using spaCy's tokenizer (not a different library) # TODO: re-implement generic tokenizer tests -_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', - 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'ga', 'xx'] - +_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id', + 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] _models = {'en': ['en_core_web_sm'], 'de': ['de_core_news_md'], 'fr': ['fr_depvec_web_lg'], @@ -107,6 +106,7 @@ def sv_tokenizer(): def bn_tokenizer(): return util.get_lang_class('bn').Defaults.create_tokenizer() + @pytest.fixture def ga_tokenizer(): return util.get_lang_class('ga').Defaults.create_tokenizer() From d4a8160c3641f122396f0fe49e39459ce952ab9f Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 31 Oct 2017 15:15:44 +0000 Subject: [PATCH 19/25] change quotes --- spacy/tests/lang/ga/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index 5b45dddc1..9cfbd555e 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -5,7 +5,7 @@ import pytest GA_TOKEN_EXCEPTION_TESTS = [ - ('B\'fhearr fanacht as amharc', ['B\'', 'fhearr', 'fanacht', 'as', 'amharc']), + ("B'fhearr fanacht as amharc", ["B'", "fhearr", "fanacht", "as", "amharc"]), ('Daoine a bhfuil Gaeilge acu, m.sh. 
tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) ] From 9b0de9fb43fc5fccaeb3115c5b24c123b45e89ab Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 31 Oct 2017 19:17:58 +0100 Subject: [PATCH 20/25] Fix import of symbols (now nested one level lower) --- spacy/lang/ga/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index 7d29f4bcc..70ee051e9 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -1,7 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import ORTH, LEMMA, NORM, POS +from ...symbols import ORTH, LEMMA, NORM, POS _exc = { From 147448b65b5781ce692000e3efa13cde526aa6d6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 31 Oct 2017 19:34:45 +0100 Subject: [PATCH 21/25] Add missing symbols --- spacy/lang/ga/tokenizer_exceptions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index 70ee051e9..185b08895 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -1,7 +1,8 @@ # encoding: utf8 from __future__ import unicode_literals -from ...symbols import ORTH, LEMMA, NORM, POS +from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX +from ...symbols import ORTH, LEMMA, NORM _exc = { From 06c25a888244e8520d5eeb2df8d5d86499325f48 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 31 Oct 2017 20:13:16 +0100 Subject: [PATCH 22/25] Remove comma that caused list to wrap in tuple! 
Also removed extra dict wrappings for performance (we used to have them in there, but they should only really exist if copying the dict is absolutely necessary) --- spacy/lang/ga/tokenizer_exceptions.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index 185b08895..e93ada52f 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -24,8 +24,7 @@ _exc = { "led'": [ {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP}, - {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}], - + {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}] } for exc_data in [ @@ -77,11 +76,11 @@ for exc_data in [ {ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN}, {ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN}, {ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN}]: - _exc[exc_data[ORTH]] = [dict(exc_data)], + _exc[exc_data[ORTH]] = [exc_data] for orth in [ "d'", "D'"]: _exc[orth] = [{ORTH: orth}] -TOKENIZER_EXCEPTIONS = dict(_exc) +TOKENIZER_EXCEPTIONS = _exc From fe4b10346a4e625ee3d286262b76f5a248e68a24 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 31 Oct 2017 20:24:53 +0000 Subject: [PATCH 23/25] replace example sentence until I get around to adding a punctuation.py --- spacy/tests/lang/ga/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index 9cfbd555e..1c6f68bad 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -5,7 +5,7 @@ import pytest GA_TOKEN_EXCEPTION_TESTS = [ - ("B'fhearr fanacht as amharc", ["B'", "fhearr", "fanacht", "as", "amharc"]), + ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'O', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '('. 'lch.', '600', ')', '.']), ('Daoine a bhfuil Gaeilge acu, m.sh. 
tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) ] From 25b1d6cd9151b1544e36295bb1a1e171abf268c0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 31 Oct 2017 22:36:03 +0100 Subject: [PATCH 24/25] Fix syntax error --- spacy/tests/lang/ga/test_tokenizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index 1c6f68bad..cf3caf6de 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -5,7 +5,7 @@ import pytest GA_TOKEN_EXCEPTION_TESTS = [ - ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'O', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '('. 'lch.', '600', ')', '.']), + ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'O', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '(', 'lch.', '600', ')', '.']), ('Daoine a bhfuil Gaeilge acu, m.sh. 
tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) ] @@ -15,4 +15,3 @@ def test_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens): tokens = ga_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list - From 00ecfa5417e6ceff9a2ef55ad36ffe475aa2e65b Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 31 Oct 2017 22:54:42 +0000 Subject: [PATCH 25/25] =?UTF-8?q?=C3=93,=20not=20O?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spacy/tests/lang/ga/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index 1c6f68bad..8aa917a6f 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -5,7 +5,7 @@ import pytest GA_TOKEN_EXCEPTION_TESTS = [ - ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'O', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '('. 'lch.', '600', ')', '.']), + ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'Ó', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '(', 'lch.', '600', ')', '.']), ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) ]