From 1eb7cc3017a6def34fb448781578888764d1e659 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 21:24:55 +0100 Subject: [PATCH 01/90] attempt a port from #1147 --- spacy/lang/ga/__init__.py | 24 ++++++ spacy/lang/ga/stop_words.py | 45 ++++++++++ spacy/lang/ga/tokenizer_exceptions.py | 115 ++++++++++++++++++++++++++ 3 files changed, 184 insertions(+) create mode 100644 spacy/lang/ga/__init__.py create mode 100644 spacy/lang/ga/stop_words.py create mode 100644 spacy/lang/ga/tokenizer_exceptions.py diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py new file mode 100644 index 000000000..8231cc925 --- /dev/null +++ b/spacy/lang/ga/__init__.py @@ -0,0 +1,24 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG +from ...util import update_exc + + +class Irish(Language): + lang = 'nb' + + class Defaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ga' + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = set(STOP_WORDS) + + +__all__ = ['Irish'] diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py new file mode 100644 index 000000000..816c00b13 --- /dev/null +++ b/spacy/lang/ga/stop_words.py @@ -0,0 +1,45 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +a ach ag agus an aon ar arna as + +ba beirt bhúr + +caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear + +daichead dar de deich deichniúr den dhá do don dtí dá dár dó + +faoi faoin faoina faoinár fara fiche + +gach gan go gur + +haon hocht + +i iad idir in ina ins inár is + +le leis lena lenár + +mar mo muid mé + +na nach naoi naonúr ná ní níor nó nócha + +ocht ochtar ochtó os + +roimh + +sa seacht 
seachtar seachtó seasca seisear siad sibh sinn sna sé sí + +tar thar thú triúr trí trína trínár tríocha tú + +um + +ár + +é éis + +í + +ó ón óna ónár +""".split()) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py new file mode 100644 index 000000000..ce280a3a2 --- /dev/null +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -0,0 +1,115 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import ORTH, LEMMA, NORM + + +_exc = { + "'acha'n": [ + {ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, + {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}], + + "dem'": [ + {ORTH: "de", LEMMA: "de", NORM: "de"}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo"}], + + "ded'": [ + {ORTH: "de", LEMMA: "de", NORM: "de"}, + {ORTH: "d'", LEMMA: "do", NORM: "do"}], + + "lem'": [ + {ORTH: "le", LEMMA: "le", NORM: "le"}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo"}], + + "led'": [ + {ORTH: "le", LEMMA: "le", NORM: "le"}, + {ORTH: "d'", LEMMA: "mo", NORM: "do"}], + + "a.C.n.": [ + {ORTH: "a.", LEMMA: "ante"}, + {ORTH: "C.", LEMMA: "Christum"}, + {ORTH: "n.", LEMMA: "natum"}], + + "m.sh.": [ + {ORTH: "m.", LEMMA: "mar"}, + {ORTH: "sh.", LEMMA: "sampla"}], + + "M.F.": [ + {ORTH: "M.", LEMMA: "Meán"}, + {ORTH: "F.", LEMMA: "Fómhar"}], + + "M.Fómh.": [ + {ORTH: "M.", LEMMA: "Meán"}, + {ORTH: "Fómh.", LEMMA: "Fómhar"}], + + "R.C.": [ + {ORTH: "Rr.", LEMMA: "roimh"}, + {ORTH: "C.", LEMMA: "Críost"}], + + "r.Ch.": [ + {ORTH: "r.", LEMMA: "roimh"}, + {ORTH: "Ch.", LEMMA: "Críost"}], + + "r.Chr.": [ + {ORTH: "r.", LEMMA: "roimh"}, + {ORTH: "Chr.", LEMMA: "Críost"}], + + "R.Ch.": [ + {ORTH: "R.", LEMMA: "roimh"}, + {ORTH: "Ch.", LEMMA: "Críost"}], + + "R.Chr.": [ + {ORTH: "R.", LEMMA: "roimh"}, + {ORTH: "Chr.", LEMMA: "Críost"}], + + "⁊rl.": [ + {ORTH: "⁊", LEMMA: "agus"}, + {ORTH: "rl.", LEMMA: "araile"}], + + "srl.": [ + {ORTH: "s", LEMMA: "agus"}, + {ORTH: "rl.", LEMMA: "araile"}], + +} + +for exc_data in [ + {ORTH: "'gus", LEMMA: "agus", NORM: "agus"}, + 
{ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, + {ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, + {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"}, + {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"}, + {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"}, + {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"}, + {ORTH: "m'", LEMMA: "mo"},, + {ORTH: "Aib.", LEMMA: "Aibreán"}, + {ORTH: "Ath.", LEMMA: "athair"}, + {ORTH: "Beal.", LEMMA: "Bealtaine"}, + {ORTH: "Co.", LEMMA: "contae"}, + {ORTH: "Ean.", LEMMA: "Eanáir"}, + {ORTH: "Feab.", LEMMA: "Feabhra"}, + {ORTH: "gCo.", LEMMA: "contae"}, + {ORTH: ".i.", LEMMA: "eadhon"}, + {ORTH: "lch.", LEMMA: "leathanach"}, + {ORTH: "Lch.", LEMMA: "leathanach"}, + {ORTH: "lgh.", LEMMA: "leathanach"}, + {ORTH: "Lgh.", LEMMA: "leathanach"}, + {ORTH: "Lún.", LEMMA: "Lúnasa"}, + {ORTH: "Már.", LEMMA: "Márta"}, + {ORTH: "Meith.", LEMMA: "Meitheamh"}, + {ORTH: "Noll.", LEMMA: "Nollaig"}, + {ORTH: "Samh.", LEMMA: "Samhain"}, + {ORTH: "tAth.", LEMMA: "athair"}, + {ORTH: "tUas.", LEMMA: "Uasal"}, + {ORTH: "teo.", LEMMA: "teoranta"}, + {ORTH: "Teo.", LEMMA: "teoranta"}, + {ORTH: "Uas.", LEMMA: "Uasal"}, + {ORTH: "uimh.", LEMMA: "uimhir"}, + {ORTH: "Uimh.", LEMMA: "uimhir"}]: + _exc[exc_data[ORTH]] = [dict(exc_data)], + +for orth in [ + "d'"]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = dict(_exc) From e9213f54deece142fff6c4ff0a2ae4106288f417 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 21:29:21 +0100 Subject: [PATCH 02/90] missed one --- spacy/lang/ga/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 8231cc925..7b72a8a91 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -11,7 +11,7 @@ from ...util import update_exc class Irish(Language): - lang = 'nb' + lang = 'ga' class Defaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) From 3c4d83aa6e634b19889338bdf3c0dfd593f9fdc6 Mon Sep 17 
00:00:00 2001 From: Jim O'Regan Date: Sat, 24 Jun 2017 22:29:02 +0100 Subject: [PATCH 03/90] CLA --- .github/contributors/jimregan.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/jimregan.md diff --git a/.github/contributors/jimregan.md b/.github/contributors/jimregan.md new file mode 100644 index 000000000..dd8fe3d64 --- /dev/null +++ b/.github/contributors/jimregan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jim O'Regan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-06-24 | +| GitHub username | jimregan | +| Website (optional) | | From a8dff9133e84671a3111390a8f4e8965ec744519 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 21:53:41 +0100 Subject: [PATCH 04/90] add POS --- spacy/lang/ga/tokenizer_exceptions.py | 130 +++++++++++++------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index ce280a3a2..3dca1c3d7 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -1,110 +1,110 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import ORTH, LEMMA, NORM +from ..symbols import ORTH, LEMMA, NORM, POS _exc = { "'acha'n": [ - {ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, - {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}], + {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, + {ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET}], "dem'": [ - {ORTH: "de", LEMMA: "de", NORM: "de"}, - {ORTH: "m'", LEMMA: "mo", NORM: "mo"}], + {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}], "ded'": [ - {ORTH: "de", LEMMA: "de", NORM: "de"}, - {ORTH: "d'", LEMMA: "do", NORM: "do"}], + {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP}, + {ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET}], "lem'": [ - {ORTH: "le", LEMMA: "le", NORM: "le"}, - {ORTH: "m'", LEMMA: "mo", NORM: "mo"}], + {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}], "led'": [ - {ORTH: "le", LEMMA: "le", NORM: "le"}, - {ORTH: "d'", LEMMA: "mo", NORM: "do"}], + {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP}, + {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}], "a.C.n.": [ - {ORTH: "a.", LEMMA: "ante"}, - 
{ORTH: "C.", LEMMA: "Christum"}, - {ORTH: "n.", LEMMA: "natum"}], + {ORTH: "a.", LEMMA: "ante", POS: X}, + {ORTH: "C.", LEMMA: "Christum", POS: X}, + {ORTH: "n.", LEMMA: "natum", POS: X}], "m.sh.": [ - {ORTH: "m.", LEMMA: "mar"}, - {ORTH: "sh.", LEMMA: "sampla"}], + {ORTH: "m.", LEMMA: "mar", POS: ADP}, + {ORTH: "sh.", LEMMA: "sampla", POS: NOUN}], "M.F.": [ - {ORTH: "M.", LEMMA: "Meán"}, - {ORTH: "F.", LEMMA: "Fómhar"}], + {ORTH: "M.", LEMMA: "Meán", POS: NOUN}, + {ORTH: "F.", LEMMA: "Fómhar", POS: NOUN}], "M.Fómh.": [ - {ORTH: "M.", LEMMA: "Meán"}, - {ORTH: "Fómh.", LEMMA: "Fómhar"}], + {ORTH: "M.", LEMMA: "Meán", POS: NOUN}, + {ORTH: "Fómh.", LEMMA: "Fómhar", POS: NOUN}], "R.C.": [ - {ORTH: "Rr.", LEMMA: "roimh"}, - {ORTH: "C.", LEMMA: "Críost"}], + {ORTH: "Rr.", LEMMA: "roimh", POS: ADP}, + {ORTH: "C.", LEMMA: "Críost", POS: NOUN}], "r.Ch.": [ - {ORTH: "r.", LEMMA: "roimh"}, - {ORTH: "Ch.", LEMMA: "Críost"}], + {ORTH: "r.", LEMMA: "roimh", POS: ADP}, + {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}], "r.Chr.": [ - {ORTH: "r.", LEMMA: "roimh"}, - {ORTH: "Chr.", LEMMA: "Críost"}], + {ORTH: "r.", LEMMA: "roimh", POS: ADP}, + {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}], "R.Ch.": [ - {ORTH: "R.", LEMMA: "roimh"}, - {ORTH: "Ch.", LEMMA: "Críost"}], + {ORTH: "R.", LEMMA: "roimh", POS: ADP}, + {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}], "R.Chr.": [ - {ORTH: "R.", LEMMA: "roimh"}, - {ORTH: "Chr.", LEMMA: "Críost"}], + {ORTH: "R.", LEMMA: "roimh", POS: ADP}, + {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}], "⁊rl.": [ - {ORTH: "⁊", LEMMA: "agus"}, - {ORTH: "rl.", LEMMA: "araile"}], + {ORTH: "⁊", LEMMA: "agus", POS: CCONJ}, + {ORTH: "rl.", LEMMA: "araile", POS: ADJ}], "srl.": [ - {ORTH: "s", LEMMA: "agus"}, - {ORTH: "rl.", LEMMA: "araile"}], + {ORTH: "s", LEMMA: "agus", POS: CCONJ}, + {ORTH: "rl.", LEMMA: "araile", POS: ADJ}], } for exc_data in [ - {ORTH: "'gus", LEMMA: "agus", NORM: "agus"}, - {ORTH: "'ach", LEMMA: "gach", NORM: "gach"}, + {ORTH: "'gus", LEMMA: "agus", NORM: 
"agus", POS: CCONJ}, + {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, {ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, - {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"}, - {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"}, - {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"}, - {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"}, - {ORTH: "m'", LEMMA: "mo"},, - {ORTH: "Aib.", LEMMA: "Aibreán"}, - {ORTH: "Ath.", LEMMA: "athair"}, - {ORTH: "Beal.", LEMMA: "Bealtaine"}, - {ORTH: "Co.", LEMMA: "contae"}, - {ORTH: "Ean.", LEMMA: "Eanáir"}, - {ORTH: "Feab.", LEMMA: "Feabhra"}, - {ORTH: "gCo.", LEMMA: "contae"}, - {ORTH: ".i.", LEMMA: "eadhon"}, - {ORTH: "lch.", LEMMA: "leathanach"}, - {ORTH: "Lch.", LEMMA: "leathanach"}, - {ORTH: "lgh.", LEMMA: "leathanach"}, - {ORTH: "Lgh.", LEMMA: "leathanach"}, - {ORTH: "Lún.", LEMMA: "Lúnasa"}, - {ORTH: "Már.", LEMMA: "Márta"}, - {ORTH: "Meith.", LEMMA: "Meitheamh"}, - {ORTH: "Noll.", LEMMA: "Nollaig"}, - {ORTH: "Samh.", LEMMA: "Samhain"}, - {ORTH: "tAth.", LEMMA: "athair"}, - {ORTH: "tUas.", LEMMA: "Uasal"}, - {ORTH: "teo.", LEMMA: "teoranta"}, - {ORTH: "Teo.", LEMMA: "teoranta"}, - {ORTH: "Uas.", LEMMA: "Uasal"}, - {ORTH: "uimh.", LEMMA: "uimhir"}, - {ORTH: "Uimh.", LEMMA: "uimhir"}]: + {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV}, + {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV}, + {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV}, + {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV}, + {ORTH: "m'", LEMMA: "mo", POS: DET}, + {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN}, + {ORTH: "Ath.", LEMMA: "athair", POS: NOUN}, + {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN}, + {ORTH: "Co.", LEMMA: "contae", POS: NOUN}, + {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN}, + {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, + {ORTH: "gCo.", LEMMA: "contae", POS: NOUN}, + {ORTH: ".i.", LEMMA: "eadhon", POS: ADV}, + {ORTH: "lch.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "lgh.", 
LEMMA: "leathanach", POS: NOUN}, + {ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN}, + {ORTH: "Már.", LEMMA: "Márta", POS: NOUN}, + {ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN}, + {ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN}, + {ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN}, + {ORTH: "tAth.", LEMMA: "athair", POS: NOUN}, + {ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN}, + {ORTH: "teo.", LEMMA: "teoranta", POS: NOUN}, + {ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN}, + {ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN}, + {ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN}, + {ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN}]: _exc[exc_data[ORTH]] = [dict(exc_data)], for orth in [ From 5e5f94c1c0939da81dc939ed10c639f50557522c Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 21:57:00 +0100 Subject: [PATCH 05/90] fix dup --- spacy/lang/ga/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index 3dca1c3d7..fad51a2fb 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -43,7 +43,7 @@ _exc = { {ORTH: "Fómh.", LEMMA: "Fómhar", POS: NOUN}], "R.C.": [ - {ORTH: "Rr.", LEMMA: "roimh", POS: ADP}, + {ORTH: "R.", LEMMA: "roimh", POS: ADP}, {ORTH: "C.", LEMMA: "Críost", POS: NOUN}], "r.Ch.": [ From c1e4e0f3bf355eb7771759fcab58229f630e98e5 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 22:19:39 +0100 Subject: [PATCH 06/90] just now discovered that you can do multiwords --- spacy/lang/ga/tokenizer_exceptions.py | 56 ++++++--------------------- 1 file changed, 11 insertions(+), 45 deletions(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index fad51a2fb..afd901e33 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -25,51 +25,6 @@ _exc = { {ORTH: "le", LEMMA: "le", NORM: "le", POS: 
ADP}, {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}], - "a.C.n.": [ - {ORTH: "a.", LEMMA: "ante", POS: X}, - {ORTH: "C.", LEMMA: "Christum", POS: X}, - {ORTH: "n.", LEMMA: "natum", POS: X}], - - "m.sh.": [ - {ORTH: "m.", LEMMA: "mar", POS: ADP}, - {ORTH: "sh.", LEMMA: "sampla", POS: NOUN}], - - "M.F.": [ - {ORTH: "M.", LEMMA: "Meán", POS: NOUN}, - {ORTH: "F.", LEMMA: "Fómhar", POS: NOUN}], - - "M.Fómh.": [ - {ORTH: "M.", LEMMA: "Meán", POS: NOUN}, - {ORTH: "Fómh.", LEMMA: "Fómhar", POS: NOUN}], - - "R.C.": [ - {ORTH: "R.", LEMMA: "roimh", POS: ADP}, - {ORTH: "C.", LEMMA: "Críost", POS: NOUN}], - - "r.Ch.": [ - {ORTH: "r.", LEMMA: "roimh", POS: ADP}, - {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}], - - "r.Chr.": [ - {ORTH: "r.", LEMMA: "roimh", POS: ADP}, - {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}], - - "R.Ch.": [ - {ORTH: "R.", LEMMA: "roimh", POS: ADP}, - {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}], - - "R.Chr.": [ - {ORTH: "R.", LEMMA: "roimh", POS: ADP}, - {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}], - - "⁊rl.": [ - {ORTH: "⁊", LEMMA: "agus", POS: CCONJ}, - {ORTH: "rl.", LEMMA: "araile", POS: ADJ}], - - "srl.": [ - {ORTH: "s", LEMMA: "agus", POS: CCONJ}, - {ORTH: "rl.", LEMMA: "araile", POS: ADJ}], - } for exc_data in [ @@ -84,6 +39,17 @@ for exc_data in [ {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN}, {ORTH: "Ath.", LEMMA: "athair", POS: NOUN}, {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN}, + {ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X}, + {ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV}, + {ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN}, + {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN}, + {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN}, + {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN}, + {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV}, + {ORTH: "srl.", LEMMA: "agus araile", 
POS: ADV}, {ORTH: "Co.", LEMMA: "contae", POS: NOUN}, {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN}, {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, From e12defdd9ca8748f553f3360ffc0242ee234fd25 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 26 Jun 2017 22:24:14 +0100 Subject: [PATCH 07/90] missed a couple --- spacy/lang/ga/tokenizer_exceptions.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index afd901e33..2f6d1ebdf 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -45,9 +45,12 @@ for exc_data in [ {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN}, {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN}, {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN}, + {ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV}, {ORTH: "srl.", LEMMA: "agus araile", POS: ADV}, {ORTH: "Co.", LEMMA: "contae", POS: NOUN}, From 559e03605a52d2c68ba5e565ff69d0a09690f4f5 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 27 Jun 2017 22:42:16 +0100 Subject: [PATCH 08/90] b' --- spacy/lang/ga/tokenizer_exceptions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index 2f6d1ebdf..7d29f4bcc 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -58,6 +58,8 @@ for exc_data in [ {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, {ORTH: "gCo.", LEMMA: "contae", POS: NOUN}, {ORTH: ".i.", LEMMA: "eadhon", POS: ADV}, + {ORTH: "B'", LEMMA: "ba", POS: AUX}, + {ORTH: "b'", LEMMA: "ba", POS: AUX}, {ORTH: "lch.", LEMMA: 
"leathanach", POS: NOUN}, {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN}, @@ -77,7 +79,7 @@ for exc_data in [ _exc[exc_data[ORTH]] = [dict(exc_data)], for orth in [ - "d'"]: + "d'", "D'"]: _exc[orth] = [{ORTH: orth}] From 1ba38b2036e69ea0ff400e14e217d887f09f7165 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Wed, 28 Jun 2017 00:42:00 +0100 Subject: [PATCH 09/90] some helpers; the Irish part of UD only has 2500 sentences so this will need source of morphology --- spacy/lang/ga/irish_morphology_helpers.py | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 spacy/lang/ga/irish_morphology_helpers.py diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py new file mode 100644 index 000000000..2b008f295 --- /dev/null +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -0,0 +1,33 @@ +# coding: utf8 +from __future__ import unicode_literals + +class IrishMorph: + consonants = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z'] + broad_vowels = ['a', 'á', 'o', 'ó', 'u', 'ú'] + slender_vowels = ['e', 'é', 'i', 'í'] + vowels = broad_vowels + slender_vowels + + def ends_dentals(word): + if word[-1:] in ['d', 'n', 't', 's']: + return True + else: + return False + + def devoice(word): + if word[-2] == 's' and word[-1] == 'd': + return word[:-1] + 't' + else: + return word + + def ends_with_vowel(word): + return word[-1] in vowels + + def starts_with_vowel(word): + return word[0] in vowels + + def deduplicate(word): + if word[-2] == word[-1] and word[-1] in consonants: + return word[:-1] + else: + return word + From 70f4d26c108dbb9b2dcfbf4a1c90d9fdfcea2a7d Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Wed, 28 Jun 2017 10:59:46 +0100 Subject: [PATCH 10/90] bounds checks --- spacy/lang/ga/irish_morphology_helpers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py index 2b008f295..383e24efc 100644 --- a/spacy/lang/ga/irish_morphology_helpers.py +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -8,25 +8,25 @@ class IrishMorph: vowels = broad_vowels + slender_vowels def ends_dentals(word): - if word[-1:] in ['d', 'n', 't', 's']: + if word != "" and word[-1] in ['d', 'n', 't', 's']: return True else: return False def devoice(word): - if word[-2] == 's' and word[-1] == 'd': + if len(word) > 2 and word[-2] == 's' and word[-1] == 'd': return word[:-1] + 't' else: return word def ends_with_vowel(word): - return word[-1] in vowels + return word != "" and word[-1] in vowels def starts_with_vowel(word): - return word[0] in vowels + return word != "" and word[0] in vowels def deduplicate(word): - if word[-2] == word[-1] and word[-1] in consonants: + if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants: return word[:-1] else: return word From 76c22dec4dba150fd848072472d0e4bb65fc4a65 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 8 Aug 2017 19:04:52 +0100 Subject: [PATCH 11/90] UD Irish tag mapping --- spacy/lang/ga/tag_map.py | 366 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 366 insertions(+) create mode 100644 spacy/lang/ga/tag_map.py diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py new file mode 100644 index 000000000..598d368bb --- /dev/null +++ b/spacy/lang/ga/tag_map.py @@ -0,0 +1,366 @@ +# coding: utf8 +from __future__ import unicode_literals + + +TAG_MAP = { + "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=Gen|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "fem", "Number": "sing"}, + "ADJ__Case=Gen|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing"}, + "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", 
"Number": "plur", "Other": {"NounType": "strong"}}, + "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "weak"}}, + "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, + "ADJ__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, + "ADJ__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, + "ADJ__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, + "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "notslender"}}, + "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "slender"}}, + "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Other": {"Form": "len"}}, + "ADJ__Degree=Cmp,Sup": {"pos": "ADJ", "Degree": "cmp|sup"}, + "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "ecl"}}, + "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "hpref"}}, + "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "len"}}, + "ADJ__Degree=Pos": {"pos": "ADJ", "Degree": "pos"}, + "ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"}, + "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}}, + "ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, + 
"ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, + "ADJ__Number=Plur": {"pos": "ADJ", "Number": "plur"}, + "ADJ___": {"pos": "ADJ"}, + "ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"}, + "ADP__Foreign=Yes": {"pos": "ADP", "Foreign": "yes"}, + "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Other": {"Form": "len"}}, + "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, + "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, "Other": {"Form": "len"}}, + "ADP__Gender=Fem|Number=Sing|Person=3": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3}, + "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, + "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"}, + "ADP__Gender=Masc|Number=Sing|Person=3": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3}, + "ADP__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"}, + "ADP__Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"}, + "ADP__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"}, + "ADP__Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1}, + "ADP__Number=Plur|Person=1|Poss=Yes": {"pos": "ADP", "Number": "plur", "Person": 1, "Poss": "yes"}, + "ADP__Number=Plur|Person=1|PronType=Emp": {"pos": "ADP", "Number": "plur", "Person": 1, "PronType": "emp"}, + "ADP__Number=Plur|Person=2": {"pos": "ADP", "Number": "plur", "Person": 2}, + "ADP__Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", 
"Person": 3}, + "ADP__Number=Plur|Person=3|Poss=Yes": {"pos": "ADP", "Number": "plur", "Person": 3, "Poss": "yes"}, + "ADP__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Number": "plur", "Person": 3, "Poss": "yes", "PronType": "prs"}, + "ADP__Number=Plur|Person=3|PronType=Emp": {"pos": "ADP", "Number": "plur", "Person": 3, "PronType": "emp"}, + "ADP__Number=Plur|PronType=Art": {"pos": "ADP", "Number": "plur", "PronType": "art"}, + "ADP__Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1}, + "ADP__Number=Sing|Person=1|Poss=Yes": {"pos": "ADP", "Number": "sing", "Person": 1, "Poss": "yes"}, + "ADP__Number=Sing|Person=1|PronType=Emp": {"pos": "ADP", "Number": "sing", "Person": 1, "PronType": "emp"}, + "ADP__Number=Sing|Person=2": {"pos": "ADP", "Number": "sing", "Person": 2}, + "ADP__Number=Sing|Person=3": {"pos": "ADP", "Number": "sing", "Person": 3}, + "ADP__Number=Sing|PronType=Art": {"pos": "ADP", "Number": "sing", "PronType": "art"}, + "ADP__Person=3|Poss=Yes": {"pos": "ADP", "Person": 3, "Poss": "yes"}, + "ADP___": {"pos": "ADP"}, + "ADP__Poss=Yes": {"pos": "ADP", "Poss": "yes"}, + "ADP__PrepForm=Cmpd": {"pos": "ADP", "Other": {"PrepForm": "cmpd"}}, + "ADP__PronType=Art": {"pos": "ADP", "PronType": "art"}, + "ADV__Form=Len": {"pos": "ADV", "Other": {"Form": "len"}}, + "ADV___": {"pos": "ADV"}, + "ADV__PronType=Int": {"pos": "ADV", "PronType": "int"}, + "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"Form": "vf", 
"VerbForm": "cop"}}, + "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"VerbForm": "cop"}}, + "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"PartType": "comp", "VerbForm": "cop"}}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX___": {"pos": "AUX"}, + "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "Other": {"VerbForm": "cop"}}, + "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + 
"AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX__VerbForm=Cop": {"pos": "AUX", "Other": {"VerbForm": "cop"}}, + "CCONJ___": {"pos": "CCONJ"}, + "DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, + "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Other": {"Form": "ecl"}}, + "DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, + "DET__Definite=Def|Number=Plur|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "plur", "PronType": "art"}, + "DET__Definite=Def|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "sing", "PronType": "art"}, + "DET__Definite=Def": {"pos": "DET", "Definite": "def"}, + "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Other": {"Form": "hpref"}}, + "DET__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, + "DET__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"}, + "DET__Number=Plur|Person=1|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 1, "Poss": "yes"}, + "DET__Number=Plur|Person=3|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 3, "Poss": "yes"}, + "DET__Number=Sing|Person=1|Poss=Yes": {"pos": "DET", "Number": "sing", "Person": 1, "Poss": "yes"}, + "DET__Number=Sing|Person=2|Poss=Yes": {"pos": "DET", "Number": "sing", "Person": 2, "Poss": "yes"}, + "DET__Number=Sing|PronType=Int": {"pos": "DET", "Number": "sing", "PronType": "int"}, + "DET___": {"pos": "DET"}, + "DET__PronType=Dem": {"pos": "DET", "PronType": "dem"}, + "DET__PronType=Ind": {"pos": "DET", "PronType": 
"ind"}, + "NOUN__Case=Dat|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Definite": "ind", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Dat|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Gen|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "ind", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + 
"NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "weak"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, + "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "strong"}}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "weak"}}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Other": {"Form": "len"}}, + "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur"}, + "NOUN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing"}, + 
"NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Gen|Number=Sing": {"pos": "NOUN", "Case": "gen", "Number": "sing"}, + "NOUN__Case=Gen|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "plur"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Fem": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=NomAcc|Definite=Ind|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "ind", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": 
"plur", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "emp"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, + "NOUN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "NOUN", 
"Case": "nom|acc", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Voc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Definite": "def", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing"}, + "NOUN__Degree=Pos": {"pos": "NOUN", "Degree": "pos"}, + "NOUN__Foreign=Yes": {"pos": "NOUN", "Foreign": "yes"}, + "NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Form=Ecl|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "ecl"}}, + "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Other": {"Form": "ecl"}}, + "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "hpref"}}, + "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "len"}}, + "NOUN__Gender=Fem|Number=Sing": {"pos": "NOUN", "Gender": "fem", "Number": "sing"}, + "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "Other": {"PartType": "comp"}}, + "NOUN__Number=Sing": {"pos": "NOUN", "Number": "sing"}, + "NOUN___": {"pos": "NOUN"}, + "NOUN__Reflex=Yes": {"pos": "NOUN", "Reflex": "yes"}, + "NOUN__VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf"}, + "NOUN__VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun"}, + "NUM__Definite=Def|NumType=Card": {"pos": "NUM", "Definite": "def", "NumType": "card"}, + 
"NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "ecl"}}, + "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "ecl"}}, + "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "hpref"}}, + "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "len"}}, + "NUM__Form=Len|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "len"}}, + "NUM__NumType=Card": {"pos": "NUM", "NumType": "card"}, + "NUM__NumType=Ord": {"pos": "NUM", "NumType": "ord"}, + "NUM___": {"pos": "NUM"}, + "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"Form": "ecl", "PartType": "vb"}}, + "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "Other": {"PartType": "vb"}}, + "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "Other": {"PartType": "vb"}}, + "PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"Mood": "int", "PartType": "vb"}}, + "PART__PartType=Ad": {"pos": "PART", "Other": {"PartType": "ad"}}, + "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "cmpl"}}, + "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "cmpl"}}, + "PART__PartType=Cmpl": {"pos": "PART", "Other": {"PartType": "cmpl"}}, + "PART__PartType=Comp": {"pos": "PART", "Other": {"PartType": "comp"}}, + "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "cop"}}, + "PART__PartType=Deg": {"pos": "PART", "Other": {"PartType": "deg"}}, + "PART__PartType=Inf": {"pos": "PART", "PartType": "inf"}, + "PART__PartType=Num": {"pos": "PART", "Other": {"PartType": "num"}}, + "PART__PartType=Pat": {"pos": "PART", "Other": {"PartType": "pat"}}, + "PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": 
"vb"}}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb": {"pos": "PART", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Voc": {"pos": "PART", "Other": {"PartType": "voc"}}, + "PART___": {"pos": "PART"}, + "PART__PronType=Rel": {"pos": "PART", "PronType": "rel"}, + "PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, + "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Other": {"Form": "len"}}, + "PRON__Gender=Fem|Number=Sing|Person=3": {"pos": "PRON", "Gender": "fem", "Number": "sing", "Person": 3}, + "PRON__Gender=Masc|Number=Sing|Person=3": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3}, + "PRON__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"}, + "PRON__Gender=Masc|Person=3": {"pos": "PRON", "Gender": "masc", "Person": 3}, + "PRON__Number=Plur|Person=1": {"pos": "PRON", "Number": "plur", "Person": 1}, + "PRON__Number=Plur|Person=1|PronType=Emp": {"pos": "PRON", "Number": "plur", "Person": 1, "PronType": "emp"}, + "PRON__Number=Plur|Person=2": {"pos": "PRON", "Number": "plur", "Person": 2}, + "PRON__Number=Plur|Person=3": {"pos": "PRON", "Number": "plur", "Person": 3}, + 
"PRON__Number=Plur|Person=3|PronType=Emp": {"pos": "PRON", "Number": "plur", "Person": 3, "PronType": "emp"}, + "PRON__Number=Sing|Person=1": {"pos": "PRON", "Number": "sing", "Person": 1}, + "PRON__Number=Sing|Person=1|PronType=Emp": {"pos": "PRON", "Number": "sing", "Person": 1, "PronType": "emp"}, + "PRON__Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2}, + "PRON__Number=Sing|Person=2|PronType=Emp": {"pos": "PRON", "Number": "sing", "Person": 2, "PronType": "emp"}, + "PRON__Number=Sing|Person=3": {"pos": "PRON", "Number": "sing", "Person": 3}, + "PRON__Number=Sing|PronType=Int": {"pos": "PRON", "Number": "sing", "PronType": "int"}, + "PRON__PronType=Dem": {"pos": "PRON", "PronType": "dem"}, + "PRON__PronType=Ind": {"pos": "PRON", "PronType": "ind"}, + "PRON__PronType=Int": {"pos": "PRON", "PronType": "int"}, + "PRON__Reflex=Yes": {"pos": "PRON", "Reflex": "yes"}, + "PROPN__Abbr=Yes": {"pos": "PROPN", "Other": {"Abbr": "yes"}}, + "PROPN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "dat", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, + "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}}, + "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, + "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": 
"PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=Gen|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem"}, + "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "PROPN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing"}, + "PROPN__Case=Gen|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc"}, + "PROPN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"}, + "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, + "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, + "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + 
"PROPN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, + "PROPN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, + "PROPN__Case=NomAcc|Gender=Masc": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc"}, + "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Other": {"Form": "len"}}, + "PROPN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "voc", "Gender": "masc", "Number": "sing"}, + "PROPN__Gender=Masc|Number=Sing": {"pos": "PROPN", "Gender": "masc", "Number": "sing"}, + "PROPN___": {"pos": "PROPN"}, + "PUNCT___": {"pos": "PUNCT"}, + "SCONJ___": {"pos": "SCONJ"}, + "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "Other": {"VerbForm": "cop"}}, + "SYM__Abbr=Yes": {"pos": "SYM", "Other": {"Abbr": "yes"}}, + "VERB__Case=NomAcc|Gender=Masc|Mood=Ind|Number=Sing|Tense=Pres": {"pos": "VERB", "Case": "nom|acc", "Gender": "masc", "Mood": "ind", "Number": "sing", "Tense": "pres"}, + "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, + "VERB__Foreign=Yes": {"pos": "VERB", "Foreign": "yes"}, + "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl", "Voice": "auto"}}, + 
"VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": 
"ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl": {"pos": "VERB", "Other": {"Form": "ecl"}}, + "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Other": {"Form": "emp"}}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}}, + "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Other": {"Form": "len", 
"Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": 
"pres", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Other": {"Form": "len"}}, + "VERB__Form=Len": {"pos": "VERB", "Other": {"Form": "len"}}, + "VERB__Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3}, + "VERB__Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1}, + "VERB__Mood=Cnd": {"pos": "VERB", "Mood": "cnd"}, + "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Voice": "auto"}}, + "VERB__Mood=Imp|Number=Plur|Person=1|Polarity=Neg": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1, "Polarity": "neg"}, + "VERB__Mood=Imp|Number=Plur|Person=1": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1}, + "VERB__Mood=Imp|Number=Plur|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 2}, + "VERB__Mood=Imp|Number=Sing|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 2}, + "VERB__Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": 
"past"}, + "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past"}, + "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres"}, + "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past"}, + "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres"}, + "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres"}, + "VERB__Mood=Ind|PronType=Rel|Tense=Fut": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "fut"}, + "VERB__Mood=Ind|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "pres"}, + "VERB__Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut"}, + "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past"}, + "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres"}, + "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Voice": "auto"}}, + "VERB___": {"pos": "VERB"}, + "X__Abbr=Yes": {"pos": "X", "Other": {"Abbr": "yes"}}, + "X__Case=NomAcc|Foreign=Yes|Gender=Fem|Number=Sing": {"pos": "X", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Foreign": "yes"}, + "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Other": {"Dialect": "ulster"}}, + 
"X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, + "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", "Number": "sing", "Person": 2, "Polarity": "neg", "Other": {"Dialect": "munster"}}, + "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Voice": "auto"}}, + "X__Dialect=Munster": {"pos": "X", "Other": {"Dialect": "munster"}}, + "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Other": {"Dialect": "munster"}}, + "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"Dialect": "ulster"}}, + "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Other": {"Dialect": "ulster", "PartType": "vb"}}, + "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}}, + "X__Foreign=Yes": {"pos": "X", "Foreign": "yes"}, + "X___": {"pos": "X"} +} \ No newline at end of file From c069b4acb5317098d95d753a30160e3b52bbb209 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Tue, 8 Aug 2017 19:22:14 +0100 Subject: [PATCH 12/90] fix in UD submitted; map either way --- spacy/lang/ga/tag_map.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py index 598d368bb..22a6bacd0 100644 --- a/spacy/lang/ga/tag_map.py +++ b/spacy/lang/ga/tag_map.py @@ -25,7 +25,9 @@ TAG_MAP = { "ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"}, "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}}, "ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, + "ADJ__Gender=Masc|Number=Sing|Case=Voc": {"pos": "ADJ", "Gender": "masc", "Number": 
"sing", "Case": "voc"}, "ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, + "ADJ__Number=Plur|Case=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, "ADJ__Number=Plur": {"pos": "ADJ", "Number": "plur"}, "ADJ___": {"pos": "ADJ"}, "ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"}, @@ -363,4 +365,4 @@ TAG_MAP = { "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}}, "X__Foreign=Yes": {"pos": "X", "Foreign": "yes"}, "X___": {"pos": "X"} -} \ No newline at end of file +} From c283e9edfe9618e5b48193dad4b0b1844ffee72a Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 08:57:48 +0100 Subject: [PATCH 13/90] first stab at test --- spacy/tests/lang/ga/__init__.py | 0 spacy/tests/lang/ga/test_tokenizer.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 spacy/tests/lang/ga/__init__.py create mode 100644 spacy/tests/lang/ga/test_tokenizer.py diff --git a/spacy/tests/lang/ga/__init__.py b/spacy/tests/lang/ga/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py new file mode 100644 index 000000000..fe5cb0b2f --- /dev/null +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +SV_TOKEN_EXCEPTION_TESTS = [ + ('B\'fhearr fanacht as amharc', ['B\'', 'fhearr', 'fanacht', 'as', 'amharc']), + ('Daoine a bhfuil Gaeilge acu, m.sh. 
tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) +] + + +@pytest.mark.parametrize('text,expected_tokens', GA_TOKEN_EXCEPTION_TESTS) +def test_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens): + tokens = ga_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list + From 187be6d372c8ef86c77483a6558f7592c3d0a2dc Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 09:33:17 +0100 Subject: [PATCH 14/90] copy/paste error --- spacy/tests/lang/ga/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index fe5cb0b2f..5b45dddc1 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -SV_TOKEN_EXCEPTION_TESTS = [ +GA_TOKEN_EXCEPTION_TESTS = [ ('B\'fhearr fanacht as amharc', ['B\'', 'fhearr', 'fanacht', 'as', 'amharc']), ('Daoine a bhfuil Gaeilge acu, m.sh. 
tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) ] From 9dfd30196289536bf0bbc029d1b0d36c0adbc190 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 10:14:18 +0100 Subject: [PATCH 15/90] rearrange --- spacy/lang/ga/__init__.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 7b72a8a91..38b73468f 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -10,15 +10,16 @@ from ...attrs import LANG from ...util import update_exc +class IrishDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ga' + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = set(STOP_WORDS) + class Irish(Language): lang = 'ga' - - class Defaults(Language.Defaults): - lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: 'ga' - - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + Defaults = IrishDefaults __all__ = ['Irish'] From b1b6123867209d18cfd5ab958731aac997f4f0d6 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 10:31:41 +0100 Subject: [PATCH 16/90] add ga_tokenizer --- spacy/tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index f5d65803a..1e9838d41 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -99,6 +99,10 @@ def sv_tokenizer(): def bn_tokenizer(): return util.get_lang_class('bn').Defaults.create_tokenizer() +@pytest.fixture +def ga_tokenizer(): + return util.get_lang_class('ga').Defaults.create_tokenizer() + @pytest.fixture def he_tokenizer(): From 7de709483bd9df2890672f2d17d8277d684d07d2 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 11 Sep 2017 10:51:21 +0100 Subject: [PATCH 17/90] 
missed adding here --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 1e9838d41..4da1ae301 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -12,7 +12,7 @@ from .. import util _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', - 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] + 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'ga', 'xx'] _models = {'en': ['en_core_web_sm'], 'de': ['de_core_news_md'], 'fr': ['fr_depvec_web_lg'], From 8db3da3c3dbe70687ba39030b2fa513cb74d8749 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 30 Oct 2017 14:06:25 +0100 Subject: [PATCH 18/90] Refactor JS, split into modules and add nomodule option rollup.js will be compiled by the rollup package and Babel on build, and will be loaded if a browser doesn't yet support JS modules --- website/_harp.json | 4 +- website/_includes/_scripts.jade | 81 +++-- website/assets/js/changelog.js | 72 ++++ website/assets/js/github-embed.js | 36 ++ website/assets/js/main.js | 323 ------------------ website/assets/js/models.js | 160 +++++++++ website/assets/js/nav-highlighter.js | 33 ++ website/assets/js/progress.js | 52 +++ website/assets/js/rollup.js | 23 ++ website/assets/js/util.js | 56 +++ website/assets/js/{ => vendor}/chart.min.js | 0 website/assets/js/{ => vendor}/in-view.min.js | 0 website/assets/js/{ => vendor}/prism.min.js | 0 .../assets/js/{ => vendor}/quickstart.min.js | 0 14 files changed, 493 insertions(+), 347 deletions(-) create mode 100644 website/assets/js/changelog.js create mode 100644 website/assets/js/github-embed.js delete mode 100644 website/assets/js/main.js create mode 100644 website/assets/js/models.js create mode 100644 website/assets/js/nav-highlighter.js create mode 100644 website/assets/js/progress.js create mode 100644 website/assets/js/rollup.js create mode 100644 website/assets/js/util.js rename website/assets/js/{ => vendor}/chart.min.js (100%) rename 
website/assets/js/{ => vendor}/in-view.min.js (100%) rename website/assets/js/{ => vendor}/prism.min.js (100%) rename website/assets/js/{ => vendor}/quickstart.min.js (100%) diff --git a/website/_harp.json b/website/_harp.json index 7c69beef0..bc1a0b5e5 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -84,8 +84,8 @@ ], "ALPHA": true, - "V_CSS": "2.0a1", - "V_JS": "2.0a0", + "V_CSS": "2.0a2", + "V_JS": "2.0a1", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", "MAILCHIMP": { diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade index 5ecdd0711..e1d9f773a 100644 --- a/website/_includes/_scripts.jade +++ b/website/_includes/_scripts.jade @@ -1,43 +1,80 @@ //- 💫 INCLUDES > SCRIPTS if quickstart - script(src="/assets/js/quickstart.min.js") + script(src="/assets/js/vendor/quickstart.min.js") if IS_PAGE - script(src="/assets/js/in-view.min.js") + script(src="/assets/js/vendor/in-view.min.js") if environment == "deploy" script(async src="https://www.google-analytics.com/analytics.js") -script(src="/assets/js/prism.min.js") -script(src="/assets/js/main.js?v#{V_JS}") +script(src="/assets/js/vendor/prism.min.js") + +if SECTION == "models" + script(src="/assets/js/vendor/chart.min.js") + script(src="/assets/js/models.js?v#{V_JS}" type="module") script - | new ProgressBar('.js-progress'); - - if changelog - | new Changelog('!{SOCIAL.github}', 'spacy'); - if quickstart | new Quickstart("#qs"); - if IS_PAGE - | new SectionHighlighter('data-section', 'data-nav'); - | new GitHubEmbed('!{SOCIAL.github}', 'data-gh-embed'); - | ((window.gitter = {}).chat = {}).options = { - | useStyles: false, - | activationElement: '.js-gitter-button', - | targetElement: '.js-gitter', - | room: '!{SOCIAL.gitter}' - | }; - - if HAS_MODELS - | new ModelLoader('!{MODELS_REPO}', !{JSON.stringify(CURRENT_MODELS)}, !{JSON.stringify(MODEL_LICENSES)}, !{JSON.stringify(MODEL_BENCHMARKS)}); - if environment == "deploy" | window.ga=window.ga||function(){ | 
(ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); + if IS_PAGE + script + | ((window.gitter = {}).chat = {}).options = { + | useStyles: false, + | activationElement: '.js-gitter-button', + | targetElement: '.js-gitter', + | room: '!{SOCIAL.gitter}' + | }; script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) + + +//- JS modules – slightly hacky, but necessary to dynamically instantiate the + classes with data from the Harp JSON files, while still being able to + support older browsers that can't handle JS modules. More details: + https://medium.com/dev-channel/es6-modules-in-chrome-canary-m60-ba588dfb8ab7 + +- ProgressBar = "new ProgressBar('.js-progress');" +- Changelog = "new Changelog('" + SOCIAL.github + "', 'spacy');" +- NavHighlighter = "new NavHighlighter('data-section', 'data-nav');" +- GitHubEmbed = "new GitHubEmbed('" + SOCIAL.github + "', 'data-gh-embed');" +- ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");" + +//- Browsers with JS module support. + Will be ignored otherwise. + +script(type="module") + | import ProgressBar from '/assets/js/progress.js'; + !=ProgressBar + if changelog + | import Changelog from '/assets/js/changelog.js'; + !=Changelog + if IS_PAGE + | import NavHighlighter from '/assets/js/nav-highlighter.js'; + !=NavHighlighter + | import GitHubEmbed from '/assets/js/github-embed.js'; + !=GitHubEmbed + if HAS_MODELS + | import { ModelLoader } from '/assets/js/models.js'; + !=ModelLoader + +//- Browsers with no JS module support. + Won't be fetched or interpreted otherwise. 
+ +script(nomodule src="/assets/js/rollup.js") +script(nomodule) + !=ProgressBar + if changelog + !=Changelog + if IS_PAGE + !=NavHighlighter + !=GitHubEmbed + if HAS_MODELS + !=ModeLoader diff --git a/website/assets/js/changelog.js b/website/assets/js/changelog.js new file mode 100644 index 000000000..94f2149ad --- /dev/null +++ b/website/assets/js/changelog.js @@ -0,0 +1,72 @@ +'use strict'; + +import { Templater, handleResponse } from './util.js'; + +export default class Changelog { + /** + * Fetch and render changelog from GitHub. Clones a template node (table row) + * to avoid doubling templating markup in JavaScript. + * @param {string} user - GitHub username. + * @param {string} repo - Repository to fetch releases from. + */ + constructor(user, repo) { + this.url = `https://api.github.com/repos/${user}/${repo}/releases`; + this.template = new Templater('changelog'); + this.fetchChangelog() + .then(json => this.render(json)) + .catch(this.showError.bind(this)); + // make sure scroll positions for progress bar etc. are recalculated + window.dispatchEvent(new Event('resize')); + } + + fetchChangelog() { + return new Promise((resolve, reject) => + fetch(this.url) + .then(res => handleResponse(res)) + .then(json => json.ok ? resolve(json) : reject())) + } + + showError() { + this.template.get('error').style.display = 'block'; + } + + /** + * Get template section from template row. Hacky, but does make sense. + * @param {node} item - Parent element. + * @param {string} id - ID of child element, set via data-changelog. 
+ */ + getField(item, id) { + return item.querySelector(`[data-changelog="${id}"]`); + } + + render(json) { + this.template.get('table').style.display = 'block'; + this.row = this.template.get('item'); + this.releases = this.template.get('releases'); + this.prereleases = this.template.get('prereleases'); + Object.values(json) + .filter(release => release.name) + .forEach(release => this.renderRelease(release)); + this.row.remove(); + } + + /** + * Clone the template row and populate with content from API response. + * https://developer.github.com/v3/repos/releases/#list-releases-for-a-repository + * @param {string} name - Release title. + * @param {string} tag (tag_name) - Release tag. + * @param {string} url (html_url) - URL to the release page on GitHub. + * @param {string} date (published_at) - Timestamp of release publication. + * @param {boolean} prerelease - Whether the release is a prerelease. + */ + renderRelease({ name, tag_name: tag, html_url: url, published_at: date, prerelease }) { + const container = prerelease ? this.prereleases : this.releases; + const tagLink = `${tag}`; + const title = (name.split(': ').length == 2) ? name.split(': ')[1] : name; + const row = this.row.cloneNode(true); + this.getField(row, 'date').textContent = date.split('T')[0]; + this.getField(row, 'tag').innerHTML = tagLink; + this.getField(row, 'title').textContent = title; + container.appendChild(row); + } +} diff --git a/website/assets/js/github-embed.js b/website/assets/js/github-embed.js new file mode 100644 index 000000000..58e80ee1a --- /dev/null +++ b/website/assets/js/github-embed.js @@ -0,0 +1,36 @@ +'use strict'; + +import { $$ } from './util.js'; + +export default class GitHubEmbed { + /** + * Embed code from GitHub repositories, similar to Gist embeds. Fetches the + * raw text and places it inside element. + * Usage:
+     * @param {string} user - GitHub user or organization.
+     * @param {string} attr - Data attribute used to select containers. Attribute
+     *                        value should be path to file relative to user.
+     */
+    constructor(user, attr) {
+        this.url = `https://raw.githubusercontent.com/${user}`;
+        this.attr = attr;
+        this.error = `\nCan't fetch code example from GitHub :(\n\nPlease use the link below to view the example. If you've come across\na broken link, we always appreciate a pull request to the repository,\nor a report on the issue tracker. Thanks!`;
+        [...$$(`[${this.attr}]`)].forEach(el => this.embed(el));
+    }
+
+    /**
+     * Fetch code from GitHub and insert it as element content. File path is
+     * read off the container's data attribute.
+     * @param {node} el - The element.
+     */
+    embed(el) {
+        el.parentElement.setAttribute('data-loading', '');
+        fetch(`${this.url}/${el.getAttribute(this.attr)}`)
+            .then(res => res.text().then(text => ({ text, ok: res.ok })))
+            .then(({ text, ok }) => {
+                el.textContent = ok ? text : this.error;
+                if (ok && window.Prism) Prism.highlightElement(el);
+            })
+        el.parentElement.removeAttribute('data-loading');
+    }
+}
diff --git a/website/assets/js/main.js b/website/assets/js/main.js
deleted file mode 100644
index d9465bb67..000000000
--- a/website/assets/js/main.js
+++ /dev/null
@@ -1,323 +0,0 @@
-//- 💫 MAIN JAVASCRIPT
-//- Note: Will be compiled using Babel before deployment.
-
-'use strict'
-
-const $ = document.querySelector.bind(document);
-const $$ = document.querySelectorAll.bind(document);
-
-
-class ProgressBar {
-    /**
-     * Animated reading progress bar.
-     * @param {String} selector – CSS selector of progress bar element.
-     */
-    constructor(selector) {
-        this.el = $(selector);
-        this.scrollY = 0;
-        this.sizes = this.updateSizes();
-        this.el.setAttribute('max', 100);
-        this.init();
-    }
-
-    init() {
-        window.addEventListener('scroll', () => {
-            this.scrollY = (window.pageYOffset || document.scrollTop) - (document.clientTop || 0);
-            requestAnimationFrame(this.update.bind(this));
-        }, false);
-        window.addEventListener('resize', () => {
-            this.sizes = this.updateSizes();
-            requestAnimationFrame(this.update.bind(this));
-        })
-    }
-
-    update() {
-        const offset = 100 - ((this.sizes.height - this.scrollY - this.sizes.vh) / this.sizes.height * 100);
-        this.el.setAttribute('value', (this.scrollY == 0) ? 0 : offset || 0);
-    }
-
-    updateSizes() {
-        const body = document.body;
-        const html = document.documentElement;
-        return {
-            height: Math.max(body.scrollHeight, body.offsetHeight, html.clientHeight, html.scrollHeight, html.offsetHeight),
-            vh: Math.max(html.clientHeight, window.innerHeight || 0)
-        }
-    }
-}
-
-
-class SectionHighlighter {
-    /**
-     * Hightlight section in viewport in sidebar, using in-view library.
-     * @param {String} sectionAttr - Data attribute of sections.
-     * @param {String} navAttr - Data attribute of navigation items.
-     * @param {String} activeClass – Class name of active element.
-     */
-    constructor(sectionAttr, navAttr, activeClass = 'is-active') {
-        this.sections = [...$$(`[${navAttr}]`)];
-        this.navAttr = navAttr;
-        this.sectionAttr = sectionAttr;
-        this.activeClass = activeClass;
-        inView(`[${sectionAttr}]`).on('enter', this.highlightSection.bind(this));
-    }
-
-    highlightSection(section) {
-        const id = section.getAttribute(this.sectionAttr);
-        const el = $(`[${this.navAttr}="${id}"]`);
-        if (el) {
-            this.sections.forEach(el => el.classList.remove(this.activeClass));
-            el.classList.add(this.activeClass);
-        }
-    }
-}
-
-
-class Templater {
-    /**
-     * Mini templating engine based on data attributes. Selects elements based
-     * on a data-tpl and data-tpl-key attribute and can set textContent
-     * and innterHtml.
-     *
-     * @param {String} templateId - Template section, e.g. value of data-tpl.
-     */
-    constructor(templateId) {
-        this.templateId = templateId;
-    }
-
-    get(key) {
-        return $(`[data-tpl="${this.templateId}"][data-tpl-key="${key}"]`);
-    }
-
-    fill(key, value, html = false) {
-        const el = this.get(key);
-        if (html) el.innerHTML = value || '';
-        else el.textContent = value || '';
-        return el;
-    }
-}
-
-
-class ModelLoader {
-    /**
-     * Load model meta from GitHub and update model details on site. Uses the
-     * Templater mini template engine to update DOM.
-     *
-     * @param {String} repo - Path tp GitHub repository containing releases.
-     * @param {Array} models - List of model IDs, e.g. "en_core_web_sm".
-     * @param {Object} licenses - License IDs mapped to URLs.
-     * @param {Object} accKeys - Available accuracy keys mapped to display labels.
-     */
-    constructor(repo, models = [], licenses = {}, benchmarkKeys = {}) {
-        this.url = `https://raw.githubusercontent.com/${repo}/master`;
-        this.repo = `https://github.com/${repo}`;
-        this.modelIds = models;
-        this.licenses = licenses;
-        this.benchKeys = benchmarkKeys;
-        this.init();
-    }
-
-    init() {
-        this.modelIds.forEach(modelId =>
-            new Templater(modelId).get('table').setAttribute('data-loading', ''));
-        fetch(`${this.url}/compatibility.json`)
-            .then(res => this.handleResponse(res))
-            .then(json => json.ok ? this.getModels(json['spacy']) : this.modelIds.forEach(modelId => this.showError(modelId)))
-    }
-
-    handleResponse(res) {
-        if (res.ok) return res.json().then(json => Object.assign({}, json, { ok: res.ok }))
-        else return ({ ok: res.ok })
-    }
-
-    convertNumber(num, separator = ',') {
-        return num.toString().replace(/\B(?=(\d{3})+(?!\d))/g, separator);
-    }
-
-    getModels(compat) {
-        this.compat = compat;
-        for (let modelId of this.modelIds) {
-            const version = this.getLatestVersion(modelId, compat);
-            if (!version) {
-                this.showError(modelId); return;
-            }
-            fetch(`${this.url}/meta/${modelId}-${version}.json`)
-                .then(res => this.handleResponse(res))
-                .then(json => json.ok ? this.render(json) : this.showError(modelId))
-        }
-        // make sure scroll positions for progress bar etc. are recalculated
-        window.dispatchEvent(new Event('resize'));
-    }
-
-    showError(modelId) {
-        const template = new Templater(modelId);
-        template.get('table').removeAttribute('data-loading');
-        template.get('error').style.display = 'block';
-        for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
-            template.get(key).parentElement.parentElement.style.display = 'none';
-        }
-    }
-
-    /**
-     * Update model details in tables. Currently quite hacky :(
-     */
-    render({ lang, name, version, sources, pipeline, vectors, url, author, license, accuracy, speed, size, description, notes }) {
-        const modelId = `${lang}_${name}`;
-        const model = `${modelId}-${version}`;
-        const template = new Templater(modelId);
-
-        const getSources = s => (s instanceof Array) ? s.join(', ') : s;
-        const getPipeline = p => p.map(comp => `${comp}`).join(', ');
-        const getVectors = v => `${this.convertNumber(v.entries)} (${v.width} dimensions)`;
-        const getLink = (t, l) => `${t}`;
-
-        const keys = { version, size, description, notes }
-        Object.keys(keys).forEach(key => template.fill(key, keys[key]));
-
-        if (sources) template.fill('sources', getSources(sources));
-        if (pipeline && pipeline.length) template.fill('pipeline', getPipeline(pipeline), true);
-        else template.get('pipeline').parentElement.parentElement.style.display = 'none';
-        if (vectors) template.fill('vectors', getVectors(vectors));
-        else template.get('vectors').parentElement.parentElement.style.display = 'none';
-
-        if (author) template.fill('author', url ? getLink(author, url) : author, true);
-        if (license) template.fill('license', this.licenses[license] ? getLink(license, this.licenses[license]) : license, true);
-
-        template.get('download').setAttribute('href', `${this.repo}/releases/tag/${model}`);
-
-        this.renderBenchmarks(template, accuracy, speed);
-        this.renderCompat(template, modelId);
-        template.get('table').removeAttribute('data-loading');
-    }
-
-    renderBenchmarks(template, accuracy = {}, speed = {}) {
-        if (!accuracy && !speed) return;
-        template.get('benchmarks').style.display = 'block';
-        this.renderTable(template, 'parser', accuracy, val => val.toFixed(2));
-        this.renderTable(template, 'ner', accuracy, val => val.toFixed(2));
-        this.renderTable(template, 'speed', speed, Math.round);
-    }
-
-    renderTable(template, id, benchmarks, convertVal = val => val) {
-        if (!this.benchKeys[id] || !Object.keys(this.benchKeys[id]).some(key => benchmarks[key])) return;
-        const keys = Object.keys(this.benchKeys[id]).map(k => benchmarks[k] ? k : false).filter(k => k);
-        template.get(id).style.display = 'block';
-        for (let key of keys) {
-            template
-                .fill(key, this.convertNumber(convertVal(benchmarks[key])))
-                .parentElement.style.display = 'table-row';
-        }
-    }
-
-    renderCompat(template, modelId) {
-        template.get('compat-wrapper').style.display = 'table-row';
-        const options = Object.keys(this.compat).map(v => ``).join('');
-        template
-            .fill('compat', '' + options, true)
-            .addEventListener('change', ev => {
-                const result = this.compat[ev.target.value][modelId];
-                if (result) template.fill('compat-versions', `${modelId}-${result[0]}`, true);
-                else template.fill('compat-versions', '');
-            });
-    }
-
-    getLatestVersion(model, compat = {}) {
-        for (let spacy_v of Object.keys(compat)) {
-            const models = compat[spacy_v];
-            if (models[model]) return models[model][0];
-        }
-    }
-}
-
-
-class Changelog {
-    /**
-     * Fetch and render changelog from GitHub. Clones a template node (table row)
-     * to avoid doubling templating markup in JavaScript.
-     *
-     * @param {String} user - GitHub username.
-     * @param {String} repo - Repository to fetch releases from.
-     */
-    constructor(user, repo) {
-        this.url = `https://api.github.com/repos/${user}/${repo}/releases`;
-        this.template = new Templater('changelog');
-        fetch(this.url)
-            .then(res => this.handleResponse(res))
-            .then(json => json.ok ? this.render(json) : false)
-    }
-
-    /**
-     * Get template section from template row. Slightly hacky, but does make sense.
-     */
-    $(item, id) {
-        return item.querySelector(`[data-changelog="${id}"]`);
-    }
-
-    handleResponse(res) {
-        if (res.ok) return res.json().then(json => Object.assign({}, json, { ok: res.ok }))
-        else return ({ ok: res.ok })
-    }
-
-    render(json) {
-        this.template.get('error').style.display = 'none';
-        this.template.get('table').style.display = 'block';
-        this.row = this.template.get('item');
-        this.releases = this.template.get('releases');
-        this.prereleases = this.template.get('prereleases');
-        Object.values(json)
-            .filter(release => release.name)
-            .forEach(release => this.renderRelease(release));
-        this.row.remove();
-        // make sure scroll positions for progress bar etc. are recalculated
-        window.dispatchEvent(new Event('resize'));
-    }
-
-    /**
-     * Clone the template row and populate with content from API response.
-     * https://developer.github.com/v3/repos/releases/#list-releases-for-a-repository
-     *
-     * @param {String} name - Release title.
-     * @param {String} tag (tag_name) - Release tag.
-     * @param {String} url (html_url) - URL to the release page on GitHub.
-     * @param {String} date (published_at) - Timestamp of release publication.
-     * @param {Boolean} pre (prerelease) - Whether the release is a prerelease.
-     */
-    renderRelease({ name, tag_name: tag, html_url: url, published_at: date, prerelease: pre }) {
-        const container = pre ? this.prereleases : this.releases;
-        const row = this.row.cloneNode(true);
-        this.$(row, 'date').textContent = date.split('T')[0];
-        this.$(row, 'tag').innerHTML = `${tag}`;
-        this.$(row, 'title').textContent = (name.split(': ').length == 2) ? name.split(': ')[1] : name;
-        container.appendChild(row);
-    }
-}
-
-
-class GitHubEmbed {
-    /**
-     * Embed code from GitHub repositories, similar to Gist embeds. Fetches the
-     * raw text and places it inside element.
-     * Usage: 
-     *
-     * @param {String} user - GitHub user or organization.
-     * @param {String} attr - Data attribute used to select containers. Attribute
-     *                        value should be path to file relative to user.
-     */
-    constructor(user, attr) {
-        this.url = `https://raw.githubusercontent.com/${user}`;
-        this.attr = attr;
-        this.error = `\nCan't fetch code example from GitHub :(\n\nPlease use the link below to view the example. If you've come across\na broken link, we always appreciate a pull request to the repository,\nor a report on the issue tracker. Thanks!`;
-        [...$$(`[${this.attr}]`)].forEach(el => this.embed(el));
-    }
-
-    embed(el) {
-        el.parentElement.setAttribute('data-loading', '');
-        fetch(`${this.url}/${el.getAttribute(this.attr)}`)
-            .then(res => res.text().then(text => ({ text, ok: res.ok })))
-            .then(({ text, ok }) => {
-                el.textContent = ok ? text : this.error;
-                if (ok && window.Prism) Prism.highlightElement(el);
-            })
-        el.parentElement.removeAttribute('data-loading');
-    }
-}
diff --git a/website/assets/js/models.js b/website/assets/js/models.js
new file mode 100644
index 000000000..5fe7ff54a
--- /dev/null
+++ b/website/assets/js/models.js
@@ -0,0 +1,160 @@
+'use strict';
+
+import { Templater, handleResponse, convertNumber } from './util.js';
+
+/**
+ * Chart.js global defaults (legend at the bottom, system font stack) and the two chart colours.
+ */
+Chart.defaults.global.legend.position = 'bottom';
+Chart.defaults.global.defaultFontFamily = "-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'";
+const CHART_COLORS = { model1: '#09a3d5', model2: '#066B8C' };
+
+/**
+ * Formatters for model details. `author` and `license` return an HTML link when a URL is given.
+ * @property {function} author - Format model author, linked to `url` if given.
+ * @property {function} license - Format model license, linked to `url` if given.
+ * @property {function} sources - Format training data sources (list or string).
+ * @property {function} pipeline - Format list of pipeline components ('-' if empty).
+ * @property {function} vectors - Format vector data (entries and dimensions).
+ * @property {function} version - Format model version number.
+ */
+export const formats = {
+    author: (author, url) => url ? `<a href="${url}" target="_blank" rel="noopener">${author}</a>` : author,
+    license: (license, url) => url ? `<a href="${url}" target="_blank" rel="noopener">${license}</a>` : license,
+    sources: sources => Array.isArray(sources) ? sources.join(', ') : sources,
+    pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `${p}`).join(', ') : '-',
+    vectors: vec => vec ? `${convertNumber(vec.entries)} (${vec.width} dimensions)` : 'n/a',
+    version: version => `v${version}`
+};
+
+/**
+ * Find the latest version of a model in a compatibility table.
+ * @param {string} model - The model name.
+ * @param {Object} compat - Compatibility table, keyed by spaCy version.
+ * @returns {string|undefined} First listed version of the first entry containing the model.
+ */
+export const getLatestVersion = (model, compat = {}) => {
+    const match = Object.values(compat).find(models => models[model]);
+    return match ? match[model][0] : undefined;
+};
+
+export class ModelLoader {
+    /**
+     * Load model meta from GitHub and update model details on site. Uses the
+     * Templater mini template engine to update DOM.
+     * @param {string} repo - Path to GitHub repository containing releases.
+     * @param {Array} models - List of model IDs, e.g. "en_core_web_sm".
+     * @param {Object} licenses - License IDs mapped to URLs.
+     * @param {Object} benchmarkKeys - Objects of available keys by type, e.g.
+     *                                 'parser', 'ner', 'speed', mapped to labels.
+     */
+    constructor(repo, models = [], licenses = {}, benchmarkKeys = {}) {
+        Object.assign(this, {
+            url: `https://raw.githubusercontent.com/${repo}/master`,
+            repo: `https://github.com/${repo}`,
+            modelIds: models, licenses, benchKeys: benchmarkKeys
+        });
+        this.init();
+    }
+
+    init() {
+        // Flag every model table as loading until its meta data arrives.
+        this.modelIds.forEach(modelId => {
+            new Templater(modelId).get('table').setAttribute('data-loading', '');
+        });
+        const failAll = () => this.modelIds.forEach(modelId => this.showError(modelId));
+        this.fetch(`${this.url}/compatibility.json`).then(json => this.getModels(json.spacy)).catch(failAll);
+        window.dispatchEvent(new Event('resize'));  // make progress bar etc. recalculate scroll positions
+    }
+
+    fetch(url) {
+        // Return the chain itself so network/parse failures reject too instead of never settling.
+        return fetch(url).then(res => handleResponse(res))
+            .then(json => json.ok ? json : Promise.reject(new Error(`Can't load ${url}`)));
+    }
+
+    getModels(compat) {
+        this.compat = compat;  // kept for renderCompat() and getCompat()
+        for (const modelId of this.modelIds) {
+            const version = getLatestVersion(modelId, compat);
+            if (!version) { this.showError(modelId); continue; }  // model not in the table
+            this.fetch(`${this.url}/meta/${modelId}-${version}.json`)
+                .then(json => this.render(json))
+                .catch(() => this.showError(modelId));
+        }
+    }
+
+    showError(modelId) {
+        // End the loading state, reveal the error element, hide each field's grandparent.
+        const tpl = new Templater(modelId);
+        tpl.get('table').removeAttribute('data-loading');
+        tpl.get('error').style.display = 'block';
+        ['sources', 'pipeline', 'vectors', 'author', 'license']
+            .forEach(key => { tpl.get(key).parentElement.parentElement.style.display = 'none'; });
+    }
+
+    /**
+     * Render details, benchmarks and compatibility for one model's meta `data`, set its download link and clear the loading state.
+     */
+    render(data) {
+        const { lang, name, version, accuracy, speed } = data;
+        const modelId = `${lang}_${name}`;
+        const tpl = new Templater(modelId);
+        this.renderDetails(tpl, data);
+        this.renderBenchmarks(tpl, accuracy, speed);
+        this.renderCompat(tpl, modelId);
+        tpl.get('download').setAttribute('href', `${this.repo}/releases/tag/${modelId}-${version}`);
+        tpl.get('table').removeAttribute('data-loading');
+    }
+
+    renderDetails(tpl, { version, size, description, notes, author, url,
+        license, sources, vectors, pipeline }) {
+        // Hide the grandparent container of a field that has nothing to show.
+        const hide = key => { tpl.get(key).parentElement.parentElement.style.display = 'none'; };
+        Object.entries({ version, size, description, notes })
+            .forEach(([key, value]) => { if (value) tpl.fill(key, value); });
+        if (author) tpl.fill('author', formats.author(author, url), true);
+        if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
+        if (sources) tpl.fill('sources', formats.sources(sources));
+        if (vectors) tpl.fill('vectors', formats.vectors(vectors));
+        else hide('vectors');
+        if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
+        else hide('pipeline');
+    }
+
+    renderBenchmarks(tpl, accuracy = {}, speed = {}) {
+        const acc = accuracy || {}, spd = speed || {};  // defaults only cover undefined, not null
+        if (!Object.keys(acc).length && !Object.keys(spd).length) return;  // nothing to show
+        ['parser', 'ner'].forEach(id => this.renderTable(tpl, id, acc, val => val.toFixed(2)));
+        this.renderTable(tpl, 'speed', spd, Math.round);
+        tpl.get('benchmarks').style.display = 'block';
+    }
+
+    renderTable(tpl, id, benchmarks, converter = val => val) {
+        // Only keys with a truthy benchmark value get a row; without any, nothing is changed.
+        const keys = Object.keys(this.benchKeys[id] || {}).filter(key => benchmarks[key]);
+        if (!keys.length) return;
+        keys.forEach(key => {
+            tpl.fill(key, convertNumber(converter(benchmarks[key]))).parentElement.style.display = 'table-row';
+        });
+        tpl.get(id).style.display = 'block';
+    }
+
+    renderCompat(tpl, modelId) {
+        tpl.get('compat-wrapper').style.display = 'table-row';
+        // Placeholder entry plus one <option> per spaCy version; the option value
+        // must be the compat key because the change handler passes it to getCompat().
+        const header = '<option selected disabled>spaCy version</option>';
+        const options = Object.keys(this.compat).map(v => `<option value="${v}">v${v}</option>`).join('');
+        tpl
+            .fill('compat', header + options, true)
+            .addEventListener('change', ({ target: { value }}) =>
+                tpl.fill('compat-versions', this.getCompat(value, modelId), true));
+    }
+
+    getCompat(version, model) {
+        const res = (this.compat[version] || {})[model];  // unknown version -> 'not compatible', no TypeError
+        return (res && res.length) ? `${model}-${res[0]}` : 'not compatible';
+    }
+}
+
diff --git a/website/assets/js/nav-highlighter.js b/website/assets/js/nav-highlighter.js
new file mode 100644
index 000000000..40f708e5e
--- /dev/null
+++ b/website/assets/js/nav-highlighter.js
@@ -0,0 +1,33 @@
+'use strict';
+
+import { $, $$ } from './util.js';
+
+export default class NavHighlighter {
+    /**
+     * Highlight section in viewport in sidebar, using in-view library.
+     * @param {string} sectionAttr - Data attribute of sections.
+     * @param {string} navAttr - Data attribute of navigation items.
+     * @param {string} activeClass - Class name of active element.
+     */
+    constructor(sectionAttr, navAttr, activeClass = 'is-active') {
+        Object.assign(this, { sectionAttr, navAttr, activeClass });
+        // Despite the name, these are the elements carrying `navAttr`.
+        this.sections = Array.from($$(`[${navAttr}]`));
+        // in-view is optional: without it no highlighting takes place.
+        if (!window.inView) return;
+        inView(`[${sectionAttr}]`).on('enter', section => this.highlightSection(section));
+    }
+
+    /**
+     * Check if section in view exists in sidebar and mark as active.
+     * @param {node} section - The section in view.
+     */
+    highlightSection(section) {
+        const id = section.getAttribute(this.sectionAttr);
+        const active = $(`[${this.navAttr}="${id}"]`);
+        // Sections without a matching navigation item leave the current highlight untouched.
+        if (!active) return;
+        for (const item of this.sections) item.classList.remove(this.activeClass);
+        active.classList.add(this.activeClass);
+    }
+}
diff --git a/website/assets/js/progress.js b/website/assets/js/progress.js
new file mode 100644
index 000000000..1497547d8
--- /dev/null
+++ b/website/assets/js/progress.js
@@ -0,0 +1,52 @@
+'use strict';
+
+import { $ } from './util.js';
+
+export default class ProgressBar {
+    /**
+     * Animated reading progress bar.
+     * @param {string} selector - CSS selector of progress bar element.
+     */
+    constructor(selector) {
+        this.scrollY = 0;
+        this.sizes = this.updateSizes();
+        this.el = $(selector);
+        this.el.setAttribute('max', 100);
+        window.addEventListener('scroll', this.onScroll.bind(this));
+        window.addEventListener('resize', this.onResize.bind(this));
+    }
+
+    /** Store the current vertical scroll offset and schedule a redraw. */
+    onScroll(ev) {
+        // scrollTop/clientTop exist on the root element only; on `document` they are undefined (NaN result).
+        const root = document.documentElement;
+        this.scrollY = (window.pageYOffset || root.scrollTop) - (root.clientTop || 0);
+        requestAnimationFrame(this.update.bind(this));
+    }
+
+    /** Re-measure page and viewport height, then schedule a redraw. */
+    onResize(ev) {
+        this.sizes = this.updateSizes();
+        requestAnimationFrame(this.update.bind(this));
+    }
+
+    /** Write the progress percentage to the element's `value` attribute. */
+    update() {
+        // Equals (scrollY + vh) / height * 100; a NaN result (e.g. height 0) falls back to 0.
+        const offset = 100 - ((this.sizes.height - this.scrollY - this.sizes.vh) / this.sizes.height * 100);
+        this.el.setAttribute('value', (this.scrollY == 0) ? 0 : offset || 0);
+    }
+
+    /**
+     * Update scroll and viewport height. Called on load and window resize.
+     * @returns {{height: number, vh: number}} Page height and viewport height.
+     */
+    updateSizes() {
+        const { body, documentElement: root } = document;
+        return {
+            height: Math.max(body.scrollHeight, body.offsetHeight,
+                root.clientHeight, root.scrollHeight, root.offsetHeight),
+            vh: Math.max(root.clientHeight, window.innerHeight || 0)
+        };
+    }
+}
diff --git a/website/assets/js/rollup.js b/website/assets/js/rollup.js
new file mode 100644
index 000000000..00ff92fa9
--- /dev/null
+++ b/website/assets/js/rollup.js
@@ -0,0 +1,23 @@
+/**
+ * This file is bundled by Rollup, compiled with Babel and included as
+ *