diff --git a/.github/contributors/jimregan.md b/.github/contributors/jimregan.md new file mode 100644 index 000000000..dd8fe3d64 --- /dev/null +++ b/.github/contributors/jimregan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jim O'Regan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-06-24 | +| GitHub username | jimregan | +| Website (optional) | | diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py new file mode 100644 index 000000000..38b73468f --- /dev/null +++ b/spacy/lang/ga/__init__.py @@ -0,0 +1,25 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG +from ...util import update_exc + + +class IrishDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ga' + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = set(STOP_WORDS) + +class Irish(Language): + lang = 'ga' + Defaults = IrishDefaults + + +__all__ = ['Irish'] diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py new file mode 100644 index 000000000..383e24efc --- /dev/null +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -0,0 +1,33 @@ +# coding: utf8 +from __future__ import unicode_literals + +class IrishMorph: + consonants = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z'] + broad_vowels = ['a', 'á', 'o', 'ó', 'u', 'ú'] + slender_vowels = ['e', 'é', 'i', 'í'] + vowels = broad_vowels + slender_vowels + + def ends_dentals(word): + if word != "" and word[-1] in ['d', 'n', 't', 's']: + return True + else: + return False + + def devoice(word): + if len(word) > 2 and word[-2] == 's' and word[-1] == 'd': + return word[:-1] + 't' + else: + return word + + def ends_with_vowel(word): + return word != "" and word[-1] in vowels + + def starts_with_vowel(word): + return word != "" and word[0] in vowels + + def deduplicate(word): + if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants: + return word[:-1] + else: + return word + diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py new file mode 100644 index 000000000..816c00b13 --- /dev/null +++ b/spacy/lang/ga/stop_words.py @@ -0,0 +1,45 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +a ach ag agus an aon ar arna as + +ba beirt bhúr + +caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear + +daichead dar de deich deichniúr den dhá do don dtí dá dár dó + +faoi faoin faoina faoinár fara fiche + +gach gan go gur + +haon hocht + +i iad idir in ina ins inár is + +le leis lena lenár + +mar mo muid mé + +na nach naoi naonúr ná ní níor nó nócha + +ocht ochtar ochtó os + +roimh + +sa seacht seachtar seachtó seasca seisear siad sibh sinn sna sé sí + +tar thar thú triúr trí trína trínár tríocha tú + +um + +ár + +é éis + +í + +ó ón óna ónár +""".split()) diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py new file mode 100644 index 000000000..22a6bacd0 --- /dev/null +++ b/spacy/lang/ga/tag_map.py @@ -0,0 +1,368 @@ +# coding: utf8 +from __future__ import unicode_literals + + +TAG_MAP = { + "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=Gen|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "fem", "Number": "sing"}, + "ADJ__Case=Gen|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing"}, + "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "strong"}}, + "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "weak"}}, + "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, + "ADJ__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, + "ADJ__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, + "ADJ__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, + "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "notslender"}}, + "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "slender"}}, + "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Other": {"Form": "len"}}, + "ADJ__Degree=Cmp,Sup": {"pos": "ADJ", "Degree": "cmp|sup"}, + "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "ecl"}}, + "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "hpref"}}, + "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "len"}}, + "ADJ__Degree=Pos": {"pos": "ADJ", "Degree": "pos"}, + "ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"}, + "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}}, + "ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, + "ADJ__Gender=Masc|Number=Sing|Case=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, + "ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, + "ADJ__Number=Plur|Case=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, + "ADJ__Number=Plur": {"pos": "ADJ", "Number": "plur"}, + "ADJ___": {"pos": "ADJ"}, + "ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"}, + "ADP__Foreign=Yes": {"pos": "ADP", "Foreign": "yes"}, + "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Other": {"Form": "len"}}, + "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, + "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, "Other": {"Form": "len"}}, + "ADP__Gender=Fem|Number=Sing|Person=3": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3}, + "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, + "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"}, + "ADP__Gender=Masc|Number=Sing|Person=3": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3}, + "ADP__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"}, + "ADP__Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"}, + "ADP__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"}, + "ADP__Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1}, + "ADP__Number=Plur|Person=1|Poss=Yes": {"pos": "ADP", "Number": "plur", "Person": 1, "Poss": "yes"}, + "ADP__Number=Plur|Person=1|PronType=Emp": {"pos": "ADP", "Number": "plur", "Person": 1, "PronType": "emp"}, + "ADP__Number=Plur|Person=2": {"pos": "ADP", "Number": "plur", "Person": 2}, + "ADP__Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3}, + "ADP__Number=Plur|Person=3|Poss=Yes": {"pos": "ADP", "Number": "plur", "Person": 3, "Poss": "yes"}, + "ADP__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Number": "plur", "Person": 3, "Poss": "yes", "PronType": "prs"}, + "ADP__Number=Plur|Person=3|PronType=Emp": {"pos": "ADP", "Number": "plur", "Person": 3, "PronType": "emp"}, + "ADP__Number=Plur|PronType=Art": {"pos": "ADP", "Number": "plur", "PronType": "art"}, + "ADP__Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1}, + "ADP__Number=Sing|Person=1|Poss=Yes": {"pos": "ADP", "Number": "sing", "Person": 1, "Poss": "yes"}, + "ADP__Number=Sing|Person=1|PronType=Emp": {"pos": "ADP", "Number": "sing", "Person": 1, "PronType": "emp"}, + "ADP__Number=Sing|Person=2": {"pos": "ADP", "Number": "sing", "Person": 2}, + "ADP__Number=Sing|Person=3": {"pos": "ADP", "Number": "sing", "Person": 3}, + "ADP__Number=Sing|PronType=Art": {"pos": "ADP", "Number": "sing", "PronType": "art"}, + "ADP__Person=3|Poss=Yes": {"pos": "ADP", "Person": 3, "Poss": "yes"}, + "ADP___": {"pos": "ADP"}, + "ADP__Poss=Yes": {"pos": "ADP", "Poss": "yes"}, + "ADP__PrepForm=Cmpd": {"pos": "ADP", "Other": {"PrepForm": "cmpd"}}, + "ADP__PronType=Art": {"pos": "ADP", "PronType": "art"}, + "ADV__Form=Len": {"pos": "ADV", "Other": {"Form": "len"}}, + "ADV___": {"pos": "ADV"}, + "ADV__PronType=Int": {"pos": "ADV", "PronType": "int"}, + "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Form": "vf", "VerbForm": "cop"}}, + "AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"VerbForm": "cop"}}, + "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, + "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"PartType": "comp", "VerbForm": "cop"}}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX___": {"pos": "AUX"}, + "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "Other": {"VerbForm": "cop"}}, + "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX__VerbForm=Cop": {"pos": "AUX", "Other": {"VerbForm": "cop"}}, + "CCONJ___": {"pos": "CCONJ"}, + "DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, + "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Other": {"Form": "ecl"}}, + "DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, + "DET__Definite=Def|Number=Plur|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "plur", "PronType": "art"}, + "DET__Definite=Def|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "sing", "PronType": "art"}, + "DET__Definite=Def": {"pos": "DET", "Definite": "def"}, + "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Other": {"Form": "hpref"}}, + "DET__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, + "DET__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"}, + "DET__Number=Plur|Person=1|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 1, "Poss": "yes"}, + "DET__Number=Plur|Person=3|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 3, "Poss": "yes"}, + "DET__Number=Sing|Person=1|Poss=Yes": {"pos": "DET", "Number": "sing", "Person": 1, "Poss": "yes"}, + "DET__Number=Sing|Person=2|Poss=Yes": {"pos": "DET", "Number": "sing", "Person": 2, "Poss": "yes"}, + "DET__Number=Sing|PronType=Int": {"pos": "DET", "Number": "sing", "PronType": "int"}, + "DET___": {"pos": "DET"}, + "DET__PronType=Dem": {"pos": "DET", "PronType": "dem"}, + "DET__PronType=Ind": {"pos": "DET", "PronType": "ind"}, + "NOUN__Case=Dat|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Definite": "ind", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Dat|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Gen|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "ind", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "weak"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, + "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "strong"}}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "weak"}}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Other": {"Form": "len"}}, + "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur"}, + "NOUN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Gen|Number=Sing": {"pos": "NOUN", "Case": "gen", "Number": "sing"}, + "NOUN__Case=Gen|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "plur"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Fem": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=NomAcc|Definite=Ind|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "ind", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "emp"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, + "NOUN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, + "NOUN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, + "NOUN__Case=Voc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Definite": "def", "Gender": "masc", "Number": "plur"}, + "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing"}, + "NOUN__Degree=Pos": {"pos": "NOUN", "Degree": "pos"}, + "NOUN__Foreign=Yes": {"pos": "NOUN", "Foreign": "yes"}, + "NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "ecl"}}, + "NOUN__Form=Ecl|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "ecl"}}, + "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Other": {"Form": "ecl"}}, + "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "hpref"}}, + "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "len"}}, + "NOUN__Gender=Fem|Number=Sing": {"pos": "NOUN", "Gender": "fem", "Number": "sing"}, + "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "Other": {"PartType": "comp"}}, + "NOUN__Number=Sing": {"pos": "NOUN", "Number": "sing"}, + "NOUN___": {"pos": "NOUN"}, + "NOUN__Reflex=Yes": {"pos": "NOUN", "Reflex": "yes"}, + "NOUN__VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf"}, + "NOUN__VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun"}, + "NUM__Definite=Def|NumType=Card": {"pos": "NUM", "Definite": "def", "NumType": "card"}, + "NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "ecl"}}, + "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "ecl"}}, + "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "hpref"}}, + "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "len"}}, + "NUM__Form=Len|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "len"}}, + "NUM__NumType=Card": {"pos": "NUM", "NumType": "card"}, + "NUM__NumType=Ord": {"pos": "NUM", "NumType": "ord"}, + "NUM___": {"pos": "NUM"}, + "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"Form": "ecl", "PartType": "vb"}}, + "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "Other": {"PartType": "vb"}}, + "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "Other": {"PartType": "vb"}}, + "PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"Mood": "int", "PartType": "vb"}}, + "PART__PartType=Ad": {"pos": "PART", "Other": {"PartType": "ad"}}, + "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "cmpl"}}, + "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "cmpl"}}, + "PART__PartType=Cmpl": {"pos": "PART", "Other": {"PartType": "cmpl"}}, + "PART__PartType=Comp": {"pos": "PART", "Other": {"PartType": "comp"}}, + "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "cop"}}, + "PART__PartType=Deg": {"pos": "PART", "Other": {"PartType": "deg"}}, + "PART__PartType=Inf": {"pos": "PART", "PartType": "inf"}, + "PART__PartType=Num": {"pos": "PART", "Other": {"PartType": "num"}}, + "PART__PartType=Pat": {"pos": "PART", "Other": {"PartType": "pat"}}, + "PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb": {"pos": "PART", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "Other": {"PartType": "vb"}}, + "PART__PartType=Voc": {"pos": "PART", "Other": {"PartType": "voc"}}, + "PART___": {"pos": "PART"}, + "PART__PronType=Rel": {"pos": "PART", "PronType": "rel"}, + "PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, + "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Other": {"Form": "len"}}, + "PRON__Gender=Fem|Number=Sing|Person=3": {"pos": "PRON", "Gender": "fem", "Number": "sing", "Person": 3}, + "PRON__Gender=Masc|Number=Sing|Person=3": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3}, + "PRON__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"}, + "PRON__Gender=Masc|Person=3": {"pos": "PRON", "Gender": "masc", "Person": 3}, + "PRON__Number=Plur|Person=1": {"pos": "PRON", "Number": "plur", "Person": 1}, + "PRON__Number=Plur|Person=1|PronType=Emp": {"pos": "PRON", "Number": "plur", "Person": 1, "PronType": "emp"}, + "PRON__Number=Plur|Person=2": {"pos": "PRON", "Number": "plur", "Person": 2}, + "PRON__Number=Plur|Person=3": {"pos": "PRON", "Number": "plur", "Person": 3}, + "PRON__Number=Plur|Person=3|PronType=Emp": {"pos": "PRON", "Number": "plur", "Person": 3, "PronType": "emp"}, + "PRON__Number=Sing|Person=1": {"pos": "PRON", "Number": "sing", "Person": 1}, + "PRON__Number=Sing|Person=1|PronType=Emp": {"pos": "PRON", "Number": "sing", "Person": 1, "PronType": "emp"}, + "PRON__Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2}, + "PRON__Number=Sing|Person=2|PronType=Emp": {"pos": "PRON", "Number": "sing", "Person": 2, "PronType": "emp"}, + "PRON__Number=Sing|Person=3": {"pos": "PRON", "Number": "sing", "Person": 3}, + "PRON__Number=Sing|PronType=Int": {"pos": "PRON", "Number": "sing", "PronType": "int"}, + "PRON__PronType=Dem": {"pos": "PRON", "PronType": "dem"}, + "PRON__PronType=Ind": {"pos": "PRON", "PronType": "ind"}, + "PRON__PronType=Int": {"pos": "PRON", "PronType": "int"}, + "PRON__Reflex=Yes": {"pos": "PRON", "Reflex": "yes"}, + "PROPN__Abbr=Yes": {"pos": "PROPN", "Other": {"Abbr": "yes"}}, + "PROPN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "dat", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, + "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}}, + "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, + "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=Gen|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem"}, + "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "PROPN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing"}, + "PROPN__Case=Gen|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc"}, + "PROPN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"}, + "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, + "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, + "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, + "PROPN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, + "PROPN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, + "PROPN__Case=NomAcc|Gender=Masc": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc"}, + "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Other": {"Form": "len"}}, + "PROPN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "voc", "Gender": "masc", "Number": "sing"}, + "PROPN__Gender=Masc|Number=Sing": {"pos": "PROPN", "Gender": "masc", "Number": "sing"}, + "PROPN___": {"pos": "PROPN"}, + "PUNCT___": {"pos": "PUNCT"}, + "SCONJ___": {"pos": "SCONJ"}, + "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", "Tense": "past", "Other": {"VerbForm": "cop"}}, + "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "Other": {"VerbForm": "cop"}}, + "SYM__Abbr=Yes": {"pos": "SYM", "Other": {"Abbr": "yes"}}, + "VERB__Case=NomAcc|Gender=Masc|Mood=Ind|Number=Sing|Tense=Pres": {"pos": "VERB", "Case": "nom|acc", "Gender": "masc", "Mood": "ind", "Number": "sing", "Tense": "pres"}, + "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, + "VERB__Foreign=Yes": {"pos": "VERB", "Foreign": "yes"}, + "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl", "Voice": "auto"}}, + "VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Other": {"Form": "ecl"}}, + "VERB__Form=Ecl": {"pos": "VERB", "Other": {"Form": "ecl"}}, + "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Other": {"Form": "emp"}}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}}, + "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}}, + "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, + "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Other": {"Form": "len"}}, + "VERB__Form=Len": {"pos": "VERB", "Other": {"Form": "len"}}, + "VERB__Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3}, + "VERB__Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1}, + "VERB__Mood=Cnd": {"pos": "VERB", "Mood": "cnd"}, + "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Voice": "auto"}}, + "VERB__Mood=Imp|Number=Plur|Person=1|Polarity=Neg": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1, "Polarity": "neg"}, + "VERB__Mood=Imp|Number=Plur|Person=1": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1}, + "VERB__Mood=Imp|Number=Plur|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 2}, + "VERB__Mood=Imp|Number=Sing|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 2}, + "VERB__Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past"}, + "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past"}, + "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres"}, + "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past"}, + "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres"}, + "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres"}, + "VERB__Mood=Ind|PronType=Rel|Tense=Fut": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "fut"}, + "VERB__Mood=Ind|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "pres"}, + "VERB__Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut"}, + "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past"}, + "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres"}, + "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Voice": "auto"}}, + "VERB___": {"pos": "VERB"}, + "X__Abbr=Yes": {"pos": "X", "Other": {"Abbr": "yes"}}, + "X__Case=NomAcc|Foreign=Yes|Gender=Fem|Number=Sing": {"pos": "X", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Foreign": "yes"}, + "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Other": {"Dialect": "ulster"}}, + "X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, + "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", "Number": "sing", "Person": 2, "Polarity": "neg", "Other": {"Dialect": "munster"}}, + "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Voice": "auto"}}, + "X__Dialect=Munster": {"pos": "X", "Other": {"Dialect": "munster"}}, + "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Other": {"Dialect": "munster"}}, + "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"Dialect": "ulster"}}, + "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Other": {"Dialect": "ulster", "PartType": "vb"}}, + "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}}, + "X__Foreign=Yes": {"pos": "X", "Foreign": "yes"}, + "X___": {"pos": "X"} +} diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py new file mode 100644 index 000000000..e93ada52f --- /dev/null +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -0,0 +1,86 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX +from ...symbols import ORTH, LEMMA, NORM + + +_exc = { + "'acha'n": [ + {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, + {ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET}], + + "dem'": [ + {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}], + + "ded'": [ + {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP}, + {ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET}], + + "lem'": [ + {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP}, + {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}], + + "led'": [ + {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP}, + {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}] +} + +for exc_data in [ + {ORTH: "'gus", LEMMA: "agus", NORM: "agus", POS: CCONJ}, + {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, + {ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, + {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV}, + {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV}, + {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV}, + {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV}, + {ORTH: "m'", LEMMA: "mo", POS: DET}, + {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN}, + {ORTH: "Ath.", LEMMA: "athair", POS: NOUN}, + {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN}, + {ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X}, + {ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV}, + {ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN}, + {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN}, + {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN}, + {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN}, + {ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV}, + {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV}, + {ORTH: "srl.", LEMMA: "agus araile", POS: ADV}, + {ORTH: "Co.", LEMMA: "contae", POS: NOUN}, + {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN}, + {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, + {ORTH: "gCo.", LEMMA: "contae", POS: NOUN}, + {ORTH: ".i.", LEMMA: "eadhon", POS: ADV}, + {ORTH: "B'", LEMMA: "ba", POS: AUX}, + {ORTH: "b'", LEMMA: "ba", POS: AUX}, + {ORTH: "lch.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN}, + {ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN}, + {ORTH: "Már.", LEMMA: "Márta", POS: NOUN}, + {ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN}, + {ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN}, + {ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN}, + {ORTH: "tAth.", LEMMA: "athair", POS: NOUN}, + {ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN}, + {ORTH: "teo.", LEMMA: "teoranta", POS: NOUN}, + {ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN}, + {ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN}, + {ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN}, + {ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN}]: + _exc[exc_data[ORTH]] = [exc_data] + +for orth in [ + "d'", "D'"]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index bd9bbf2ea..ee4093db3 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -14,9 +14,8 @@ from .. import util # These languages are used for generic tokenizer tests – only add a language # here if it's using spaCy's tokenizer (not a different library) # TODO: re-implement generic tokenizer tests -_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', +_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id', 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] - _models = {'en': ['en_core_web_sm'], 'de': ['de_core_news_md'], 'fr': ['fr_depvec_web_lg'], @@ -108,6 +107,11 @@ def bn_tokenizer(): return util.get_lang_class('bn').Defaults.create_tokenizer() +@pytest.fixture +def ga_tokenizer(): + return util.get_lang_class('ga').Defaults.create_tokenizer() + + @pytest.fixture def he_tokenizer(): return util.get_lang_class('he').Defaults.create_tokenizer() diff --git a/spacy/tests/lang/ga/__init__.py b/spacy/tests/lang/ga/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py new file mode 100644 index 000000000..85c44f3ef --- /dev/null +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -0,0 +1,17 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +GA_TOKEN_EXCEPTION_TESTS = [ + ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'Ó', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '(', 'lch.', '600', ')', '.']), + ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) +] + + +@pytest.mark.parametrize('text,expected_tokens', GA_TOKEN_EXCEPTION_TESTS) +def test_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens): + tokens = ga_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list