From 9f740a9891d6c118eeb154dd819dba58d93db8ac Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 26 Feb 2020 14:59:03 +0100 Subject: [PATCH 001/131] Add a few more Danish tokenizer exceptions --- spacy/lang/da/tokenizer_exceptions.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index d669fb981..89b083186 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -70,6 +70,7 @@ for orth in [ "A/S", "B.C.", "BK.", + "B.T.", "Dr.", "Boul.", "Chr.", @@ -79,6 +80,7 @@ for orth in [ "Hf.", "i/s", "I/S", + "Inc.", "Kprs.", "L.A.", "Ll.", @@ -149,6 +151,7 @@ for orth in [ "bygn.", "c/o", "ca.", + "cm.", "cand.", "d.d.", "d.m.", @@ -172,10 +175,12 @@ for orth in [ "dl.", "do.", "dobb.", + "dr.", "dr.h.c", "dr.phil.", "ds.", "dvs.", + "d.v.s.", "e.b.", "e.l.", "e.o.", @@ -297,10 +302,14 @@ for orth in [ "kap.", "kbh.", "kem.", + "kg.", + "kgs.", "kgl.", "kl.", "kld.", + "km.", "km/t", + "km/t.", "knsp.", "komm.", "kons.", @@ -311,6 +320,7 @@ for orth in [ "kt.", "ktr.", "kv.", + "kvm.", "kvt.", "l.c.", "lab.", @@ -357,6 +367,7 @@ for orth in [ "nto.", "nuv.", "o/m", + "o/m.", "o.a.", "o.fl.", "o.h.", @@ -526,6 +537,7 @@ for orth in [ "vejl.", "vh.", "vha.", + "vind.", "vs.", "vsa.", "vær.", From 3b53617a69287c45284d0aedc4c7fefcaa631662 Mon Sep 17 00:00:00 2001 From: Baciccin Date: Thu, 19 Mar 2020 21:20:17 -0700 Subject: [PATCH 002/131] Add Ligurian language --- .github/contributors/Baciccin.md | 106 +++++++++++++++++++++++++ spacy/lang/lij/__init__.py | 31 ++++++++ spacy/lang/lij/examples.py | 18 +++++ spacy/lang/lij/punctuation.py | 15 ++++ spacy/lang/lij/stop_words.py | 43 ++++++++++ spacy/lang/lij/tokenizer_exceptions.py | 52 ++++++++++++ website/meta/languages.json | 6 ++ 7 files changed, 271 insertions(+) create mode 100644 .github/contributors/Baciccin.md create mode 100644 spacy/lang/lij/__init__.py create mode 100644 spacy/lang/lij/examples.py create mode 100644 spacy/lang/lij/punctuation.py create mode 100644 spacy/lang/lij/stop_words.py create mode 100644 spacy/lang/lij/tokenizer_exceptions.py diff --git a/.github/contributors/Baciccin.md b/.github/contributors/Baciccin.md new file mode 100644 index 000000000..c7a940cb5 --- /dev/null +++ b/.github/contributors/Baciccin.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Giovanni Battista Parodi | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-19 | +| GitHub username | Baciccin | +| Website (optional) | | diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py new file mode 100644 index 000000000..9b4b29798 --- /dev/null +++ b/spacy/lang/lij/__init__.py @@ -0,0 +1,31 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .punctuation import TOKENIZER_INFIXES + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class LigurianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "lij" + lex_attr_getters[NORM] = add_lookups( + Language.Defaults.lex_attr_getters[NORM], BASE_NORMS + ) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = STOP_WORDS + infixes = TOKENIZER_INFIXES + + +class Ligurian(Language): + lang = "lij" + Defaults = LigurianDefaults + + +__all__ = ["Ligurian"] diff --git a/spacy/lang/lij/examples.py b/spacy/lang/lij/examples.py new file mode 100644 index 000000000..c4034ae7e --- /dev/null +++ b/spacy/lang/lij/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.lij.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Sciusciâ e sciorbî no se peu.", + "Graçie di çetroin, che me son arrivæ.", + "Vegnime apreuvo, che ve fasso pescâ di òmmi.", + "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.", +] diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py new file mode 100644 index 000000000..4439376c8 --- /dev/null +++ b/spacy/lang/lij/punctuation.py @@ -0,0 +1,15 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..punctuation import TOKENIZER_INFIXES +from ..char_classes import ALPHA + + +ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") + + +_infixes = TOKENIZER_INFIXES + [ + r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) +] + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py new file mode 100644 index 000000000..7ab34adf1 --- /dev/null +++ b/spacy/lang/lij/stop_words.py @@ -0,0 +1,43 @@ +# coding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set( + """ +a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei + +bella belle belli bello ben + +ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse + +d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo + +é e ê ea ean emmo en ëse + +fin fiña + +gh' ghe guæei + +i î in insemme int' inta inte inti into + +l' lê lì lô + +m' ma manco me megio meno mezo mi + +na n' ne ni ninte nisciun nisciuña no + +o ò ô oua + +parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio + +quæ quand' quande quarche quella quelle quelli quello + +s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto + +tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto + +un uña unn' unna + +za zu +""".split() +) diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py new file mode 100644 index 000000000..2aa6f8304 --- /dev/null +++ b/spacy/lang/lij/tokenizer_exceptions.py @@ -0,0 +1,52 @@ +# coding: utf8 +from __future__ import unicode_literals +from ...symbols import ORTH, LEMMA + +_exc = {} + +for raw, lemma in [ + ("a-a", "a-o"), + ("a-e", "a-o"), + ("a-o", "a-o"), + ("a-i", "a-o"), + ("co-a", "co-o"), + ("co-e", "co-o"), + ("co-i", "co-o"), + ("co-o", "co-o"), + ("da-a", "da-o"), + ("da-e", "da-o"), + ("da-i", "da-o"), + ("da-o", "da-o"), + ("pe-a", "pe-o"), + ("pe-e", "pe-o"), + ("pe-i", "pe-o"), + ("pe-o", "pe-o"), +]: + for orth in [raw, raw.capitalize()]: + _exc[orth] = [{ORTH: orth, LEMMA: lemma}] + +# Prefix + prepositions with à (e.g. 
"sott'a-o") + +for prep, prep_lemma in [ + ("a-a", "a-o"), + ("a-e", "a-o"), + ("a-o", "a-o"), + ("a-i", "a-o"), +]: + for prefix, prefix_lemma in [ + ("sott'", "sotta"), + ("sott’", "sotta"), + ("contr'", "contra"), + ("contr’", "contra"), + ("ch'", "che"), + ("ch’", "che"), + ("s'", "se"), + ("s’", "se"), + ]: + for prefix_orth in [prefix, prefix.capitalize()]: + _exc[prefix_orth+prep] = [ + {ORTH: prefix_orth, LEMMA: prefix_lemma}, + {ORTH: prep, LEMMA: prep_lemma}, + ] + +TOKENIZER_EXCEPTIONS = _exc diff --git a/website/meta/languages.json b/website/meta/languages.json index 8834aaddc..41c1bce7f 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -181,6 +181,12 @@ "name": "Vietnamese", "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] }, + { + "code": "lij", + "name": "Ligurian", + "example": "Sta chì a l'é unna fraxe.", + "has_examples": true + }, { "code": "xx", "name": "Multi-language", From b52e1ab677c61c20f7b6985461a78193f4c7a8bb Mon Sep 17 00:00:00 2001 From: nlptechbook <60931109+nlptechbook@users.noreply.github.com> Date: Sat, 21 Mar 2020 11:39:15 -0400 Subject: [PATCH 003/131] Update universe.json A bot powered by Clarifai Predict API and spaCy. Can be found in Telegram messenger at @pic2phrase_bot --- website/meta/universe.json | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 23d052bb9..8f8bcfecd 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1619,6 +1619,19 @@ }, "category": ["standalone", "research"] }, + { + "id": "pic2phrase_bot", + "title": "pic2phrase_bot: Photo Description Generator", + "slogan": "A bot that generates descriptions to submitted photos, in a human-like manner.", + "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy." 
+ "thumb": "https://drive.google.com/open?id=1GTrpPzc8j4mAmYCJZibYrADAp0GWcVHd", + "image": "https://drive.google.com/open?id=1t7URKJ-4uOJmZb_GbNvw-F5LLtvEoBRy", + "author": "Yuli Vasiliev", + "author_links": { + "twitter": "VasilievYuli", + }, + "category": ["standalone", "research"] + }, { "id": "gracyql", "title": "gracyql", From 2897a73559ca1663d0e258604686e0134b9095d0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 23 Mar 2020 19:23:47 +0100 Subject: [PATCH 004/131] Improve German tokenizer settings style --- spacy/lang/de/punctuation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index c376ce597..da6ab1d40 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER -from ..punctuation import _prefixes, _suffixes +from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES -_prefixes = ["``",] + list(_prefixes) +_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES _suffixes = ( ["''", "/"] From 79737adb90f286ca5b9be6e1020ea5b1855eed58 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 2 Dec 2019 13:48:27 +0100 Subject: [PATCH 005/131] Improved tokenization for UD_Norwegian-Bokmaal --- spacy/lang/nb/__init__.py | 5 ++ spacy/lang/nb/punctuation.py | 50 ++++++++++++--- spacy/lang/nb/tokenizer_exceptions.py | 92 +++++++++++++++++++-------- 3 files changed, 109 insertions(+), 38 deletions(-) diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 086761f82..e6c58b7de 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES from .syntax_iterators import SYNTAX_ITERATORS @@ -21,6 +23,9 @@ class NorwegianDefaults(Language.Defaults): Language.Defaults.lex_attr_getters[NORM], BASE_NORMS ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + prefixes = TOKENIZER_PREFIXES + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS morph_rules = MORPH_RULES tag_map = TAG_MAP diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index b49aa9838..7672809ec 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -1,16 +1,33 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER -from ..punctuation import TOKENIZER_SUFFIXES +from ..char_classes import CURRENCY, PUNCT, UNITS, LIST_CURRENCY -# Punctuation stolen from Danish + +# Punctuation adapted from Danish _quotes = CONCAT_QUOTES.replace("'", "") +_list_punct = [x for x in LIST_PUNCT if x != "#"] +_list_icons = [x for x in LIST_ICONS if x != "°"] +_list_icons = [x.replace("\\u00B0", "") for x in _list_icons] +_list_quotes = [x for x in LIST_QUOTES if x != "\\'"] + + +_prefixes = ( + ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + + _list_punct + + 
LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY + + LIST_ICONS +) + + _infixes = ( LIST_ELLIPSES - + LIST_ICONS + + _list_icons + [ r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), @@ -21,13 +38,26 @@ _infixes = ( ] ) -_suffixes = [ - suffix - for suffix in TOKENIZER_SUFFIXES - if suffix not in ["'s", "'S", "’s", "’S", r"\'"] -] -_suffixes += [r"(?<=[^sSxXzZ])\'"] +_suffixes = ( + LIST_PUNCT + + LIST_ELLIPSES + + _list_quotes + + _list_icons + + ["—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] + + [r"(?<=[^sSxXzZ])'"] +) +TOKENIZER_PREFIXES = _prefixes TOKENIZER_INFIXES = _infixes TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index 92ac09841..3f4aa79f6 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -24,57 +24,80 @@ for exc_data in [ for orth in [ - "adm.dir.", - "a.m.", - "andelsnr", + "Ap.", "Aq.", + "Ca.", + "Chr.", + "Co.", + "Co.", + "Dr.", + "F.eks.", + "Fr.p.", + "Frp.", + "Grl.", + "Kr.", + "Kr.F.", + "Kr.F.s", + "Mr.", + "Mrs.", + "Pb.", + "Pr.", + "Sp.", + "Sp.", + "St.", + "a.m.", + "ad.", + "adm.dir.", + "andelsnr", "b.c.", "bl.a.", "bla.", "bm.", "bnr.", "bto.", + "c.c.", "ca.", "cand.mag.", - "c.c.", "co.", "d.d.", - "dept.", "d.m.", - "dr.philos.", - "dvs.", "d.y.", - "E. coli", + "dept.", + "dr.", + "dr.med.", + "dr.philos.", + "dr.psychol.", + "dvs.", + "e.Kr.", + "e.l.", "eg.", "ekskl.", - "e.Kr.", "el.", - "e.l.", "et.", "etc.", "etg.", "ev.", "evt.", "f.", + "f.Kr.", "f.eks.", + "f.o.m.", "fhv.", "fk.", - "f.Kr.", - "f.o.m.", "foreg.", "fork.", "fv.", "fvt.", "g.", - "gt.", "gl.", "gno.", "gnr.", "grl.", + "gt.", + "h.r.adv.", "hhv.", "hoh.", "hr.", - "h.r.adv.", "ifb.", "ifm.", "iht.", @@ -83,39 +106,45 @@ for orth in [ "jf.", "jr.", "jun.", + "juris.", "kfr.", + "kgl.", "kgl.res.", "kl.", "komm.", "kr.", "kst.", + "lat.", "lø.", + "m.a.o.", + "m.fl.", + "m.m.", + "m.v.", "ma.", "mag.art.", - "m.a.o.", "md.", "mfl.", + "mht.", "mill.", "min.", - "m.m.", "mnd.", "moh.", - "Mr.", + "mrd.", "muh.", "mv.", "mva.", + "n.å.", "ndf.", "no.", "nov.", "nr.", "nto.", "nyno.", - "n.å.", "o.a.", + "o.l.", "off.", "ofl.", "okt.", - "o.l.", "on.", "op.", "org.", @@ -123,14 +152,15 @@ for orth in [ "ovf.", "p.", "p.a.", - "Pb.", + "p.g.a.", + "p.m.", + "p.t.", "pga.", "ph.d.", "pkt.", - "p.m.", "pr.", "pst.", - "p.t.", + "pt.", "red.anm.", "ref.", "res.", @@ -139,6 +169,10 @@ for orth in [ "rv.", "s.", "s.d.", + "s.k.", + "s.k.", + "s.u.", + "s.å.", "sen.", "sep.", "siviling.", @@ -148,16 +182,17 @@ for orth in [ "sr.", "sst.", "st.", - "stip.", - "stk.", "st.meld.", "st.prp.", + "stip.", + "stk.", "stud.", - "s.u.", "sv.", - "sø.", - "s.å.", "såk.", + "sø.", + "t.h.", + "t.o.m.", + "t.v.", "temp.", "ti.", "tils.", @@ -165,7 +200,6 @@ for orth in [ "tl;dr", "tlf.", "to.", - "t.o.m.", "ult.", "utg.", "v.", @@ -179,8 +213,10 @@ for orth in [ "vol.", "vs.", "vsa.", + "©NTB", "årg.", "årh.", + "§§", ]: _exc[orth] = [{ORTH: orth}] From cba2d1d972239bae86fcd5a0b3bd5e8ede04af9c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 25 Mar 2020 09:39:26 +0100 Subject: [PATCH 006/131] Disable failing abbreviation test UD_Danish-DDT has (as far as I can tell) hallucinated 
periods after abbreviations, so the changes are an artifact of the corpus and not due to anything meaningful about Danish tokenization. --- spacy/tests/lang/da/test_exceptions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index a522ab5e8..f98030621 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -58,7 +58,8 @@ def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm): ("Kristiansen c/o Madsen", 3), ("Sprogteknologi a/s", 2), ("De boede i A/B Bellevue", 5), - ("Rotorhastigheden er 3400 o/m.", 5), + # note: skipping due to weirdness in UD_Danish-DDT + #("Rotorhastigheden er 3400 o/m.", 5), ("Jeg købte billet t/r.", 5), ("Murerarbejdsmand m/k søges", 3), ("Netværket kører over TCP/IP", 4), From 4117a5c7056a65aafb29db137b4f52b264d915fc Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:27:42 +0100 Subject: [PATCH 007/131] Improve French tokenization (#5202) Improve French tokenization for UD_French-Sequoia. --- spacy/lang/fr/__init__.py | 4 ++- spacy/lang/fr/punctuation.py | 19 +++++++--- spacy/lang/fr/tokenizer_exceptions.py | 50 +++++++++++++++++++++++---- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index f56c8688a..7727aff0e 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -2,7 +2,8 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH -from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_SUFFIXES from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -27,6 +28,7 @@ class FrenchDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS + prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES token_match = TOKEN_MATCH diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index 1422b4194..e03e91361 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -1,15 +1,24 @@ # coding: utf8 from __future__ import unicode_literals -from ..punctuation import TOKENIZER_INFIXES +from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..char_classes import merge_chars -ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") -HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "") +ELISION = "' ’".replace(" ", "") +HYPHENS = r"- – — ‐ ‑".replace(" ", "") +_prefixes_elision = "d l n" +_prefixes_elision += " " + _prefixes_elision.upper() +_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous" +_hyphen_suffixes += " " + _hyphen_suffixes.upper() +_prefixes = TOKENIZER_PREFIXES + [ + r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)) +] + _suffixes = ( LIST_PUNCT + LIST_ELLIPSES @@ -17,7 +26,6 @@ _suffixes = ( + [ r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", # °C. 
-> ["°C", "."] - r"(?<=[0-9])°[FfCcKk]", # 4°C -> ["4", "°C"] r"(?<=[0-9])%", # 4% -> ["4", "%"] r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{u})".format(u=UNITS), @@ -25,14 +33,15 @@ _suffixes = ( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)), ] ) - _infixes = TOKENIZER_INFIXES + [ r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) ] +TOKENIZER_PREFIXES = _prefixes TOKENIZER_SUFFIXES = _suffixes TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 4b3b2c908..56c5544a5 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -9,7 +9,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA from ...symbols import ORTH, LEMMA, TAG # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer -# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS +#from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"] @@ -56,7 +56,28 @@ for exc_data in [ _exc[exc_data[ORTH]] = [exc_data] -for orth in ["etc."]: +for orth in [ + "après-midi", + "au-delà", + "au-dessus", + "celle-ci", + "celles-ci", + "celui-ci", + "cf.", + "ci-dessous", + "elle-même", + "en-dessous", + "etc.", + "jusque-là", + "lui-même", + "MM.", + "No.", + "peut-être", + "pp.", + "quelques-uns", + "rendez-vous", + "Vol.", +]: _exc[orth] = [{ORTH: orth}] @@ -72,7 +93,7 @@ for verb, verb_lemma in [ for pronoun in ["elle", "il", "on"]: token = "{}-t-{}".format(orth, pronoun) _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, + {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"}, {LEMMA: "t", ORTH: "-t"}, {LEMMA: pronoun, ORTH: "-" + pronoun}, ] @@ -81,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]: for orth in [verb, verb.title()]: token = "{}-ce".format(orth) _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, + {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"}, {LEMMA: "ce", ORTH: "-ce"}, ] @@ -89,12 +110,29 @@ for verb, verb_lemma in [("est", "être")]: for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]: for orth in [pre, pre.title()]: _exc["%sest-ce" % orth] = [ - {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"}, - {LEMMA: "être", ORTH: "est", TAG: "VERB"}, + {LEMMA: pre_lemma, ORTH: orth}, + {LEMMA: "être", ORTH: "est"}, {LEMMA: "ce", ORTH: "-ce"}, ] +for verb, pronoun in [("est", "il"), ("EST", "IL")]: + token = "{}-{}".format(verb, pronoun) + _exc[token] = [ + {LEMMA: "être", ORTH: verb}, + {LEMMA: pronoun, ORTH: "-" + pronoun}, + ] + + +for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]: + token = "{}'{}-{}".format(s, verb, pronoun) + _exc[token] = [ + {LEMMA: "se", ORTH: s + "'"}, + {LEMMA: "être", ORTH: verb}, + {LEMMA: pronoun, ORTH: "-" + pronoun}, + ] + + _infixes_exc = [] orig_elision = "'" orig_hyphen = "-" From 923a453449d7bc236e72ba23286845aba5ab3fe3 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:27:53 +0100 Subject: [PATCH 008/131] Modifications/updates to Portuguese tokenization (#5203) Modifications to Portuguese tokenization for UD_Portuguese-Bosque. Instead of splitting contactions as exceptions, they are kept as merged tokens. 
--- spacy/lang/pt/tokenizer_exceptions.py | 60 +++++++++------------------ 1 file changed, 19 insertions(+), 41 deletions(-) diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py index 5169780e6..c36af6771 100644 --- a/spacy/lang/pt/tokenizer_exceptions.py +++ b/spacy/lang/pt/tokenizer_exceptions.py @@ -4,69 +4,47 @@ from __future__ import unicode_literals from ...symbols import ORTH, NORM -_exc = { - "às": [{ORTH: "à", NORM: "a"}, {ORTH: "s", NORM: "as"}], - "ao": [{ORTH: "a"}, {ORTH: "o"}], - "aos": [{ORTH: "a"}, {ORTH: "os"}], - "àquele": [{ORTH: "à", NORM: "a"}, {ORTH: "quele", NORM: "aquele"}], - "àquela": [{ORTH: "à", NORM: "a"}, {ORTH: "quela", NORM: "aquela"}], - "àqueles": [{ORTH: "à", NORM: "a"}, {ORTH: "queles", NORM: "aqueles"}], - "àquelas": [{ORTH: "à", NORM: "a"}, {ORTH: "quelas", NORM: "aquelas"}], - "àquilo": [{ORTH: "à", NORM: "a"}, {ORTH: "quilo", NORM: "aquilo"}], - "aonde": [{ORTH: "a"}, {ORTH: "onde"}], -} - - -# Contractions -_per_pron = ["ele", "ela", "eles", "elas"] -_dem_pron = [ - "este", - "esta", - "estes", - "estas", - "isto", - "esse", - "essa", - "esses", - "essas", - "isso", - "aquele", - "aquela", - "aqueles", - "aquelas", - "aquilo", -] -_und_pron = ["outro", "outra", "outros", "outras"] -_adv = ["aqui", "aí", "ali", "além"] - - -for orth in _per_pron + _dem_pron + _und_pron + _adv: - _exc["d" + orth] = [{ORTH: "d", NORM: "de"}, {ORTH: orth}] - -for orth in _per_pron + _dem_pron + _und_pron: - _exc["n" + orth] = [{ORTH: "n", NORM: "em"}, {ORTH: orth}] +_exc = {} for orth in [ "Adm.", + "Art.", + "art.", + "Av.", + "av.", + "Cia.", + "dom.", "Dr.", + "dr.", "e.g.", "E.g.", "E.G.", + "e/ou", + "ed.", + "eng.", + "etc.", + "Fund.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", + "Inc.", "Jr.", + "km/h", "Ltd.", + "Mr.", "p.m.", "Ph.D.", "Rep.", "Rev.", + "S/A", "Sen.", "Sr.", + "sr.", "Sra.", + "sra.", "vs.", "tel.", "pág.", From 1a944e5976b260f8ee42a52fb016808f427ef77f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:28:02 +0100 Subject: [PATCH 009/131] Improve Italian tokenization (#5204) Improve Italian tokenization for UD_Italian-ISDT. 
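
As a rough sketch of what the new rules do (not part of the patch itself;
it assumes a checkout with these changes and uses an invented sentence):

    from spacy.lang.it import Italian

    nlp = Italian()

    # The added elision infix splits forms like "dell'articolo" into
    # "dell'" + "articolo", and the new prefix patterns are aimed at
    # keeping apostrophe-years ('99) and ordinals (13°) intact:
    print([t.text for t in nlp("Nell'estate del '99 parlava dell'articolo 18.")])
    # expected (roughly): Nell' | estate | del | '99 | parlava | dell' |
    # articolo | 18 | .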
--- spacy/lang/it/__init__.py | 3 +- spacy/lang/it/punctuation.py | 36 +++++++++++++++---- spacy/lang/it/tokenizer_exceptions.py | 52 ++++++++++++++++++++++++++- 3 files changed, 83 insertions(+), 8 deletions(-) diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 90763eda5..06d146748 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -22,6 +22,7 @@ class ItalianDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP + prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py index 4fa931fde..f2c1fd84a 100644 --- a/spacy/lang/it/punctuation.py +++ b/spacy/lang/it/punctuation.py @@ -1,15 +1,39 @@ # coding: utf8 from __future__ import unicode_literals -from ..punctuation import TOKENIZER_INFIXES -from ..char_classes import ALPHA +from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES +from ..char_classes import ALPHA_LOWER, ALPHA_UPPER -ELISION = " ' ’ ".strip().replace(" ", "") +ELISION = "'’" -_infixes = TOKENIZER_INFIXES + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) -] +_prefixes = ( + [ + r"'[0-9][0-9]", + r"[0-9]+°", + ] + + TOKENIZER_PREFIXES +) + + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER), + r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION) + ] +) + +TOKENIZER_PREFIXES = _prefixes TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 62f568c5c..70dfe92bd 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -2,6 +2,56 @@ from __future__ import unicode_literals from ...symbols import ORTH, LEMMA -_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]} +_exc = { + "all'art.": [{ORTH: "all'"}, {ORTH: "art."}], + "dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}], + "dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}], + "L'art.": [{ORTH: "L'"}, {ORTH: "art."}], + "l'art.": [{ORTH: "l'"}, {ORTH: "art."}], + "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}], + "po'": [{ORTH: "po'", LEMMA: "poco"}], + "sett..": [{ORTH: "sett."}, {ORTH: "."}] +} + +for orth in [ + "..", + "....", + "al.", + "all-path", + "art.", + "Art.", + "artt.", + "att.", + "by-pass", + "c.d.", + "centro-sinistra", + "check-up", + "Civ.", + "cm.", + "Cod.", + "col.", + "Cost.", + "d.C.", + 'de"' + "distr.", + "E'", + "ecc.", + "e-mail", + "e/o", + "etc.", + "Jr.", + "n°", + "nord-est", + "pag.", + "Proc.", + "prof.", + "sett.", + "s.p.a.", + "ss.", + "St.", + "tel.", + "week-end", +]: + _exc[orth] = [{ORTH: orth}] TOKENIZER_EXCEPTIONS = _exc From 86c43e55fa3a9557e838998bc288bb4833c2d0ec Mon Sep 17 
00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:28:12 +0100 Subject: [PATCH 010/131] Improve Lithuanian tokenization (#5205) * Improve Lithuanian tokenization Modify Lithuanian tokenization to improve performance for UD_Lithuanian-ALKSNIS. * Update Lithuanian tokenizer tests --- spacy/lang/lt/__init__.py | 7 +- spacy/lang/lt/punctuation.py | 29 ++ spacy/lang/lt/tokenizer_exceptions.py | 514 +++++++++++++------------- spacy/tests/lang/lt/test_text.py | 6 +- 4 files changed, 296 insertions(+), 260 deletions(-) create mode 100644 spacy/lang/lt/punctuation.py diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index 7919a4858..1dfe932ee 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -26,7 +27,11 @@ class LithuanianDefaults(Language.Defaults): ) lex_attr_getters.update(LEX_ATTRS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES + mod_base_exceptions = {exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")} + del mod_base_exceptions["8)"] + tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP morph_rules = MORPH_RULES diff --git a/spacy/lang/lt/punctuation.py b/spacy/lang/lt/punctuation.py new file mode 100644 index 000000000..5eedc8116 --- /dev/null +++ b/spacy/lang/lt/punctuation.py @@ -0,0 +1,29 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import LIST_ICONS, LIST_ELLIPSES +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA +from ..char_classes import HYPHENS +from ..punctuation import TOKENIZER_SUFFIXES + + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + + +_suffixes = ["\."] + list(TOKENIZER_SUFFIXES) + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py index fcf807278..f8e11156d 100644 --- a/spacy/lang/lt/tokenizer_exceptions.py +++ b/spacy/lang/lt/tokenizer_exceptions.py @@ -6,262 +6,264 @@ from ...symbols import ORTH _exc = {} for orth in [ - "G.", - "J. E.", - "J. 
Em.", - "J.E.", - "J.Em.", - "K.", - "N.", - "V.", - "Vt.", - "a.", - "a.k.", - "a.s.", - "adv.", - "akad.", - "aklg.", - "akt.", - "al.", - "ang.", - "angl.", - "aps.", - "apskr.", - "apyg.", - "arbat.", - "asist.", - "asm.", - "asm.k.", - "asmv.", - "atk.", - "atsak.", - "atsisk.", - "atsisk.sąsk.", - "atv.", - "aut.", - "avd.", - "b.k.", - "baud.", - "biol.", - "bkl.", - "bot.", - "bt.", - "buv.", - "ch.", - "chem.", - "corp.", - "d.", - "dab.", - "dail.", - "dek.", - "deš.", - "dir.", - "dirig.", - "doc.", - "dol.", - "dr.", - "drp.", - "dvit.", - "dėst.", - "dš.", - "dž.", - "e.b.", - "e.bankas", - "e.p.", - "e.parašas", - "e.paštas", - "e.v.", - "e.valdžia", - "egz.", - "eil.", - "ekon.", - "el.", - "el.bankas", - "el.p.", - "el.parašas", - "el.paštas", - "el.valdžia", - "etc.", - "ež.", - "fak.", - "faks.", - "feat.", - "filol.", - "filos.", - "g.", - "gen.", - "geol.", - "gerb.", - "gim.", - "gr.", - "gv.", - "gyd.", - "gyv.", - "habil.", - "inc.", - "insp.", - "inž.", - "ir pan.", - "ir t. t.", - "isp.", - "istor.", - "it.", - "just.", - "k.", - "k. a.", - "k.a.", - "kab.", - "kand.", - "kart.", - "kat.", - "ketv.", - "kh.", - "kl.", - "kln.", - "km.", - "kn.", - "koresp.", - "kpt.", - "kr.", - "kt.", - "kub.", - "kun.", - "kv.", - "kyš.", - "l. e. p.", - "l.e.p.", - "lenk.", - "liet.", - "lot.", - "lt.", - "ltd.", - "ltn.", - "m.", - "m.e..", - "m.m.", - "mat.", - "med.", - "mgnt.", - "mgr.", - "min.", - "mjr.", - "ml.", - "mln.", - "mlrd.", - "mob.", - "mok.", - "moksl.", - "mokyt.", - "mot.", - "mr.", - "mst.", - "mstl.", - "mėn.", - "nkt.", - "no.", - "nr.", - "ntk.", - "nuotr.", - "op.", - "org.", - "orig.", - "p.", - "p.d.", - "p.m.e.", - "p.s.", - "pab.", - "pan.", - "past.", - "pav.", - "pavad.", - "per.", - "perd.", - "pirm.", - "pl.", - "plg.", - "plk.", - "pr.", - "pr.Kr.", - "pranc.", - "proc.", - "prof.", - "prom.", - "prot.", - "psl.", - "pss.", - "pvz.", - "pšt.", - "r.", - "raj.", - "red.", - "rez.", - "rež.", - "rus.", - "rš.", - "s.", - "sav.", - "saviv.", - "sek.", - "sekr.", - "sen.", - "sh.", - "sk.", - "skg.", - "skv.", - "skyr.", - "sp.", - "spec.", - "sr.", - "st.", - "str.", - "stud.", - "sąs.", - "t.", - "t. p.", - "t. y.", - "t.p.", - "t.t.", - "t.y.", - "techn.", - "tel.", - "teol.", - "th.", - "tir.", - "trit.", - "trln.", - "tšk.", - "tūks.", - "tūkst.", - "up.", - "upl.", - "v.s.", - "vad.", - "val.", - "valg.", - "ved.", - "vert.", - "vet.", - "vid.", - "virš.", - "vlsč.", - "vnt.", - "vok.", - "vs.", - "vtv.", - "vv.", - "vyr.", - "vyresn.", - "zool.", - "Įn", - "įl.", - "š.m.", - "šnek.", - "šv.", - "švč.", - "ž.ū.", - "žin.", - "žml.", - "žr.", + "n-tosios", + "?!", +# "G.", +# "J. E.", +# "J. 
Em.", +# "J.E.", +# "J.Em.", +# "K.", +# "N.", +# "V.", +# "Vt.", +# "a.", +# "a.k.", +# "a.s.", +# "adv.", +# "akad.", +# "aklg.", +# "akt.", +# "al.", +# "ang.", +# "angl.", +# "aps.", +# "apskr.", +# "apyg.", +# "arbat.", +# "asist.", +# "asm.", +# "asm.k.", +# "asmv.", +# "atk.", +# "atsak.", +# "atsisk.", +# "atsisk.sąsk.", +# "atv.", +# "aut.", +# "avd.", +# "b.k.", +# "baud.", +# "biol.", +# "bkl.", +# "bot.", +# "bt.", +# "buv.", +# "ch.", +# "chem.", +# "corp.", +# "d.", +# "dab.", +# "dail.", +# "dek.", +# "deš.", +# "dir.", +# "dirig.", +# "doc.", +# "dol.", +# "dr.", +# "drp.", +# "dvit.", +# "dėst.", +# "dš.", +# "dž.", +# "e.b.", +# "e.bankas", +# "e.p.", +# "e.parašas", +# "e.paštas", +# "e.v.", +# "e.valdžia", +# "egz.", +# "eil.", +# "ekon.", +# "el.", +# "el.bankas", +# "el.p.", +# "el.parašas", +# "el.paštas", +# "el.valdžia", +# "etc.", +# "ež.", +# "fak.", +# "faks.", +# "feat.", +# "filol.", +# "filos.", +# "g.", +# "gen.", +# "geol.", +# "gerb.", +# "gim.", +# "gr.", +# "gv.", +# "gyd.", +# "gyv.", +# "habil.", +# "inc.", +# "insp.", +# "inž.", +# "ir pan.", +# "ir t. t.", +# "isp.", +# "istor.", +# "it.", +# "just.", +# "k.", +# "k. a.", +# "k.a.", +# "kab.", +# "kand.", +# "kart.", +# "kat.", +# "ketv.", +# "kh.", +# "kl.", +# "kln.", +# "km.", +# "kn.", +# "koresp.", +# "kpt.", +# "kr.", +# "kt.", +# "kub.", +# "kun.", +# "kv.", +# "kyš.", +# "l. e. p.", +# "l.e.p.", +# "lenk.", +# "liet.", +# "lot.", +# "lt.", +# "ltd.", +# "ltn.", +# "m.", +# "m.e..", +# "m.m.", +# "mat.", +# "med.", +# "mgnt.", +# "mgr.", +# "min.", +# "mjr.", +# "ml.", +# "mln.", +# "mlrd.", +# "mob.", +# "mok.", +# "moksl.", +# "mokyt.", +# "mot.", +# "mr.", +# "mst.", +# "mstl.", +# "mėn.", +# "nkt.", +# "no.", +# "nr.", +# "ntk.", +# "nuotr.", +# "op.", +# "org.", +# "orig.", +# "p.", +# "p.d.", +# "p.m.e.", +# "p.s.", +# "pab.", +# "pan.", +# "past.", +# "pav.", +# "pavad.", +# "per.", +# "perd.", +# "pirm.", +# "pl.", +# "plg.", +# "plk.", +# "pr.", +# "pr.Kr.", +# "pranc.", +# "proc.", +# "prof.", +# "prom.", +# "prot.", +# "psl.", +# "pss.", +# "pvz.", +# "pšt.", +# "r.", +# "raj.", +# "red.", +# "rez.", +# "rež.", +# "rus.", +# "rš.", +# "s.", +# "sav.", +# "saviv.", +# "sek.", +# "sekr.", +# "sen.", +# "sh.", +# "sk.", +# "skg.", +# "skv.", +# "skyr.", +# "sp.", +# "spec.", +# "sr.", +# "st.", +# "str.", +# "stud.", +# "sąs.", +# "t.", +# "t. p.", +# "t. y.", +# "t.p.", +# "t.t.", +# "t.y.", +# "techn.", +# "tel.", +# "teol.", +# "th.", +# "tir.", +# "trit.", +# "trln.", +# "tšk.", +# "tūks.", +# "tūkst.", +# "up.", +# "upl.", +# "v.s.", +# "vad.", +# "val.", +# "valg.", +# "ved.", +# "vert.", +# "vet.", +# "vid.", +# "virš.", +# "vlsč.", +# "vnt.", +# "vok.", +# "vs.", +# "vtv.", +# "vv.", +# "vyr.", +# "vyresn.", +# "zool.", +# "Įn", +# "įl.", +# "š.m.", +# "šnek.", +# "šv.", +# "švč.", +# "ž.ū.", +# "žin.", +# "žml.", +# "žr.", ]: _exc[orth] = [{ORTH: orth}] diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py index cac32aa4d..bb9c75383 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -15,11 +15,11 @@ def test_lt_tokenizer_handles_long_text(lt_tokenizer): [ ( "177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", - 15, + 17, ), ( "ISM universiteto doc. dr. 
Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", - 16, + 18, ), ], ) @@ -31,7 +31,7 @@ def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length): @pytest.mark.parametrize("text", ["km.", "pvz.", "biol."]) def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text): tokens = lt_tokenizer(text) - assert len(tokens) == 1 + assert len(tokens) == 2 @pytest.mark.parametrize( From b71dd44dbcfb6f4aa78034b4419c793972c77e62 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:28:19 +0100 Subject: [PATCH 011/131] Improved Romanian tokenization for UD RRT (#5206) Modifications to Romanian tokenization to improve tokenization for UD_Romanian-RRT. From 828acffc12d6e57f48c345196e79ffa1fb917419 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Mar 2020 12:28:12 +0100 Subject: [PATCH 012/131] Tidy up and auto-format --- spacy/cli/train.py | 28 +- spacy/displacy/__init__.py | 11 +- spacy/displacy/render.py | 21 +- spacy/errors.py | 1 - spacy/lang/de/punctuation.py | 2 +- spacy/lang/eu/examples.py | 2 +- spacy/lang/eu/lex_attrs.py | 1 - spacy/lang/eu/stop_words.py | 2 +- spacy/lang/fr/punctuation.py | 8 +- spacy/lang/fr/tokenizer_exceptions.py | 8 +- spacy/lang/it/punctuation.py | 13 +- spacy/lang/it/tokenizer_exceptions.py | 4 +- spacy/lang/lij/stop_words.py | 2 +- spacy/lang/lij/tokenizer_exceptions.py | 2 +- spacy/lang/lt/__init__.py | 4 +- spacy/lang/lt/tokenizer_exceptions.py | 512 +-- spacy/lang/nb/punctuation.py | 1 - spacy/lang/pt/tokenizer_exceptions.py | 2 +- spacy/lang/sk/tag_map.py | 2918 ++++++++--------- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/language.py | 11 +- spacy/pipeline/entityruler.py | 6 +- spacy/tests/doc/test_array.py | 2 - spacy/tests/lang/da/test_exceptions.py | 2 +- spacy/tests/lang/eu/test_text.py | 8 +- spacy/tests/lang/hu/test_tokenizer.py | 16 +- spacy/tests/matcher/test_matcher_api.py | 3 +- spacy/tests/pipeline/test_entity_ruler.py | 7 +- spacy/tests/regression/test_issue4725.py | 1 - spacy/tests/regression/test_issue4849.py | 13 +- .../serialize/test_serialize_tokenizer.py | 2 +- spacy/tests/util.py | 6 +- 32 files changed, 1828 insertions(+), 1793 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 59b0f2225..6408a6024 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -225,7 +225,9 @@ def train( exits=1, ) msg.text("Extending component from base model '{}'".format(pipe)) - disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline]) + disabled_pipes = nlp.disable_pipes( + [p for p in nlp.pipe_names if p not in pipeline] + ) else: msg.text("Starting with blank model '{}'".format(lang)) lang_cls = util.get_lang_class(lang) @@ -415,10 +417,10 @@ def train( losses=losses, ) except ValueError as e: - msg.warn("Error during training") + err = "Error during training" if init_tok2vec: - msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?") - msg.fail("Original error message: {}".format(e), exits=1) + err += " Did you provide the same parameters during 'train' as during 'pretrain'?" + msg.fail(err, "Original error message: {}".format(e), exits=1) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. @@ -546,7 +548,10 @@ def train( ) break except Exception as e: - msg.warn("Aborting and saving the final best model. Encountered exception: {}".format(e)) + msg.warn( + "Aborting and saving the final best model. 
" + "Encountered exception: {}".format(e) + ) finally: best_pipes = nlp.pipe_names if disabled_pipes: @@ -563,13 +568,20 @@ def train( final_meta["speed"].setdefault("gpu", None) # combine cpu and gpu speeds with the base model speeds if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: - speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]]) + speed = _get_total_speed( + [final_meta["speed"]["cpu"], meta["speed"]["cpu"]] + ) final_meta["speed"]["cpu"] = speed if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: - speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]]) + speed = _get_total_speed( + [final_meta["speed"]["gpu"], meta["speed"]["gpu"]] + ) final_meta["speed"]["gpu"] = speed # if there were no speeds to update, overwrite with meta - if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None: + if ( + final_meta["speed"]["cpu"] is None + and final_meta["speed"]["gpu"] is None + ): final_meta["speed"].update(meta["speed"]) # note: beam speeds are not combined with the base model if has_beam_widths: diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index e13b0403b..922d80e57 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -146,9 +146,14 @@ def parse_deps(orig_doc, options={}): retokenizer.merge(span, attrs=attrs) fine_grained = options.get("fine_grained") add_lemma = options.get("add_lemma") - words = [{"text": w.text, - "tag": w.tag_ if fine_grained else w.pos_, - "lemma": w.lemma_ if add_lemma else None} for w in doc] + words = [ + { + "text": w.text, + "tag": w.tag_ if fine_grained else w.pos_, + "lemma": w.lemma_ if add_lemma else None, + } + for w in doc + ] arcs = [] for word in doc: diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 68df324d6..57d67c96b 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -3,7 +3,13 @@ from __future__ import unicode_literals import uuid -from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS +from .templates import ( + TPL_DEP_SVG, + TPL_DEP_WORDS, + TPL_DEP_WORDS_LEMMA, + TPL_DEP_ARCS, + TPL_ENTS, +) from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE from ..util import minify_html, escape_html, registry from ..errors import Errors @@ -83,7 +89,10 @@ class DependencyRenderer(object): self.width = self.offset_x + len(words) * self.distance self.height = self.offset_y + 3 * self.word_spacing self.id = render_id - words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)] + words = [ + self.render_word(w["text"], w["tag"], w.get("lemma", None), i) + for i, w in enumerate(words) + ] arcs = [ self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i) for i, a in enumerate(arcs) @@ -101,7 +110,9 @@ class DependencyRenderer(object): lang=self.lang, ) - def render_word(self, text, tag, lemma, i,): + def render_word( + self, text, tag, lemma, i, + ): """Render individual word. text (unicode): Word text. 
@@ -115,7 +126,9 @@ class DependencyRenderer(object): x = self.width - x html_text = escape_html(text) if lemma is not None: - return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y) + return TPL_DEP_WORDS_LEMMA.format( + text=html_text, tag=tag, lemma=lemma, x=x, y=y + ) return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y) def render_arrow(self, label, start, end, direction, i): diff --git a/spacy/errors.py b/spacy/errors.py index b43b8487f..c751ad65a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -112,7 +112,6 @@ class Warnings(object): "in problems with the vocab further on in the pipeline.") - @add_codes class Errors(object): E001 = ("No component '{name}' found in pipeline. Available names: {opts}") diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index da6ab1d40..93454ffff 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES -from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT +from ..char_classes import CURRENCY, UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py index f2d325d78..463494abd 100644 --- a/spacy/lang/eu/examples.py +++ b/spacy/lang/eu/examples.py @@ -10,5 +10,5 @@ Example sentences to test spaCy and its language models. sentences = [ "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du", - "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira" + "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira", ] diff --git a/spacy/lang/eu/lex_attrs.py b/spacy/lang/eu/lex_attrs.py index c11e913db..19b75c111 100644 --- a/spacy/lang/eu/lex_attrs.py +++ b/spacy/lang/eu/lex_attrs.py @@ -59,7 +59,6 @@ behin """.split() - def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py index 208238961..dda11a7fd 100644 --- a/spacy/lang/eu/stop_words.py +++ b/spacy/lang/eu/stop_words.py @@ -5,7 +5,7 @@ from __future__ import unicode_literals # https://www.ranks.nl/stopwords/basque # https://www.mustgo.com/worldlanguages/basque/ STOP_WORDS = set( -""" + """ al anitz arabera diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index e03e91361..7d50c4a9e 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -16,7 +16,9 @@ _hyphen_suffixes += " " + _hyphen_suffixes.upper() _prefixes = TOKENIZER_PREFIXES + [ - r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)) + r"(?:({pe})[{el}])(?=[{a}])".format( + a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision) + ) ] _suffixes = ( @@ -33,7 +35,9 @@ _suffixes = ( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), - r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)), + r"(?<=[{a}])[{h}]({hs})".format( + a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes) + ), ] ) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 56c5544a5..dfcb2756e 100644 
--- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -6,10 +6,10 @@ import re from .punctuation import ELISION, HYPHENS from ..tokenizer_exceptions import URL_PATTERN from ..char_classes import ALPHA_LOWER, ALPHA -from ...symbols import ORTH, LEMMA, TAG +from ...symbols import ORTH, LEMMA # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer -#from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS +# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"] @@ -93,7 +93,7 @@ for verb, verb_lemma in [ for pronoun in ["elle", "il", "on"]: token = "{}-t-{}".format(orth, pronoun) _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"}, + {LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"}, {LEMMA: "t", ORTH: "-t"}, {LEMMA: pronoun, ORTH: "-" + pronoun}, ] @@ -102,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]: for orth in [verb, verb.title()]: token = "{}-ce".format(orth) _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"}, + {LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"}, {LEMMA: "ce", ORTH: "-ce"}, ] diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py index f2c1fd84a..1d641f144 100644 --- a/spacy/lang/it/punctuation.py +++ b/spacy/lang/it/punctuation.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES from ..char_classes import ALPHA_LOWER, ALPHA_UPPER @@ -10,14 +10,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA_UPPER ELISION = "'’" -_prefixes = ( - [ - r"'[0-9][0-9]", - r"[0-9]+°", - - ] - + TOKENIZER_PREFIXES -) +_prefixes = [r"'[0-9][0-9]", r"[0-9]+°"] + BASE_TOKENIZER_PREFIXES _infixes = ( @@ -31,7 +24,7 @@ _infixes = ( r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER), r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA), - r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION) + r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION), ] ) diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 70dfe92bd..70519ba6a 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -10,7 +10,7 @@ _exc = { "l'art.": [{ORTH: "l'"}, {ORTH: "art."}], "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}], "po'": [{ORTH: "po'", LEMMA: "poco"}], - "sett..": [{ORTH: "sett."}, {ORTH: "."}] + "sett..": [{ORTH: "sett."}, {ORTH: "."}], } for orth in [ @@ -32,7 +32,7 @@ for orth in [ "col.", "Cost.", "d.C.", - 'de"' + 'de"', "distr.", "E'", "ecc.", diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py index 7ab34adf1..ffd53370d 100644 --- a/spacy/lang/lij/stop_words.py +++ b/spacy/lang/lij/stop_words.py @@ -8,7 +8,7 @@ a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri a bella belle belli bello ben -ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse +ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py index 2aa6f8304..2109add62 100644 --- 
a/spacy/lang/lij/tokenizer_exceptions.py +++ b/spacy/lang/lij/tokenizer_exceptions.py @@ -44,7 +44,7 @@ for prep, prep_lemma in [ ("s’", "se"), ]: for prefix_orth in [prefix, prefix.capitalize()]: - _exc[prefix_orth+prep] = [ + _exc[prefix_orth + prep] = [ {ORTH: prefix_orth, LEMMA: prefix_lemma}, {ORTH: prep, LEMMA: prep_lemma}, ] diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index 1dfe932ee..ce2c8d6a4 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -29,7 +29,9 @@ class LithuanianDefaults(Language.Defaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - mod_base_exceptions = {exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")} + mod_base_exceptions = { + exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") + } del mod_base_exceptions["8)"] tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py index f8e11156d..4287b26dd 100644 --- a/spacy/lang/lt/tokenizer_exceptions.py +++ b/spacy/lang/lt/tokenizer_exceptions.py @@ -8,262 +8,262 @@ _exc = {} for orth in [ "n-tosios", "?!", -# "G.", -# "J. E.", -# "J. Em.", -# "J.E.", -# "J.Em.", -# "K.", -# "N.", -# "V.", -# "Vt.", -# "a.", -# "a.k.", -# "a.s.", -# "adv.", -# "akad.", -# "aklg.", -# "akt.", -# "al.", -# "ang.", -# "angl.", -# "aps.", -# "apskr.", -# "apyg.", -# "arbat.", -# "asist.", -# "asm.", -# "asm.k.", -# "asmv.", -# "atk.", -# "atsak.", -# "atsisk.", -# "atsisk.sąsk.", -# "atv.", -# "aut.", -# "avd.", -# "b.k.", -# "baud.", -# "biol.", -# "bkl.", -# "bot.", -# "bt.", -# "buv.", -# "ch.", -# "chem.", -# "corp.", -# "d.", -# "dab.", -# "dail.", -# "dek.", -# "deš.", -# "dir.", -# "dirig.", -# "doc.", -# "dol.", -# "dr.", -# "drp.", -# "dvit.", -# "dėst.", -# "dš.", -# "dž.", -# "e.b.", -# "e.bankas", -# "e.p.", -# "e.parašas", -# "e.paštas", -# "e.v.", -# "e.valdžia", -# "egz.", -# "eil.", -# "ekon.", -# "el.", -# "el.bankas", -# "el.p.", -# "el.parašas", -# "el.paštas", -# "el.valdžia", -# "etc.", -# "ež.", -# "fak.", -# "faks.", -# "feat.", -# "filol.", -# "filos.", -# "g.", -# "gen.", -# "geol.", -# "gerb.", -# "gim.", -# "gr.", -# "gv.", -# "gyd.", -# "gyv.", -# "habil.", -# "inc.", -# "insp.", -# "inž.", -# "ir pan.", -# "ir t. t.", -# "isp.", -# "istor.", -# "it.", -# "just.", -# "k.", -# "k. a.", -# "k.a.", -# "kab.", -# "kand.", -# "kart.", -# "kat.", -# "ketv.", -# "kh.", -# "kl.", -# "kln.", -# "km.", -# "kn.", -# "koresp.", -# "kpt.", -# "kr.", -# "kt.", -# "kub.", -# "kun.", -# "kv.", -# "kyš.", -# "l. e. 
p.", -# "l.e.p.", -# "lenk.", -# "liet.", -# "lot.", -# "lt.", -# "ltd.", -# "ltn.", -# "m.", -# "m.e..", -# "m.m.", -# "mat.", -# "med.", -# "mgnt.", -# "mgr.", -# "min.", -# "mjr.", -# "ml.", -# "mln.", -# "mlrd.", -# "mob.", -# "mok.", -# "moksl.", -# "mokyt.", -# "mot.", -# "mr.", -# "mst.", -# "mstl.", -# "mėn.", -# "nkt.", -# "no.", -# "nr.", -# "ntk.", -# "nuotr.", -# "op.", -# "org.", -# "orig.", -# "p.", -# "p.d.", -# "p.m.e.", -# "p.s.", -# "pab.", -# "pan.", -# "past.", -# "pav.", -# "pavad.", -# "per.", -# "perd.", -# "pirm.", -# "pl.", -# "plg.", -# "plk.", -# "pr.", -# "pr.Kr.", -# "pranc.", -# "proc.", -# "prof.", -# "prom.", -# "prot.", -# "psl.", -# "pss.", -# "pvz.", -# "pšt.", -# "r.", -# "raj.", -# "red.", -# "rez.", -# "rež.", -# "rus.", -# "rš.", -# "s.", -# "sav.", -# "saviv.", -# "sek.", -# "sekr.", -# "sen.", -# "sh.", -# "sk.", -# "skg.", -# "skv.", -# "skyr.", -# "sp.", -# "spec.", -# "sr.", -# "st.", -# "str.", -# "stud.", -# "sąs.", -# "t.", -# "t. p.", -# "t. y.", -# "t.p.", -# "t.t.", -# "t.y.", -# "techn.", -# "tel.", -# "teol.", -# "th.", -# "tir.", -# "trit.", -# "trln.", -# "tšk.", -# "tūks.", -# "tūkst.", -# "up.", -# "upl.", -# "v.s.", -# "vad.", -# "val.", -# "valg.", -# "ved.", -# "vert.", -# "vet.", -# "vid.", -# "virš.", -# "vlsč.", -# "vnt.", -# "vok.", -# "vs.", -# "vtv.", -# "vv.", -# "vyr.", -# "vyresn.", -# "zool.", -# "Įn", -# "įl.", -# "š.m.", -# "šnek.", -# "šv.", -# "švč.", -# "ž.ū.", -# "žin.", -# "žml.", -# "žr.", + # "G.", + # "J. E.", + # "J. Em.", + # "J.E.", + # "J.Em.", + # "K.", + # "N.", + # "V.", + # "Vt.", + # "a.", + # "a.k.", + # "a.s.", + # "adv.", + # "akad.", + # "aklg.", + # "akt.", + # "al.", + # "ang.", + # "angl.", + # "aps.", + # "apskr.", + # "apyg.", + # "arbat.", + # "asist.", + # "asm.", + # "asm.k.", + # "asmv.", + # "atk.", + # "atsak.", + # "atsisk.", + # "atsisk.sąsk.", + # "atv.", + # "aut.", + # "avd.", + # "b.k.", + # "baud.", + # "biol.", + # "bkl.", + # "bot.", + # "bt.", + # "buv.", + # "ch.", + # "chem.", + # "corp.", + # "d.", + # "dab.", + # "dail.", + # "dek.", + # "deš.", + # "dir.", + # "dirig.", + # "doc.", + # "dol.", + # "dr.", + # "drp.", + # "dvit.", + # "dėst.", + # "dš.", + # "dž.", + # "e.b.", + # "e.bankas", + # "e.p.", + # "e.parašas", + # "e.paštas", + # "e.v.", + # "e.valdžia", + # "egz.", + # "eil.", + # "ekon.", + # "el.", + # "el.bankas", + # "el.p.", + # "el.parašas", + # "el.paštas", + # "el.valdžia", + # "etc.", + # "ež.", + # "fak.", + # "faks.", + # "feat.", + # "filol.", + # "filos.", + # "g.", + # "gen.", + # "geol.", + # "gerb.", + # "gim.", + # "gr.", + # "gv.", + # "gyd.", + # "gyv.", + # "habil.", + # "inc.", + # "insp.", + # "inž.", + # "ir pan.", + # "ir t. t.", + # "isp.", + # "istor.", + # "it.", + # "just.", + # "k.", + # "k. a.", + # "k.a.", + # "kab.", + # "kand.", + # "kart.", + # "kat.", + # "ketv.", + # "kh.", + # "kl.", + # "kln.", + # "km.", + # "kn.", + # "koresp.", + # "kpt.", + # "kr.", + # "kt.", + # "kub.", + # "kun.", + # "kv.", + # "kyš.", + # "l. e. 
p.", + # "l.e.p.", + # "lenk.", + # "liet.", + # "lot.", + # "lt.", + # "ltd.", + # "ltn.", + # "m.", + # "m.e..", + # "m.m.", + # "mat.", + # "med.", + # "mgnt.", + # "mgr.", + # "min.", + # "mjr.", + # "ml.", + # "mln.", + # "mlrd.", + # "mob.", + # "mok.", + # "moksl.", + # "mokyt.", + # "mot.", + # "mr.", + # "mst.", + # "mstl.", + # "mėn.", + # "nkt.", + # "no.", + # "nr.", + # "ntk.", + # "nuotr.", + # "op.", + # "org.", + # "orig.", + # "p.", + # "p.d.", + # "p.m.e.", + # "p.s.", + # "pab.", + # "pan.", + # "past.", + # "pav.", + # "pavad.", + # "per.", + # "perd.", + # "pirm.", + # "pl.", + # "plg.", + # "plk.", + # "pr.", + # "pr.Kr.", + # "pranc.", + # "proc.", + # "prof.", + # "prom.", + # "prot.", + # "psl.", + # "pss.", + # "pvz.", + # "pšt.", + # "r.", + # "raj.", + # "red.", + # "rez.", + # "rež.", + # "rus.", + # "rš.", + # "s.", + # "sav.", + # "saviv.", + # "sek.", + # "sekr.", + # "sen.", + # "sh.", + # "sk.", + # "skg.", + # "skv.", + # "skyr.", + # "sp.", + # "spec.", + # "sr.", + # "st.", + # "str.", + # "stud.", + # "sąs.", + # "t.", + # "t. p.", + # "t. y.", + # "t.p.", + # "t.t.", + # "t.y.", + # "techn.", + # "tel.", + # "teol.", + # "th.", + # "tir.", + # "trit.", + # "trln.", + # "tšk.", + # "tūks.", + # "tūkst.", + # "up.", + # "upl.", + # "v.s.", + # "vad.", + # "val.", + # "valg.", + # "ved.", + # "vert.", + # "vet.", + # "vid.", + # "virš.", + # "vlsč.", + # "vnt.", + # "vok.", + # "vs.", + # "vtv.", + # "vv.", + # "vyr.", + # "vyresn.", + # "zool.", + # "Įn", + # "įl.", + # "š.m.", + # "šnek.", + # "šv.", + # "švč.", + # "ž.ū.", + # "žin.", + # "žml.", + # "žr.", ]: _exc[orth] = [{ORTH: orth}] diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index 7672809ec..4c10b5a68 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -24,7 +24,6 @@ _prefixes = ( ) - _infixes = ( LIST_ELLIPSES + _list_icons diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py index c36af6771..981c0624b 100644 --- a/spacy/lang/pt/tokenizer_exceptions.py +++ b/spacy/lang/pt/tokenizer_exceptions.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import ORTH, NORM +from ...symbols import ORTH _exc = {} diff --git a/spacy/lang/sk/tag_map.py b/spacy/lang/sk/tag_map.py index 015c8cba3..28b36d3c1 100644 --- a/spacy/lang/sk/tag_map.py +++ b/spacy/lang/sk/tag_map.py @@ -1,1467 +1,1467 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import POS, AUX, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON +from ...symbols import POS, AUX, ADJ, CCONJ, NUM, ADV, ADP, X, VERB +from ...symbols import NOUN, PART, INTJ, PRON # Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html # fmt: off TAG_MAP = { - "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3y": {POS: ADJ, "morph": 
"Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3x": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAns1x": {POS: ADJ, "morph": 
"Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip4x": {POS: ADJ, 
"morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns3x": {POS: ADJ, "morph": 
"Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AUfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis3x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns2x": {POS: ADJ, "morph": 
"Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "Dx": {POS: ADV, "morph": "Degree=Pos"}, - "Dy": {POS: ADV, "morph": "Degree=Cmp"}, - "Dz": {POS: ADV, "morph": "Degree=Sup"}, - "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"}, - "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"}, - "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"}, - "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"}, - "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"}, - "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"}, - "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"}, - "Ev3": {POS: ADP, "morph": "AdpType=Voc|Case=Dat"}, - "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"}, - "Ev6": {POS: ADP, "morph": "AdpType=Voc|Case=Loc"}, - "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"}, - "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs1y": 
{POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkmp1x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7z": {POS: VERB, "morph": 
"Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5x": {POS: 
VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5x": {POS: 
VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - 
"Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4x": {POS: VERB, "morph": 
"Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "J": {POS: INTJ, "morph": "_"}, - "NAfp1": {POS: NUM, "morph": 
"Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms6": {POS: NUM, 
"morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "ND": {POS: NUM, "morph": "MorphPos=Adv"}, - "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis5": {POS: NUM, "morph": 
"Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NNfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip4": 
{POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NSfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis2": {POS: NUM, "morph": 
"Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp6": {POS: NUM, "morph": 
"Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, - "O": {POS: CCONJ, "morph": "_"}, - "OY": {POS: CCONJ, "morph": "Mood=Cnd"}, - "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis7": {POS: PRON, "morph": 
"Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PD": {POS: PRON, "morph": "MorphPos=Adv|PronType=Prs"}, - "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp7": 
{POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms2g": {POS: PRON, "morph": 
"AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PSfp1": {POS: PRON, "morph": 
"Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip3": {POS: PRON, "morph": 
"Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns2": {POS: 
PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "Q": {POS: X, "morph": "Hyph=Yes"}, - "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"}, - "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp5": {POS: NOUN, "morph": 
"Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp5": {POS: NOUN, 
"morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSnp1": {POS: NOUN, "morph": 
"Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis7": {POS: NOUN, "morph": 
"Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, - "T": {POS: PART, "morph": "_"}, - "TY": {POS: PART, "morph": "Mood=Cnd"}, - "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesb-": {POS: VERB, 
"morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"}, - "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"}, - "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"}, - "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"}, - "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"}, - "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"}, - "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"}, - "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"}, - "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"}, - "VIe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Inf"}, - "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"}, - "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"}, - "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdpc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsa-": {POS: VERB, "morph": 
"Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsc+": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpah+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbn+": {POS: VERB, "morph": 
"Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepbh-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesaf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesan+": {POS: VERB, "morph": 
"Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcn+": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMdpb-": {POS: VERB, "morph": 
"Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "W": {POS: X, "morph": "Abbr=Yes"}, - "Y": {POS: AUX, "morph": "Mood=Cnd"}, + "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7y": {POS: ADJ, "morph": 
"Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5y": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2y": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6y": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4z": {POS: ADJ, "morph": 
"Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis6x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AUfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp5x": {POS: ADJ, "morph": 
"Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp4x": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "Dx": {POS: ADV, "morph": "Degree=Pos"}, + "Dy": {POS: ADV, "morph": "Degree=Cmp"}, + "Dz": {POS: ADV, "morph": "Degree=Sup"}, + "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"}, + "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"}, + "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"}, + "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"}, + "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"}, + "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"}, + "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"}, + "Ev3": {POS: ADP, 
"morph": "AdpType=Voc|Case=Dat"}, + "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"}, + "Ev6": {POS: ADP, "morph": "AdpType=Voc|Case=Loc"}, + "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"}, + "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4z": {POS: VERB, 
"morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkis1x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4x": {POS: VERB, "morph": 
"Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs1y": {POS: 
VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtmp1x": {POS: 
VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + 
"Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + 
"Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "J": {POS: INTJ, "morph": "_"}, + "NAfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAip1": {POS: NUM, "morph": 
"Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns5": {POS: NUM, "morph": 
"Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "ND": {POS: NUM, "morph": "MorphPos=Adv"}, + "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms3": 
{POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NNfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"}, + 
"NNnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NSfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUip1": {POS: NUM, "morph": 
"Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, + "O": {POS: CCONJ, "morph": "_"}, + "OY": {POS: CCONJ, "morph": "Mood=Cnd"}, + "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp5": {POS: PRON, 
"morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms4": {POS: PRON, "morph": 
"Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PD": {POS: PRON, "morph": "MorphPos=Adv|PronType=Prs"}, + "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip5": {POS: 
PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, 
+ "PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PSfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs7": {POS: PRON, "morph": 
"Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis7": {POS: PRON, "morph": 
"Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "Q": {POS: X, "morph": "Hyph=Yes"}, + "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"}, + "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp7": {POS: NOUN, "morph": 
"Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp4": {POS: NOUN, "morph": 
"Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip5": {POS: 
NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp2": {POS: NOUN, 
"morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms6": {POS: NOUN, "morph": 
"Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, + "T": {POS: PART, "morph": "_"}, + "TY": {POS: PART, "morph": "Mood=Cnd"}, + "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsa-": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"}, + "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"}, + "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"}, + "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"}, + "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"}, + "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"}, + "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"}, + "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"}, + "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"}, + "VIe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Inf"}, + "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"}, + "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"}, + "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdpc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKepc-": {POS: VERB, "morph": 
"Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpah+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcn-": 
{POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepbh-": {POS: VERB, "morph": 
"Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesaf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesan+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesci-": {POS: VERB, "morph": 
"Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbi-": 
{POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMjsb-": {POS: VERB, 
"morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "W": {POS: X, "morph": "Abbr=Yes"}, + "Y": {POS: AUX, "morph": "Mood=Cnd"}, } diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 2c0fc9cf7..385afb8bd 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -37,7 +37,7 @@ URL_PATTERN = ( r"|" # host & domain names # mods: match is case-sensitive, so include [A-Z] - "(?:" + "(?:" # noqa "(?:" "[A-Za-z0-9\u00a1-\uffff]" "[A-Za-z0-9\u00a1-\uffff_-]{0,62}" diff --git a/spacy/language.py b/spacy/language.py index f0928b1f9..56619080d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -612,7 +612,7 @@ class Language(object): link_vectors_to_models(self.vocab) if self.vocab.vectors.data.shape[1]: cfg["pretrained_vectors"] = self.vocab.vectors.name - cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] + cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1] if sgd is None: sgd = create_default_optimizer(Model.ops) self._optimizer = sgd @@ -857,7 +857,14 @@ class Language(object): procs = [ mp.Process( target=_apply_pipes, - args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS), + args=( + self.make_doc, + pipes, + rch, + sch, + Underscore.get_state(), + load_nlp.VECTORS, + ), ) for rch, sch in zip(texts_q, bytedocs_send_ch) ] diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index c3ef429e9..1786dda87 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -222,11 +222,9 @@ class EntityRuler(object): for label, pattern, ent_id in zip( phrase_pattern_labels, self.nlp.pipe(phrase_pattern_texts), - phrase_pattern_ids + phrase_pattern_ids, ): - phrase_pattern = { - "label": label, "pattern": pattern, "id": ent_id - } + phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id} if ent_id: phrase_pattern["id"] = ent_id phrase_patterns.append(phrase_pattern) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 1c0c79f6e..09a6f9c4b 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -71,9 +71,7 @@ def test_doc_array_to_from_string_attrs(en_vocab, attrs): def test_doc_array_idx(en_vocab): """Test that Doc.to_array can retrieve token start indices""" words = ["An", "example", "sentence"] - doc = Doc(en_vocab, words=words) offsets = Doc(en_vocab, words=words).to_array("IDX") - assert offsets[0] == 0 assert offsets[1] == 3 assert offsets[2] == 11 diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index f98030621..837ceb323 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -59,7 +59,7 @@ def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm): ("Sprogteknologi a/s", 2), ("De boede i A/B Bellevue", 5), # note: skipping due to weirdness in UD_Danish-DDT - #("Rotorhastigheden er 3400 o/m.", 5), + # ("Rotorhastigheden er 3400 o/m.", 5), ("Jeg købte billet t/r.", 5), ("Murerarbejdsmand m/k søges", 3), ("Netværket kører over TCP/IP", 4), diff --git a/spacy/tests/lang/eu/test_text.py b/spacy/tests/lang/eu/test_text.py index e73917ffa..f448a7859 100644 --- a/spacy/tests/lang/eu/test_text.py +++ b/spacy/tests/lang/eu/test_text.py @@ -10,7 +10,13 @@ def test_eu_tokenizer_handles_long_text(eu_tokenizer): assert len(tokens) == 5 
-@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)]) +@pytest.mark.parametrize( + "text,length", + [ + ("milesker ederra joan zen hitzaldia plazer hutsa", 7), + ("astelehen guztia sofan pasau biot", 5), + ], +) def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length): tokens = eu_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py index 2fceece49..1ac6bfc76 100644 --- a/spacy/tests/lang/hu/test_tokenizer.py +++ b/spacy/tests/lang/hu/test_tokenizer.py @@ -297,12 +297,7 @@ WIKI_TESTS = [ ] EXTRA_TESTS = ( - DOT_TESTS - + QUOTE_TESTS - + NUMBER_TESTS - + HYPHEN_TESTS - + WIKI_TESTS - + TYPO_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS ) # normal: default tests + 10% of extra tests @@ -311,7 +306,14 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0]) # slow: remaining 90% of extra tests SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0] -TESTS.extend([pytest.param(x[0], x[1], marks=pytest.mark.slow()) if not isinstance(x[0], tuple) else x for x in SLOW_TESTS]) +TESTS.extend( + [ + pytest.param(x[0], x[1], marks=pytest.mark.slow()) + if not isinstance(x[0], tuple) + else x + for x in SLOW_TESTS + ] +) @pytest.mark.parametrize("text,expected_tokens", TESTS) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index a826a0a0e..c0314f3c3 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,7 +6,8 @@ import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token -from ..doc.test_underscore import clean_underscore + +from ..doc.test_underscore import clean_underscore # noqa: F401 @pytest.fixture diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 3b46baa9b..b6e3c40c9 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -152,10 +152,5 @@ def test_entity_ruler_validate(nlp): def test_entity_ruler_properties(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - assert sorted(ruler.labels) == sorted([ - "HELLO", - "BYE", - "COMPLEX", - "TECH_ORG" - ]) + assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"]) assert sorted(ruler.ent_ids) == ["a1", "a2"] diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py index f80f19852..57675a202 100644 --- a/spacy/tests/regression/test_issue4725.py +++ b/spacy/tests/regression/test_issue4725.py @@ -23,4 +23,3 @@ def test_issue4725(): docs = ["Kurt is in London."] * 10 for _ in nlp.pipe(docs, batch_size=2, n_process=2): pass - diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py index 834219773..5c7ffc999 100644 --- a/spacy/tests/regression/test_issue4849.py +++ b/spacy/tests/regression/test_issue4849.py @@ -9,11 +9,12 @@ def test_issue4849(): nlp = English() ruler = EntityRuler( - nlp, patterns=[ - {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'}, - {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'}, + nlp, + patterns=[ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, ], - phrase_matcher_attr="LOWER" 
+ phrase_matcher_attr="LOWER", ) nlp.add_pipe(ruler) @@ -27,10 +28,10 @@ def test_issue4849(): count_ents = 0 for doc in nlp.pipe([text], n_process=1): count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert(count_ents == 2) + assert count_ents == 2 # USING 2 PROCESSES count_ents = 0 for doc in nlp.pipe([text], n_process=2): count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert (count_ents == 2) + assert count_ents == 2 diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 0e0816a55..cbe119225 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -22,7 +22,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): tokenizer_bytes = tokenizer.to_bytes() Tokenizer(en_vocab).from_bytes(tokenizer_bytes) - tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC", "ORTH": "."}]}) + tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}) tokenizer.rules = {} tokenizer_bytes = tokenizer.to_bytes() tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index a0d6273a9..4e1c50398 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -28,7 +28,9 @@ def make_tempdir(): shutil.rmtree(path2str(d)) -def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None): +def get_doc( + vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None +): """Create Doc object from given vocab, words and annotations.""" if deps and not heads: heads = [0] * len(deps) @@ -60,7 +62,7 @@ def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=No if attrs.ndim == 1: attrs[i] = heads[i] else: - attrs[i,j] = heads[i] + attrs[i, j] = heads[i] else: for i in range(len(words)): if attrs.ndim == 1: From d88a377bed122018dd54b4228f48b73bee6881b1 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 26 Mar 2020 10:45:47 +0100 Subject: [PATCH 013/131] Remove Vectors.from_glove (#5209) --- spacy/vectors.pyx | 38 ------------------------ website/docs/api/vectors.md | 19 ------------ website/docs/usage/vectors-similarity.md | 31 ------------------- 3 files changed, 88 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index c6526b89d..f8643640a 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -355,44 +355,6 @@ cdef class Vectors: for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) - def from_glove(self, path): - """Load GloVe vectors from a directory. Assumes binary format, - that the vocab is in a vocab.txt, and that vectors are named - vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32 - vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc. - By default GloVe outputs 64-bit vectors. - - path (unicode / Path): The path to load the GloVe vectors from. - RETURNS: A `StringStore` object, holding the key-to-string mapping. 
- - DOCS: https://spacy.io/api/vectors#from_glove - """ - path = util.ensure_path(path) - width = None - for name in path.iterdir(): - if name.parts[-1].startswith("vectors"): - _, dims, dtype, _2 = name.parts[-1].split('.') - width = int(dims) - break - else: - raise IOError(Errors.E061.format(filename=path)) - bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype) - xp = get_array_module(self.data) - self.data = None - with bin_loc.open("rb") as file_: - self.data = xp.fromfile(file_, dtype=dtype) - if dtype != "float32": - self.data = xp.ascontiguousarray(self.data, dtype="float32") - if self.data.ndim == 1: - self.data = self.data.reshape((self.data.size//width, width)) - n = 0 - strings = StringStore() - with (path / "vocab.txt").open("r") as file_: - for i, line in enumerate(file_): - key = strings.add(line.strip()) - self.add(key, row=i) - return strings - def to_disk(self, path, **kwargs): """Save the current state to a directory. diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 3588672db..93e747c1e 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -326,25 +326,6 @@ performed in chunks, to avoid consuming too much memory. You can set the | `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. | | **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. | -## Vectors.from_glove {#from_glove tag="method"} - -Load [GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory. -Assumes binary format, that the vocab is in a `vocab.txt`, and that vectors are -named `vectors.{size}.[fd.bin]`, e.g. `vectors.128.f.bin` for 128d float32 -vectors, `vectors.300.d.bin` for 300d float64 (double) vectors, etc. By default -GloVe outputs 64-bit vectors. - -> #### Example -> -> ```python -> vectors = Vectors() -> vectors.from_glove("/path/to/glove_vectors") -> ``` - -| Name | Type | Description | -| ------ | ---------------- | ---------------------------------------- | -| `path` | unicode / `Path` | The path to load the GloVe vectors from. | - ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory. diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-similarity.md index 93ba67704..9b65bb80a 100644 --- a/website/docs/usage/vectors-similarity.md +++ b/website/docs/usage/vectors-similarity.md @@ -177,37 +177,6 @@ for word, vector in vector_data.items(): vocab.set_vector(word, vector) ``` -### Loading GloVe vectors {#custom-loading-glove new="2"} - -spaCy comes with built-in support for loading -[GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory. The -[`Vectors.from_glove`](/api/vectors#from_glove) method assumes a binary format, -the vocab provided in a `vocab.txt`, and the naming scheme of -`vectors.{size}.[fd`.bin]. For example: - -```yaml -### Directory structure -└── vectors - ├── vectors.128.f.bin # vectors file - └── vocab.txt # vocabulary -``` - -| File name | Dimensions | Data type | -| ------------------- | ---------- | ---------------- | -| `vectors.128.f.bin` | 128 | float32 | -| `vectors.300.d.bin` | 300 | float64 (double) | - -```python -nlp = spacy.load("en_core_web_sm") -nlp.vocab.vectors.from_glove("/path/to/vectors") -``` - -If your instance of `Language` already contains vectors, they will be -overwritten. 
To create your own GloVe vectors model package like spaCy's -[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call -[`nlp.to_disk`](/api/language#to_disk), and then package the model using the -[`package`](/api/cli#package) command. - ### Using custom similarity methods {#custom-similarity} By default, [`Token.vector`](/api/token#vector) returns the vector for its From a04f8020993568e5677cdbce96e93c82cf6e012f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 26 Mar 2020 10:46:23 +0100 Subject: [PATCH 014/131] Fix GoldParse init when token count differs (#5191) Fix the `GoldParse` initialization when the number of tokens has changed (due to merging subtokens with the parser). --- spacy/scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 7b05b11fd..25c660240 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -231,7 +231,7 @@ class Scorer(object): """ if len(doc) != len(gold): gold = GoldParse.from_annot_tuples( - doc, tuple(zip(*gold.orig_annot)) + (gold.cats,) + doc, zip(*gold.orig_annot), cats=gold.cats, ) gold_deps = set() gold_deps_per_dep = {} From 8d3563f1c463852758a8fb323e8ddc7aa73b81bc Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 26 Mar 2020 10:46:50 +0100 Subject: [PATCH 015/131] Minor bugfixes for train CLI (#5186) * Omit per_type scores from model-best calculations The addition of per_type scores to the included metrics (#4911) causes errors when they're compared while determining the best model, so omit them for this `max()` comparison. * Add default speed data for interrupted train CLI Add better speed meta defaults so that an interrupted iteration still produces a best model. Co-authored-by: Ines Montani --- spacy/cli/train.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6408a6024..c94c26b62 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -566,6 +566,9 @@ def train( final_meta.setdefault("speed", {}) final_meta["speed"].setdefault("cpu", None) final_meta["speed"].setdefault("gpu", None) + meta.setdefault("speed", {}) + meta["speed"].setdefault("cpu", None) + meta["speed"].setdefault("gpu", None) # combine cpu and gpu speeds with the base model speeds if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: speed = _get_total_speed( @@ -673,6 +676,8 @@ def _find_best(experiment_dir, component): if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": accs = srsly.read_json(epoch_model / "accuracy.json") scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)] + # remove per_type dicts from score list for max() comparison + scores = [score for score in scores if isinstance(score, float)] accuracies.append((scores, epoch_model)) if accuracies: return max(accuracies)[1] From e53232533b788bb303108f07443b37529051ef14 Mon Sep 17 00:00:00 2001 From: Tiljander <35637838+Tiljander@users.noreply.github.com> Date: Thu, 26 Mar 2020 13:13:22 +0100 Subject: [PATCH 016/131] Describing priority rules for overlapping matches (#5197) * Describing priority rules for overlapping matches * Create Tiljander.md * Describing priority rules for overlapping matches * Update website/docs/api/entityruler.md Co-Authored-By: Ines Montani Co-authored-by: Ines Montani --- .github/contributors/Tiljander.md | 106 ++++++++++++++++++++++ website/docs/api/entityruler.md | 3 +- website/docs/usage/rule-based-matching.md | 5 +- 3 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 
.github/contributors/Tiljander.md diff --git a/.github/contributors/Tiljander.md b/.github/contributors/Tiljander.md new file mode 100644 index 000000000..89e70efa5 --- /dev/null +++ b/.github/contributors/Tiljander.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+   * Each contribution that you submit is and shall be an original work of
+   authorship and you can legally grant the rights set out in this SCA;
+
+   * to the best of your knowledge, each contribution will not violate any
+   third party's copyrights, trademarks, patents, or other intellectual
+   property rights; and
+
+   * each contribution shall be in compliance with U.S. export control laws and
+   other applicable export and import laws. You agree to notify us if you
+   become aware of any circumstance which would make any of the foregoing
+   representations inaccurate in any respect. We may publicly disclose your
+   participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+   * [x] I am signing on behalf of myself as an individual and no other person
+   or entity, including my employer, has or will have rights with respect to my
+   contributions.
+
+   * [ ] I am signing on behalf of my employer or a legal entity and I have the
+   actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Henrik Tiljander     |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 24/3/2020            |
+| GitHub username                | Tiljander            |
+| Website (optional)             |                      |
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index af3db0dcb..0fd24897d 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -83,7 +83,8 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
 happens automatically after the component has been added to the pipeline using
 [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
 with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches.
+with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer
+patterns over shorter, and if equal, the match occurring first in the Doc is chosen.
 
 > #### Example
 >
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 0ab74034e..1db2405d1 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -968,7 +968,10 @@ pattern. The entity ruler accepts two types of patterns:
 The [`EntityRuler`](/api/entityruler) is a pipeline component that's typically
 added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is
 called on a text, it will find matches in the `doc` and add them as entities to
-the `doc.ents`, using the specified pattern label as the entity label.
+the `doc.ents`, using the specified pattern label as the entity label. If any
+matches were to overlap, the pattern matching most tokens takes priority. If
+they also happen to be equally long, then the match occurring first in the Doc is
+chosen.
```python ### {executable="true"} From d1ddfa1cb736f4a52d8073e99289d009bf7d5ad9 Mon Sep 17 00:00:00 2001 From: Nikhil Saldanha Date: Sat, 28 Mar 2020 18:13:02 +0100 Subject: [PATCH 017/131] update docs for EntityRecognizer.predict return type was wrongly written as a tuple, changed to syntax.StateClass --- website/docs/api/entityrecognizer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 9a2766c07..9345ee249 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -105,7 +105,7 @@ Apply the pipeline's model to a batch of docs, without modifying them. | Name | Type | Description | | ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `docs` | iterable | The documents to predict. | -| **RETURNS** | tuple | A `(scores, tensors)` tuple where `scores` is the model's prediction for each document and `tensors` is the token representations used to predict the scores. Each tensor is an array with one row for each token in the document. | +| **RETURNS** | list | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). | ## EntityRecognizer.set_annotations {#set_annotations tag="method"} From be6d10517fd7059765c73ec30b5dc96382fbd786 Mon Sep 17 00:00:00 2001 From: Nikhil Saldanha Date: Sat, 28 Mar 2020 18:36:55 +0100 Subject: [PATCH 018/131] sign contributor agreement --- .github/contributors/nikhilsaldanha.md | 106 +++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/nikhilsaldanha.md diff --git a/.github/contributors/nikhilsaldanha.md b/.github/contributors/nikhilsaldanha.md new file mode 100644 index 000000000..76b60beb6 --- /dev/null +++ b/.github/contributors/nikhilsaldanha.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Nikhil Saldanha | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020/03/28 | +| GitHub username | nikhilsaldanha | +| Website (optional) | | From 963bd890c1d3aa874b6da194c9b5316cffbce341 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 29 Mar 2020 13:51:20 +0200 Subject: [PATCH 019/131] Modify Vector.resize to work with cupy and improve resizing (#5216) * Modify Vector.resize to work with cupy Modify `Vectors.resize` to work with cupy. Modify behavior when resizing to a different vector dimension so that individual vectors are truncated or extended with zeros instead of having the original values filled into the new shape without regard for the original axes. * Update spacy/tests/vocab_vectors/test_vectors.py Co-Authored-By: Matthew Honnibal Co-authored-by: Matthew Honnibal --- spacy/errors.py | 1 + spacy/tests/vocab_vectors/test_vectors.py | 25 ++++++++++++++++------- spacy/vectors.pyx | 12 ++++++++--- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c751ad65a..b124fc88c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -551,6 +551,7 @@ class Errors(object): "array.") E191 = ("Invalid head: the head token must be from the same doc as the " "token itself.") + E192 = ("Unable to resize vectors in place with cupy.") @add_codes diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index b688ab9dd..8987b7c89 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -89,17 +89,28 @@ def test_init_vectors_with_resize_data(data, resize_data): assert v.shape != data.shape -def test_get_vector_resize(strings, data, resize_data): - v = Vectors(data=data) - v.resize(shape=resize_data.shape) +def test_get_vector_resize(strings, data): strings = [hash_string(s) for s in strings] + + # decrease vector dimension (truncate) + v = Vectors(data=data) + resized_dim = v.shape[1] - 1 + v.resize(shape=(v.shape[0], resized_dim)) for i, string in enumerate(strings): v.add(string, row=i) - assert list(v[strings[0]]) == list(resize_data[0]) - assert list(v[strings[0]]) != list(resize_data[1]) - assert list(v[strings[1]]) != list(resize_data[0]) - assert list(v[strings[1]]) == list(resize_data[1]) + assert list(v[strings[0]]) == list(data[0, :resized_dim]) + assert list(v[strings[1]]) == list(data[1, :resized_dim]) + + # increase vector dimension (pad with zeros) + v = Vectors(data=data) + resized_dim = v.shape[1] + 1 + v.resize(shape=(v.shape[0], resized_dim)) + for i, string in enumerate(strings): + v.add(string, row=i) + + assert list(v[strings[0]]) == list(data[0]) + [0] + assert list(v[strings[1]]) == list(data[1]) + [0] def test_init_vectors_with_data(strings, data): diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index f8643640a..5b8512970 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -198,11 +198,17 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#resize """ + xp = get_array_module(self.data) if inplace: - self.data.resize(shape, refcheck=False) + if xp == numpy: + self.data.resize(shape, refcheck=False) + else: + raise ValueError(Errors.E192) else: - xp = get_array_module(self.data) - self.data = xp.resize(self.data, shape) + resized_array = xp.zeros(shape, dtype=self.data.dtype) + copy_shape = (min(shape[0], self.data.shape[0]), min(shape[1], self.data.shape[1])) + 
resized_array[:copy_shape[0], :copy_shape[1]] = self.data[:copy_shape[0], :copy_shape[1]] + self.data = resized_array filled = {row for row in self.key2row.values()} self._unset = cppset[int]({row for row in range(shape[0]) if row not in filled}) removed_items = [] From e9049581159849bd4a710b9196cb0b78d5cf9dac Mon Sep 17 00:00:00 2001 From: Tom Milligan Date: Sun, 29 Mar 2020 12:52:08 +0100 Subject: [PATCH 020/131] Limit to cupy-cuda v8, so as not to pull in v9 automatically. (#5194) --- .github/contributors/tommilligan.md | 106 ++++++++++++++++++++++++++++ setup.cfg | 12 ++-- 2 files changed, 112 insertions(+), 6 deletions(-) create mode 100644 .github/contributors/tommilligan.md diff --git a/.github/contributors/tommilligan.md b/.github/contributors/tommilligan.md new file mode 100644 index 000000000..475df5afa --- /dev/null +++ b/.github/contributors/tommilligan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, + object code, patch, tool, sample, graphic, specification, manual, + documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and + registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment + to any third party, you hereby grant to us a perpetual, irrevocable, + non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your + contribution. The rights that you grant to us under these terms are effective + on the date you first submitted a contribution to us, even if your submission + took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + - Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + - to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + - each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable + U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT + mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +| ----------------------------- | ------------ | +| Name | Tom Milligan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-24 | +| GitHub username | tommilligan | +| Website (optional) | | diff --git a/setup.cfg b/setup.cfg index e44e32bb2..465367ff6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,17 +61,17 @@ install_requires = lookups = spacy_lookups_data>=0.0.5,<0.2.0 cuda = - cupy>=5.0.0b4 + cupy>=5.0.0b4,<9.0.0 cuda80 = - cupy-cuda80>=5.0.0b4 + cupy-cuda80>=5.0.0b4,<9.0.0 cuda90 = - cupy-cuda90>=5.0.0b4 + cupy-cuda90>=5.0.0b4,<9.0.0 cuda91 = - cupy-cuda91>=5.0.0b4 + cupy-cuda91>=5.0.0b4,<9.0.0 cuda92 = - cupy-cuda92>=5.0.0b4 + cupy-cuda92>=5.0.0b4,<9.0.0 cuda100 = - cupy-cuda100>=5.0.0b4 + cupy-cuda100>=5.0.0b4,<9.0.0 # Language tokenizers with external dependencies ja = fugashi>=0.1.3 From d47b810ba4f0e50ea5b377895974e0d3e3da828d Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 29 Mar 2020 13:52:34 +0200 Subject: [PATCH 021/131] Fix exclusive_classes in textcat ensemble (#5166) Pass the exclusive_classes setting to the bow model within the ensemble textcat model. 
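For illustration (not part of this patch): with the setting forwarded, a
pipeline configured as below should build both submodels of the ensemble with
mutually exclusive classes. This is a minimal sketch against the spaCy v2 API;
the blank English pipeline and the POSITIVE/NEGATIVE labels are assumptions
made for the example.

```python
# Minimal sketch of the setting this fix affects (labels are illustrative).
import spacy

nlp = spacy.blank("en")
textcat = nlp.create_pipe(
    "textcat",
    config={"architecture": "ensemble", "exclusive_classes": True},
)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.add_pipe(textcat)
# Previously, the bag-of-words submodel inside the ensemble was always built
# with exclusive_classes=False, ignoring the config above; with this change,
# the setting is passed through to build_bow_text_classifier as well.
```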
--- spacy/_ml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index fb7d39255..ee7e59218 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -693,9 +693,11 @@ def build_text_classifier(nr_class, width=64, **cfg): ) linear_model = build_bow_text_classifier( - nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False + nr_class, + ngram_size=cfg.get("ngram_size", 1), + exclusive_classes=cfg.get("exclusive_classes", False), ) - if cfg.get("exclusive_classes"): + if cfg.get("exclusive_classes", False): output_layer = Softmax(nr_class, nr_class * 2) else: output_layer = ( From 4f27a24f5b78283435de85bca40b844c15b2cf4e Mon Sep 17 00:00:00 2001 From: Nikhil Saldanha Date: Sun, 29 Mar 2020 13:54:42 +0200 Subject: [PATCH 022/131] Add kannada examples (#5162) * Add example sentences for Kannada * sign contributor agreement --- .github/contributors/nikhilsaldanha.md | 106 +++++++++++++++++++++++++ spacy/lang/kn/examples.py | 22 +++++ 2 files changed, 128 insertions(+) create mode 100644 .github/contributors/nikhilsaldanha.md create mode 100644 spacy/lang/kn/examples.py diff --git a/.github/contributors/nikhilsaldanha.md b/.github/contributors/nikhilsaldanha.md new file mode 100644 index 000000000..f8d37d709 --- /dev/null +++ b/.github/contributors/nikhilsaldanha.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Nikhil Saldanha      |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-03-17           |
+| GitHub username                | nikhilsaldanha       |
+| Website (optional)             |                      |
diff --git a/spacy/lang/kn/examples.py b/spacy/lang/kn/examples.py
new file mode 100644
index 000000000..d82630432
--- /dev/null
+++ b/spacy/lang/kn/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.kn.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "ಆಪಲ್ ಒಂದು ಯು.ಕೆ. ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.",
+    "ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.",
+    "ಕಾಲುದಾರಿ ವಿತರಣಾ ರೋಬೋಟ್‌ಗಳನ್ನು ನಿಷೇಧಿಸುವುದನ್ನು ಸ್ಯಾನ್ ಫ್ರಾನ್ಸಿಸ್ಕೊ ಪರಿಗಣಿಸುತ್ತದೆ.",
+    "ಲಂಡನ್ ಯುನೈಟೆಡ್ ಕಿಂಗ್‌ಡಂನ ದೊಡ್ಡ ನಗರ.",
+    "ನೀನು ಎಲ್ಲಿದಿಯಾ?",
+    "ಫ್ರಾನ್ಸಾದ ಅಧ್ಯಕ್ಷರು ಯಾರು?",
+    "ಯುನೈಟೆಡ್ ಸ್ಟೇಟ್ಸ್ನ ರಾಜಧಾನಿ ಯಾವುದು?",
+    "ಬರಾಕ್ ಒಬಾಮ ಯಾವಾಗ ಜನಿಸಿದರು?",
+]
From 0b76212831f8dad97af6a17d220d7dcdeb02aace Mon Sep 17 00:00:00 2001
From: Jacob Lauritzen
Date: Thu, 2 Apr 2020 10:42:35 +0200
Subject: [PATCH 023/131] Extend and fix Danish examples (#5227)

* Extend and fix Danish examples

This PR fixes two examples, adds additional examples translated from the
English version, and adds punctuation. The two changed examples are:

* "fortov" changed to "fortovet", which is more widely
  [used](https://www.google.com/search?client=firefox-b-d&sxsrf=ALeKk0143gEuPe4IbIUpzBBt-oU10OMVqA%3A1585549036477&ei=7I6BXuvJHMGOrwSqi46oCQ&q=l%C3%B8behjul+p%C3%A5+fortov&oq=l%C3%B8behjul+p%C3%A5+fortov&gs_lcp=CgZwc3ktYWIQAzIECAAQRzIECAAQRzIECAAQRzIECAAQRzIECAAQRzIECAAQRzIECAAQRzIECAAQR1DT8xZY0_MWYK_0FmgAcAZ4AIABAIgBAJIBAJgBAKABAaoBB2d3cy13aXo&sclient=psy-ab&ved=0ahUKEwjr7964xsHoAhVBx4sKHaqFA5UQ4dUDCAo&uact=5)
  and more natural. The Swedish and Norwegian examples also use this version
  of the word.

* "stor by" changed to "storby". In Danish we have a specific noun to
  describe a large, metropolitan city, which is different from just
  describing a city as "large". In this sentence it would be much more
  natural to describe London as a "storby". Google even corrects a search
  for "London stor by" to "London storby".

* Sign contrib agreement
---
 .github/contributors/jacse.md | 106 ++++++++++++++++++++++++++++
 spacy/lang/da/examples.py     |  13 +++--
 2 files changed, 114 insertions(+), 5 deletions(-)
 create mode 100644 .github/contributors/jacse.md

diff --git a/.github/contributors/jacse.md b/.github/contributors/jacse.md
new file mode 100644
index 000000000..7face10c3
--- /dev/null
+++ b/.github/contributors/jacse.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/).
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+   * [x] I am signing on behalf of myself as an individual and no other person
+   or entity, including my employer, has or will have rights with respect to my
+   contributions.
+
+   * [ ] I am signing on behalf of my employer or a legal entity and I have the
+   actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Jacob Lauritzen      |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-03-30           |
+| GitHub username                | jacse                |
+| Website (optional)             |                      |
diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py
index b535191a1..525c6519c 100644
--- a/spacy/lang/da/examples.py
+++ b/spacy/lang/da/examples.py
@@ -9,10 +9,13 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
-    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
-    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
-    "San Francisco overvejer at forbyde udbringningsrobotter på fortov",
-    "London er en stor by i Storbritannien",
+    "Apple overvejer at købe et britisk startup for 1 milliard dollar.",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne.",
+    "San Francisco overvejer at forbyde udbringningsrobotter på fortovet.",
+    "London er en storby i Storbritannien.",
+    "Hvor er du?",
+    "Hvem er Frankrigs præsident?",
+    "Hvad er hovedstaden i USA?",
+    "Hvornår blev Barack Obama født?",
 ]
From d107afcffbf50aff63a7e15ecb3cf3f5a6fedbb7 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 2 Apr 2020 10:43:13 +0200
Subject: [PATCH 024/131] Raise error for inplace resize with new vector dim (#5228)

Raise an error if there is an attempt to resize the vectors in place with a
different vector dimension.
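A minimal sketch of the resulting behavior (not part of this patch; the table
contents and shapes below are arbitrary examples):

```python
# In-place resizing may change the number of rows, but changing the vector
# width in place now raises E193 instead of silently reinterpreting the data.
import numpy
from spacy.vectors import Vectors

vectors = Vectors(data=numpy.zeros((10, 300), dtype="f"))
vectors.resize((1000, 300), inplace=True)  # OK: same width, more rows
vectors.resize((1000, 128), inplace=True)  # raises ValueError [E193]
```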
--- spacy/errors.py | 3 +++ spacy/vectors.pyx | 2 ++ 2 files changed, 5 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index b124fc88c..e0ddc86c5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -552,6 +552,9 @@ class Errors(object): E191 = ("Invalid head: the head token must be from the same doc as the " "token itself.") E192 = ("Unable to resize vectors in place with cupy.") + E193 = ("Unable to resize vectors in place if the resized vector dimension " + "({new_dim}) is not the same as the current vector dimension " + "({curr_dim}).") @add_codes diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 5b8512970..f3c20fb7f 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -200,6 +200,8 @@ cdef class Vectors: """ xp = get_array_module(self.data) if inplace: + if shape[1] != self.data.shape[1]: + raise ValueError(Errors.E193.format(new_dim=shape[1], curr_dim=self.data.shape[1])) if xp == numpy: self.data.resize(shape, refcheck=False) else: From 2b14997b68e2a737d9569926c3b13ee0870b4d76 Mon Sep 17 00:00:00 2001 From: Michael Leichtfried <22801077+leicmi@users.noreply.github.com> Date: Thu, 2 Apr 2020 14:47:42 +0200 Subject: [PATCH 025/131] Remove duplicated branch in if/else-if statement (#5234) * Remove duplicated branch in if-elif-statement * Add contributor agreement for leicmi --- .github/contributors/leicmi.md | 106 +++++++++++++++++++++++++++++++++ spacy/lemmatizer.py | 2 - 2 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/leicmi.md diff --git a/.github/contributors/leicmi.md b/.github/contributors/leicmi.md new file mode 100644 index 000000000..6a65a48f2 --- /dev/null +++ b/.github/contributors/leicmi.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Michael Leichtfried | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 30.03.2020 | +| GitHub username | leicmi | +| Website (optional) | | diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d70e4cfc4..33908eecf 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -98,8 +98,6 @@ class Lemmatizer(object): return True elif morphology.get("VerbForm") == "none": return True - elif morphology.get("VerbForm") == "inf": - return True elif morphology.get("Degree") == "pos": return True else: From 9cf965c26056065d6476b2a4336a42423bef3600 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 2 Apr 2020 15:04:15 +0200 Subject: [PATCH 026/131] avoid enumerate to avoid long waiting at 0% (#5159) --- .../wikipedia_processor.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index 315b1e916..ed3c35c43 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -479,11 +479,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard= if not labels_discard: labels_discard = [] - texts = [] - entities_list = [] + max_index = max(line_ids) - with entity_file_path.open("r", encoding="utf8") as file: - for i, line in enumerate(file): + with entity_file_path.open("r", encoding="utf8") as _file: + line = _file.readline() + i = 0 + while line and i < max_index: if i in line_ids: example = json.loads(line) article_id = example["article_id"] @@ -493,15 +494,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard= if dev != is_dev(article_id) or not is_valid_article(clean_text): continue - texts.append(clean_text) - entities_list.append(entities) - - docs = nlp.pipe(texts, batch_size=50) - - for doc, entities in zip(docs, entities_list): - gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard) - if gold and len(gold.links) > 0: - yield doc, gold + doc = nlp(clean_text) + gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard) + if gold and len(gold.links) > 0: + yield doc, gold + i += 1 + line = _file.readline() def _get_gold_parse(doc, entities, dev, kb, labels_discard): From 11374208404531da28b4e17d561e821a99f542bd Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 3 Apr 2020 13:01:43 +0200 Subject: [PATCH 027/131] Small doc fixes (#5250) * fix link * torchtext instead tochtext --- website/docs/usage/linguistic-features.md | 2 +- website/meta/universe.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 685619c88..59712939a 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1303,7 +1303,7 @@ with doc.retokenize() as retokenizer: ### Overwriting custom extension attributes {#retokenization-extensions} If you've registered custom -[extension attributes](/usage/processing-pipelines##custom-components-attributes), +[extension attributes](/usage/processing-pipelines#custom-components-attributes), you can overwrite them during tokenization by providing a dictionary of attribute names mapped to new values as the `"_"` key in the `attrs`. 
For merging, you need to provide one dictionary of attributes for the resulting diff --git a/website/meta/universe.json b/website/meta/universe.json index 23d052bb9..613648d8c 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -641,7 +641,7 @@ "tags": ["chatbots"] }, { - "id": "tochtext", + "id": "torchtext", "title": "torchtext", "slogan": "Data loaders and abstractions for text and NLP", "github": "pytorch/text", From beef184e53f5fed4721a69190958a6b0b4cf6a89 Mon Sep 17 00:00:00 2001 From: YohannesDatasci <62481491+YohannesDatasci@users.noreply.github.com> Date: Fri, 3 Apr 2020 15:02:18 +0400 Subject: [PATCH 028/131] Armenian language support (#5246) * add Armenian language and test cases * agreement submission --- .github/contributors/YohannesDatasci.md | 106 + spacy/lang/hy/__init__.py | 25 + spacy/lang/hy/examples.py | 16 + spacy/lang/hy/lex_attrs.py | 58 + spacy/lang/hy/stop_words.py | 110 + spacy/lang/hy/tag_map.py | 2478 +++++++++++++++++++++++ spacy/tests/conftest.py | 4 + spacy/tests/lang/hy/test_text.py | 10 + spacy/tests/lang/hy/test_tokenizer.py | 47 + 9 files changed, 2854 insertions(+) create mode 100644 .github/contributors/YohannesDatasci.md create mode 100644 spacy/lang/hy/__init__.py create mode 100644 spacy/lang/hy/examples.py create mode 100644 spacy/lang/hy/lex_attrs.py create mode 100644 spacy/lang/hy/stop_words.py create mode 100644 spacy/lang/hy/tag_map.py create mode 100644 spacy/tests/lang/hy/test_text.py create mode 100644 spacy/tests/lang/hy/test_tokenizer.py diff --git a/.github/contributors/YohannesDatasci.md b/.github/contributors/YohannesDatasci.md new file mode 100644 index 000000000..129c45576 --- /dev/null +++ b/.github/contributors/YohannesDatasci.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Yohannes | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-04-02 | +| GitHub username | YohannesDatasci | +| Website (optional) | | \ No newline at end of file diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py new file mode 100644 index 000000000..3320edb6c --- /dev/null +++ b/spacy/lang/hy/__init__.py @@ -0,0 +1,25 @@ +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .tag_map import TAG_MAP + + +from ...attrs import LANG +from ...language import Language +from ...tokens import Doc + + +class ArmenianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "hy" + + lex_attr_getters.update(LEX_ATTRS) + stop_words = STOP_WORDS + tag_map = TAG_MAP + + +class Armenian(Language): + lang = "hy" + Defaults = ArmenianDefaults + + +__all__ = ["Armenian"] diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py new file mode 100644 index 000000000..b0df31aae --- /dev/null +++ b/spacy/lang/hy/examples.py @@ -0,0 +1,16 @@ +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. +>>> from spacy.lang.hy.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։", + "Ո՞վ է Ֆրանսիայի նախագահը։", + "Որն է Միացյալ Նահանգների մայրաքաղաքը։", + "Ե՞րբ է ծնվել Բարաք Օբաման։", +] diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py new file mode 100644 index 000000000..7c1b9592f --- /dev/null +++ b/spacy/lang/hy/lex_attrs.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + + +_num_words = [ + "զրօ", + "մէկ", + "երկու", + "երեք", + "չորս", + "հինգ", + "վեց", + "յոթ", + "ութ", + "ինը", + "տասը", + "տասնմեկ", + "տասներկու", + "տասն­երեք", + "տասն­չորս", + "տասն­հինգ", + "տասն­վեց", + "տասն­յոթ", + "տասն­ութ", + "տասն­ինը", + "քսան" "երեսուն", + "քառասուն", + "հիսուն", + "վաթցսուն", + "յոթանասուն", + "ութսուն", + "ինիսուն", + "հարյուր", + "հազար", + "միլիոն", + "միլիարդ", + "տրիլիոն", + "քվինտիլիոն", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py new file mode 100644 index 000000000..c671956a4 --- /dev/null +++ b/spacy/lang/hy/stop_words.py @@ -0,0 +1,110 @@ +from __future__ import unicode_literals + + +STOP_WORDS = set( + """ +նա +ողջը +այստեղ +ենք +նա +էիր +որպես +ուրիշ +բոլորը +այն +այլ +նույնչափ +էի +մի +և +ողջ +ես +ոմն +հետ +նրանք +ամենքը +ըստ +ինչ-ինչ +այսպես +համայն +մի +նաև +նույնքան +դա +ովևէ +համար +այնտեղ +էին +որոնք +սույն +ինչ-որ +ամենը +նույնպիսի +ու +իր +որոշ +միևնույն +ի +այնպիսի +մենք +ամեն ոք +նույն +երբևէ +այն +որևէ +ին +այդպես +նրա +որը +վրա +դու +էինք +այդպիսի +էիք +յուրաքանչյուրը +եմ +պիտի +այդ +ամբողջը +հետո +եք +ամեն +այլ +կամ +այսքան +որ +այնպես +այսինչ +բոլոր +է +մեկնումեկը +այդչափ +այնքան +ամբողջ +երբևիցե +այնչափ +ամենայն +մյուս +այնինչ +իսկ +այդտեղ +այս +սա +են +ամեն ինչ +որևիցե +ում +մեկը +այդ +դուք +այսչափ +այդքան +այսպիսի +էր 
+յուրաքանչյուր +այս +մեջ +թ +""".split() +) diff --git a/spacy/lang/hy/tag_map.py b/spacy/lang/hy/tag_map.py new file mode 100644 index 000000000..90690c22e --- /dev/null +++ b/spacy/lang/hy/tag_map.py @@ -0,0 +1,2478 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN +from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ + +TAG_MAP = { + "ADJ_Abbr=Yes": {POS: ADJ, "Abbr": "Yes"}, + "ADJ_Degree=Pos|NumForm=Word|NumType=Ord": { + POS: ADJ, + "Degree": "Pos", + "NumForm": "Word", + "NumType": "Ord", + }, + "ADJ_Degree=Pos": {POS: ADJ, "Degree": "Pos"}, + "ADJ_Degree=Pos|Style=Coll": {POS: ADJ, "Degree": "Pos", "Style": "Coll"}, + "ADJ_Degree=Pos|Style=Expr": {POS: ADJ, "Degree": "Pos", "Style": "Expr"}, + "ADJ_Degree=Sup": {POS: ADJ, "Degree": "Sup"}, + "ADJ_NumForm=Digit|NumType=Ord": {POS: ADJ, "NumForm": "Digit", "NumType": "Ord"}, + "ADJ_NumForm=Word|NumType=Card": {POS: ADJ, "NumForm": "Word", "NumType": "Card"}, + "ADJ_NumForm=Word|NumType=Ord": {POS: ADJ, "NumForm": "Word", "NumType": "Ord"}, + "ADJ_Style=Coll": {POS: ADJ, "Style": "Coll"}, + "ADJ_Style=Expr": {POS: ADJ, "Style": "Expr"}, + "ADP_AdpType=Post|Case=Dat": {POS: ADP, "AdpType": "Post", "Case": "Dat"}, + "ADP_AdpType=Post|Case=Nom": {POS: ADP, "AdpType": "Post", "Case": "Nom"}, + "ADP_AdpType=Post|Number=Plur|Person=3": { + POS: ADP, + "AdpType": "Post", + "Number": "Plur", + "Person": "3", + }, + "ADP_AdpType=Post": {POS: ADP, "AdpType": "Post"}, + "ADP_AdpType=Prep": {POS: ADP, "AdpType": "Prep"}, + "ADP_AdpType=Prep|Style=Arch": {POS: ADP, "AdpType": "Prep", "Style": "Arch"}, + "ADV_Degree=Cmp": {POS: ADV, "Degree": "Cmp"}, + "ADV_Degree=Pos": {POS: ADV, "Degree": "Pos"}, + "ADV_Degree=Sup": {POS: ADV, "Degree": "Sup"}, + "ADV_Distance=Dist|PronType=Dem": {POS: ADV, "Distance": "Dist", "PronType": "Dem"}, + "ADV_Distance=Dist|PronType=Exc": {POS: ADV, "Distance": "Dist", "PronType": "Exc"}, + "ADV_Distance=Med|PronType=Dem": {POS: ADV, "Distance": "Med", "PronType": "Dem"}, + "ADV_Distance=Med|PronType=Dem|Style=Coll": { + POS: ADV, + "Distance": "Med", + "PronType": "Dem", + "Style": "Coll", + }, + "ADV_NumForm=Word|NumType=Card|PronType=Tot": { + POS: ADV, + "NumForm": "Word", + "NumType": "Card", + "PronType": "Tot", + }, + "ADV_PronType=Dem": {POS: ADV, "PronType": "Dem"}, + "ADV_PronType=Exc": {POS: ADV, "PronType": "Exc"}, + "ADV_PronType=Ind": {POS: ADV, "PronType": "Ind"}, + "ADV_PronType=Int": {POS: ADV, "PronType": "Int"}, + "ADV_PronType=Int|Style=Coll": {POS: ADV, "PronType": "Int", "Style": "Coll"}, + "ADV_PronType=Rel": {POS: ADV, "PronType": "Rel"}, + "ADV_Style=Coll": {POS: ADV, "Style": "Coll"}, + "ADV_Style=Rare": {POS: ADV, "Style": "Rare"}, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "1", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "2", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Tense": "Imp", + "VerbForm": "Fin", + }, + 
"AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "2", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|VerbForm=Part": {POS: AUX, "Aspect": "Imp", "VerbForm": "Part"}, + "AUX_Aspect=Perf|VerbForm=Part": {POS: AUX, "Aspect": "Perf", "VerbForm": "Part"}, + "AUX_Aspect=Prosp|VerbForm=Part": {POS: AUX, "Aspect": "Prosp", "VerbForm": "Part"}, + "AUX_Polarity=Pos": {POS: AUX, "Polarity": "Pos"}, + "CCONJ_ConjType=Comp": {POS: CCONJ, "ConjType": "Comp"}, + "CCONJ_ConjType=Comp|Style=Coll": {POS: CCONJ, "ConjType": "Comp", "Style": 
"Coll"}, + "DET_Case=Gen|Distance=Med|Number=Plur|Poss=Yes|PronType=Dem": { + POS: DET, + "Case": "Gen", + "Distance": "Med", + "Number": "Plur", + "Poss": "Yes", + "PronType": "Dem", + }, + "DET_Case=Gen|Distance=Med|Number=Sing|Poss=Yes|PronType=Dem": { + POS: DET, + "Case": "Gen", + "Distance": "Med", + "Number": "Sing", + "Poss": "Yes", + "PronType": "Dem", + }, + "DET_Case=Gen|Number=Plur|Person=1|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Plur", + "Person": "1", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Plur|Person=2|Polite=Infm|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Plur", + "Person": "2", + "Polite": "Infm", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Plur|Person=3|Poss=Yes|PronType=Emp": { + POS: DET, + "Case": "Gen", + "Number": "Plur", + "Person": "3", + "Poss": "Yes", + "PronType": "Emp", + }, + "DET_Case=Gen|Number=Plur|Person=3|Poss=Yes|PronType=Emp|Reflex=Yes": { + POS: DET, + "Case": "Gen", + "Number": "Plur", + "Person": "3", + "Poss": "Yes", + "PronType": "Emp", + "Reflex": "Yes", + }, + "DET_Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "1", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Sing|Person=2|Polite=Infm|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "2", + "Polite": "Infm", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Emp": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "3", + "Poss": "Yes", + "PronType": "Emp", + }, + "DET_Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Emp|Reflex=Yes": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "3", + "Poss": "Yes", + "PronType": "Emp", + "Reflex": "Yes", + }, + "DET_Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "3", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Sing|Poss=Yes|PronType=Rel": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Poss": "Yes", + "PronType": "Rel", + }, + "DET_Distance=Dist|PronType=Dem": {POS: DET, "Distance": "Dist", "PronType": "Dem"}, + "DET_Distance=Dist|PronType=Dem|Style=Coll": { + POS: DET, + "Distance": "Dist", + "PronType": "Dem", + "Style": "Coll", + }, + "DET_Distance=Dist|PronType=Dem|Style=Vrnc": { + POS: DET, + "Distance": "Dist", + "PronType": "Dem", + "Style": "Vrnc", + }, + "DET_Distance=Med|PronType=Dem": {POS: DET, "Distance": "Med", "PronType": "Dem"}, + "DET_Distance=Med|PronType=Dem|Style=Coll": { + POS: DET, + "Distance": "Med", + "PronType": "Dem", + "Style": "Coll", + }, + "DET_Distance=Prox|PronType=Dem": {POS: DET, "Distance": "Prox", "PronType": "Dem"}, + "DET_Distance=Prox|PronType=Dem|Style=Coll": { + POS: DET, + "Distance": "Prox", + "PronType": "Dem", + "Style": "Coll", + }, + "DET_PronType=Art": {POS: DET, "PronType": "Art"}, + "DET_PronType=Exc": {POS: DET, "PronType": "Exc"}, + "DET_PronType=Ind": {POS: DET, "PronType": "Ind"}, + "DET_PronType=Int": {POS: DET, "PronType": "Int"}, + "DET_PronType=Tot": {POS: DET, "PronType": "Tot"}, + "DET_PronType=Tot|Style=Arch": {POS: DET, "PronType": "Tot", "Style": "Arch"}, + "INTJ_Style=Vrnc": {POS: INTJ, "Style": "Vrnc"}, + "NOUN_Abbr=Yes|Animacy=Nhum|Case=Dat|Definite=Ind|Number=Plur": { + POS: NOUN, + "Abbr": "Yes", + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Plur", + }, + 
"NOUN_Abbr=Yes|Animacy=Nhum|Case=Nom|Definite=Ind|Number=Sing": { + POS: NOUN, + "Abbr": "Yes", + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Abl|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Abl|Definite=Ind|Number=Plur|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Plur", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Abl|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Def|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Def", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Def|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Def|Number=Sing|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Assoc": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Assoc", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Plur|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Plur", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Plur|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Plur", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Sing|Style=Arch": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "Style": "Arch", + }, + "NOUN_Animacy=Hum|Case=Dat|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Hum|Case=Dat|Number=Sing|Number=Sing|Person=1|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Ins|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Def|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Def|Number=Plur|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "Number": "Plur", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Def|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Def|Number=Sing|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Assoc": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Assoc", + }, + 
"NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur|Typo=Yes": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + "Typo": "Yes", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Sing|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Nom|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Sing|Style=Arch": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Sing", + "Style": "Arch", + }, + "NOUN_Animacy=Nhum|Case=Abl|Number=Sing|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Number": "Sing", + "Number": "Sing", + "Person": "2", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing|NumForm=Digit": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "NumForm": "Digit", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing|NumForm=Word": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "NumForm": "Word", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing|Style=Rare": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "Style": "Rare", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing|Style=Vrnc": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "Style": "Vrnc", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": 
"Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Sing|NumForm=Digit": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "NumForm": "Digit", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Sing|Style=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "Style": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Sing|Style=Vrnc": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "Style": "Vrnc", + }, + "NOUN_Animacy=Nhum|Case=Dat|Number=Coll|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Number": "Coll", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Dat|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Dat|Number=Sing|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Number": "Sing", + "Number": "Sing", + "Person": "2", + }, + "NOUN_Animacy=Nhum|Case=Gen|Definite=Ind|Number=Sing|Style=Arch": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Gen", + "Definite": "Ind", + "Number": "Sing", + "Style": "Arch", + }, + "NOUN_Animacy=Nhum|Case=Ins|Definite=Ind|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Ins|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Ins|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Ins|Definite=Ind|Number=Sing|Style=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Sing", + "Style": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Ins|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Loc|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Loc", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Loc|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Loc", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Loc|Number=Sing|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Loc", + "Number": "Sing", + "Number": "Sing", + "Person": "2", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Plur|Number=Sing|Poss=Yes": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Plur", + "Number": "Sing", + "Poss": "Yes", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Sing|NumForm=Digit": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + "NumForm": "Digit", + }, + 
"NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Coll|Typo=Yes": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Coll", + "Typo": "Yes", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + }, + "NOUN_Animacy=Nhum|Case=Nom|Number=Plur|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Number": "Plur", + "Number": "Sing", + "Person": "2", + }, + "NOUN_Animacy=Nhum|Case=Nom|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Nom|Number=Sing|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Number": "Sing", + "Number": "Sing", + "Person": "2", + }, + "NUM_NumForm=Digit|NumType=Card": {POS: NUM, "NumForm": "Digit", "NumType": "Card"}, + "NUM_NumForm=Digit|NumType=Frac|Typo=Yes": { + POS: NUM, + "NumForm": "Digit", + "NumType": "Frac", + "Typo": "Yes", + }, + "NUM_NumForm=Digit|NumType=Range": { + POS: NUM, + "NumForm": "Digit", + "NumType": "Range", + }, + "NUM_NumForm=Word|NumType=Card": {POS: NUM, "NumForm": "Word", "NumType": "Card"}, + "NUM_NumForm=Word|NumType=Dist": {POS: NUM, "NumForm": "Word", "NumType": "Dist"}, + "NUM_NumForm=Word|NumType=Range": {POS: NUM, "NumForm": "Word", "NumType": "Range"}, + "PART_Polarity=Neg": {POS: PART, "Polarity": "Neg"}, + "PRON_Case=Abl|Definite=Ind|Number=Sing|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Abl", + "Definite": "Ind", + "Number": "Sing", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Abl|Number=Plur|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Abl", + "Number": "Plur", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Abl|Number=Sing|Person=2|Polite=Infm|PronType=Prs": { + POS: PRON, + "Case": "Abl", + "Number": "Sing", + "Person": "2", + "Polite": "Infm", + "PronType": "Prs", + }, + "PRON_Case=Dat|Definite=Def|Distance=Dist|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Dat", + "Definite": "Def", + "Distance": "Dist", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Dat|Definite=Def|Number=Sing|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Dat|Definite=Ind|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Int", + }, + "PRON_Case=Dat|Distance=Dist|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Dat", + "Distance": "Dist", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Dat|Distance=Med|Number=Plur|PronType=Dem": { + POS: PRON, + "Case": "Dat", + "Distance": "Med", + "Number": "Plur", + "PronType": "Dem", + }, + "PRON_Case=Dat|Number=Plur|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + 
"Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Plur|Person=2|Polite=Infm|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + "Person": "2", + "Polite": "Infm", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Plur|Person=3|PronType=Emp|Reflex=Yes": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + "Person": "3", + "PronType": "Emp", + "Reflex": "Yes", + }, + "PRON_Case=Dat|Number=Plur|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Plur|PronType=Rcp": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + "PronType": "Rcp", + }, + "PRON_Case=Dat|Number=Sing|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Sing|Person=2|Polite=Infm|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "Person": "2", + "Polite": "Infm", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Sing|Person=3|PronType=Emp": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "Person": "3", + "PronType": "Emp", + }, + "PRON_Case=Dat|Number=Sing|Person=3|PronType=Emp|Reflex=Yes": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "Person": "3", + "PronType": "Emp", + "Reflex": "Yes", + }, + "PRON_Case=Dat|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "PronType": "Int", + }, + "PRON_Case=Dat|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Dat|PronType=Tot": {POS: PRON, "Case": "Dat", "PronType": "Tot"}, + "PRON_Case=Gen|Distance=Med|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Gen", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Gen|Number=Plur|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Gen", + "Number": "Plur", + "Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Gen|Number=Sing|Person=2|PronType=Prs": { + POS: PRON, + "Case": "Gen", + "Number": "Sing", + "Person": "2", + "PronType": "Prs", + }, + "PRON_Case=Gen|Number=Sing|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Gen", + "Number": "Sing", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Gen|PronType=Tot": {POS: PRON, "Case": "Gen", "PronType": "Tot"}, + "PRON_Case=Ins|Definite=Ind|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Ins", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Ins|Distance=Med|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Ins", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Loc|Definite=Ind|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Loc", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Loc|Distance=Med|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Loc", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Nom|Definite=Def|Distance=Dist|Number=Plur|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Definite": "Def", + "Distance": "Dist", + "Number": "Plur", + "PronType": "Dem", + }, + "PRON_Case=Nom|Definite=Def|Distance=Med|Number=Sing|PronType=Dem|Style=Coll": { + POS: PRON, + "Case": "Nom", + "Definite": "Def", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + "Style": "Coll", + }, + "PRON_Case=Nom|Definite=Def|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + "PronType": "Int", + }, + 
"PRON_Case=Nom|Definite=Def|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Nom|Definite=Ind|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Int", + }, + "PRON_Case=Nom|Definite=Ind|Number=Sing|PronType=Neg": { + POS: PRON, + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Neg", + }, + "PRON_Case=Nom|Definite=Ind|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Nom|Distance=Dist|Number=Plur|Person=1|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Distance": "Dist", + "Number": "Plur", + "Person": "1", + "PronType": "Dem", + }, + "PRON_Case=Nom|Distance=Med|Number=Plur|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Distance": "Med", + "Number": "Plur", + "PronType": "Dem", + }, + "PRON_Case=Nom|Distance=Med|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Nom|Distance=Prox|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Distance": "Prox", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Nom|Number=Plur|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Plur", + "Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Plur|Person=3|PronType=Emp": { + POS: PRON, + "Case": "Nom", + "Number": "Plur", + "Person": "3", + "PronType": "Emp", + }, + "PRON_Case=Nom|Number=Plur|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Plur", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Plur|PronType=Rel": { + POS: PRON, + "Case": "Nom", + "Number": "Plur", + "PronType": "Rel", + }, + "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Number": "Plur", + "Person": "3", + "Person": "1", + "PronType": "Emp", + }, + "PRON_Case=Nom|Number=Sing|Person=1|PronType=Int": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "1", + "PronType": "Int", + }, + "PRON_Case=Nom|Number=Sing|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "2", + "Polite": "Infm", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Sing|Person=3|PronType=Emp": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "3", + "PronType": "Emp", + }, + "PRON_Case=Nom|Number=Sing|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "PronType": "Int", + }, + "PRON_Case=Nom|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Nom|Person=1|PronType=Tot": { + POS: PRON, + "Case": "Nom", + "Person": "1", + "PronType": "Tot", + }, + "PRON_Case=Nom|PronType=Ind": {POS: PRON, "Case": "Nom", "PronType": "Ind"}, + "PRON_Case=Nom|PronType=Tot": {POS: PRON, "Case": "Nom", "PronType": "Tot"}, + "PRON_Distance=Dist|Number=Sing|PronType=Dem": { + POS: PRON, + "Distance": "Dist", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Distance=Med|PronType=Dem|Style=Coll": { + POS: PRON, + "Distance": "Med", + 
"PronType": "Dem", + "Style": "Coll", + }, + "PRON_Distance=Prox|PronType=Dem|Style=Coll": { + POS: PRON, + "Distance": "Prox", + "PronType": "Dem", + "Style": "Coll", + }, + "PRON_Number=Plur|PronType=Rel": {POS: PRON, "Number": "Plur", "PronType": "Rel"}, + "PROPN_Abbr=Yes|Animacy=Hum|Case=Nom|Definite=Ind|NameType=Giv|Number=Sing": { + POS: PROPN, + "Abbr": "Yes", + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Giv", + "Number": "Sing", + }, + "PROPN_Abbr=Yes|Animacy=Nhum|Case=Nom|Definite=Ind|NameType=Com|Number=Sing": { + POS: PROPN, + "Abbr": "Yes", + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Com", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Dat|Definite=Def|NameType=Sur|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Def", + "NameType": "Sur", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Dat|Definite=Ind|NameType=Prs|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "NameType": "Prs", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Dat|Definite=Ind|NameType=Sur|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "NameType": "Sur", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Nom|Definite=Def|NameType=Giv|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "NameType": "Giv", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Nom|Definite=Def|NameType=Sur|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "NameType": "Sur", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Nom|Definite=Ind|NameType=Giv|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Giv", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Nom|Definite=Ind|NameType=Sur|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Sur", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Abl|Definite=Ind|NameType=Geo|Number=Coll": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Coll", + }, + "PROPN_Animacy=Nhum|Case=Abl|Definite=Ind|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Plur": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Plur", + }, + "PROPN_Animacy=Nhum|Case=Dat|Definite=Ind|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Dat|Definite=Ind|NameType=Geo|Number=Sing|Style=Coll": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + "Style": "Coll", + }, + "PROPN_Animacy=Nhum|Case=Loc|Definite=Ind|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Loc", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Def|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Def|NameType=Pro|Number=Sing|Style=Coll": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "NameType": "Pro", + "Number": 
"Sing", + "Style": "Coll", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Ind|NameType=Geo|Number=Coll": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Coll", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Ind|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Ind|NameType=Geo|Number=Sing|Style=Vrnc": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + "Style": "Vrnc", + }, + "SCONJ_Style=Coll": {POS: SCONJ, "Style": "Coll"}, + "VERB_Aspect=Dur|Polarity=Neg|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Neg", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Dur|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Dur|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Dur|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Dur|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "2", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + 
"VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Style=Coll|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Style": "Coll", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Style=Vrnc|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Style": "Vrnc", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Intr", + "VerbForm": "Part", + }, + "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Imp|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Subcat=Tran|VerbForm=Part|Voice=Cau": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Cau", + }, + "VERB_Aspect=Iter|Case=Ins|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Aspect": "Iter", + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Aspect=Iter|Case=Ins|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Aspect": "Iter", + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Aspect=Iter": {POS: VERB, "Aspect": "Iter"}, + "VERB_Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + 
"Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Style=Vrnc|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Style": "Vrnc", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Style=Vrnc|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Style": "Vrnc", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", 
+ }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Polarity=Neg|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Neg", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Perf|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Perf|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Perf|Polarity=Pos|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Perf", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Perf|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Subcat=Tran|VerbForm=Part|Voice=Cau": { + POS: VERB, + "Aspect": "Perf", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Cau", + }, + "VERB_Aspect=Prog|Subcat=Intr|VerbForm=Conv|Voice=Mid": { + POS: VERB, + "Aspect": "Prog", + "Subcat": "Intr", + "VerbForm": "Conv", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Connegative=Yes|Mood=Cnd|Subcat=Tran|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Connegative": "Yes", + "Mood": "Cnd", + "Subcat": "Tran", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Plur|Person=3|Polarity=Pos|Style=Vrnc|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Style": "Vrnc", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Plur|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=1|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=2|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + 
"Mood": "Cnd", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Pass": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Pass", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Imp|Number=Sing|Person=2|Subcat=Intr|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Imp", + "Number": "Sing", + "Person": "2", + "Subcat": "Intr", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Imp|Number=Sing|Person=2|Subcat=Tran|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Imp", + "Number": "Sing", + "Person": "2", + "Subcat": "Tran", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Plur|Person=1|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Plur", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Plur|Person=3|Polarity=Neg|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Plur|Person=3|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=1|Polarity=Neg|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=1|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=1|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": 
"Sub", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=1|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=2|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=2|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Imp|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|VerbForm=Fin|Voice=Pass": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Fin", + "Voice": "Pass", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Person=1|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Prosp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + 
"VERB_Aspect=Prosp|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Case=Abl|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Abl", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Abl|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Pass": { + POS: VERB, + "Case": "Abl", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Pass", + }, + "VERB_Case=Abl|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Case": "Abl", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Case=Dat|Definite=Def|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Dat", + "Definite": "Def", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Dat|Definite=Ind|Number=Coll|Polarity=Neg|Subcat=Intr|VerbForm=Gdv|Voice=Pass": { + POS: VERB, + "Case": "Dat", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Neg", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Pass", + }, + "VERB_Case=Dat|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Dat", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Dat|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Case": "Dat", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Case=Ins|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Ins|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Case=Nom|Definite=Def|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Nom", + "Definite": "Def", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Nom|Definite=Def|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Case": "Nom", + "Definite": "Def", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Mood=Imp|Number=Sing|Person=2|Subcat=Intr|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Mood": "Imp", + "Number": "Sing", + "Person": "2", + "Subcat": "Intr", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Polarity=Neg|Subcat=Intr|VerbForm=Inf|Voice=Mid": { + POS: VERB, + "Polarity": "Neg", + "Subcat": "Intr", + "VerbForm": "Inf", + "Voice": "Mid", + }, + "VERB_Polarity=Pos|Style=Coll|Subcat=Tran|VerbForm=Inf|Voice=Act": { + POS: VERB, + "Polarity": "Pos", + "Style": "Coll", + "Subcat": "Tran", + "VerbForm": "Inf", + "Voice": "Act", + }, + "VERB_Polarity=Pos|Style=Vrnc|Subcat=Tran|VerbForm=Inf|Voice=Act": { + POS: VERB, + 
"Polarity": "Pos", + "Style": "Vrnc", + "Subcat": "Tran", + "VerbForm": "Inf", + "Voice": "Act", + }, + "VERB_Polarity=Pos|Subcat=Intr|VerbForm=Inf|Voice=Mid": { + POS: VERB, + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Inf", + "Voice": "Mid", + }, + "VERB_Polarity=Pos|Subcat=Intr|VerbForm=Inf|Voice=Pass": { + POS: VERB, + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Inf", + "Voice": "Pass", + }, + "VERB_Polarity=Pos|Subcat=Tran|Typo=Yes|VerbForm=Inf|Voice=Act": { + POS: VERB, + "Polarity": "Pos", + "Subcat": "Tran", + "Typo": "Yes", + "VerbForm": "Inf", + "Voice": "Act", + }, + "VERB_Polarity=Pos|Subcat=Tran|VerbForm=Inf|Voice=Act": { + POS: VERB, + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Inf", + "Voice": "Act", + }, + "X_Foreign=Yes": {POS: X, "Foreign": "Yes"}, + "X_Style=Vrnc": {POS: X, "Style": "Vrnc"}, +} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index fc89c2658..43c3152a0 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -234,3 +234,7 @@ def yo_tokenizer(): def zh_tokenizer(): pytest.importorskip("jieba") return get_lang_class("zh").Defaults.create_tokenizer() + +@pytest.fixture(scope="session") +def hy_tokenizer(): + return get_lang_class("hy").Defaults.create_tokenizer() \ No newline at end of file diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py new file mode 100644 index 000000000..6b785bdfc --- /dev/null +++ b/spacy/tests/lang/hy/test_text.py @@ -0,0 +1,10 @@ +from __future__ import unicode_literals + +import pytest +from spacy.lang.hy.lex_attrs import like_num + + +@pytest.mark.parametrize("word", ["հիսուն"]) +def test_hy_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) diff --git a/spacy/tests/lang/hy/test_tokenizer.py b/spacy/tests/lang/hy/test_tokenizer.py new file mode 100644 index 000000000..424fb886f --- /dev/null +++ b/spacy/tests/lang/hy/test_tokenizer.py @@ -0,0 +1,47 @@ +from __future__ import unicode_literals + +import pytest + + +# TODO add test cases with valid punctuation signs. 
+
+hy_tokenize_text_test = [
+    (
+        "Մետաղագիտությունը պայմանականորեն բաժանվում է տեսականի և կիրառականի (տեխնիկական)",
+        [
+            "Մետաղագիտությունը",
+            "պայմանականորեն",
+            "բաժանվում",
+            "է",
+            "տեսականի",
+            "և",
+            "կիրառականի",
+            "(",
+            "տեխնիկական",
+            ")",
+        ],
+    ),
+    (
+        "Գետաբերանը գտնվում է Օմոլոնա գետի ձախ ափից 726 կմ հեռավորության վրա",
+        [
+            "Գետաբերանը",
+            "գտնվում",
+            "է",
+            "Օմոլոնա",
+            "գետի",
+            "ձախ",
+            "ափից",
+            "726",
+            "կմ",
+            "հեռավորության",
+            "վրա",
+        ],
+    ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", hy_tokenize_text_test)
+def test_hy_tokenizer_handles_exception_cases(hy_tokenizer, text, expected_tokens):
+    tokens = hy_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list

From ddf3c2430d2a6ea4e1d3ed6ac15740f116134cea Mon Sep 17 00:00:00 2001
From: nlptechbook <60931109+nlptechbook@users.noreply.github.com>
Date: Fri, 3 Apr 2020 12:10:03 -0400
Subject: [PATCH 029/131] Update universe.json

---
 website/meta/universe.json | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 8f8bcfecd..6c9fc0340 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1624,8 +1624,9 @@
       "title": "pic2phrase_bot: Photo Description Generator",
       "slogan": "A bot that generates descriptions to submitted photos, in a human-like manner.",
       "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy.",
-      "thumb": "https://drive.google.com/open?id=1GTrpPzc8j4mAmYCJZibYrADAp0GWcVHd",
-      "image": "https://drive.google.com/open?id=1t7URKJ-4uOJmZb_GbNvw-F5LLtvEoBRy",
+      "thumb": "https://i.imgur.com/ggVI02O.jpg",
+      "image": "https://i.imgur.com/z1yhWQR.jpg",
+      "url": "https://telegram.me/pic2phrase_bot",
       "author": "Yuli Vasiliev",
       "author_links": {
         "twitter": "VasilievYuli",

From 406d5748b39919dc3996fad1e41e92735f50be45 Mon Sep 17 00:00:00 2001
From: Muhammad Irfan
Date: Sun, 5 Apr 2020 20:55:38 +0500
Subject: [PATCH 030/131] add missing Urdu tags

---
 spacy/lang/ur/tag_map.py | 126 ++++++++++++++++++++++++---------------
 1 file changed, 77 insertions(+), 49 deletions(-)

diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py
index 2499d7e3e..eebd3a14a 100644
--- a/spacy/lang/ur/tag_map.py
+++ b/spacy/lang/ur/tag_map.py
@@ -1,66 +1,94 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ
 from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
 
 TAG_MAP = {
+    "JJ-Ez": {POS: ADJ},
+    "INJC": {POS: X},
+    "QFC": {POS: DET},
+    "UNK": {POS: X},
+    "NSTC": {POS: ADV},
+    "NST": {POS: ADV},
+    "VMC": {POS: VERB},
+    "PRPC": {POS: PRON},
+    "RBC": {POS: ADV},
+    "PSPC": {POS: ADP},
+    "INJ": {POS: X},
+    "JJZ": {POS: ADJ},
+    "CCC": {POS: SCONJ},
+    "NN-Ez": {POS: NOUN},
+    "ECH": {POS: NOUN},
+    "WQ": {POS: DET},
+    "RDP": {POS: 
ADJ}, + "JJC": {POS: ADJ}, + "NEG": {POS: PART}, + "NNZ": {POS: NOUN}, + "QO": {POS: ADJ}, + "INTFC": {POS: ADV}, + "INTF": {POS: ADV}, + "NFC": {POS: ADP}, + "QCC": {POS: NUM}, + "QC": {POS: NUM}, + "QF": {POS: DET}, + "VAUX": {POS: AUX}, + "VM": {POS: VERB}, + "DEM": {POS: DET}, + "NNPC": {POS: PROPN}, + "NNC": {POS: NOUN}, + "PSP": {POS: ADP}, + + ".": {POS: PUNCT}, + ",": {POS: PUNCT}, + "-LRB-": {POS: PUNCT}, + "-RRB-": {POS: PUNCT}, + "``": {POS: PUNCT}, + '""': {POS: PUNCT}, + "''": {POS: PUNCT}, ":": {POS: PUNCT}, - "$": {POS: SYM, "Other": {"SymType": "currency"}}, - "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, - "AFX": {POS: ADJ, "Hyph": "yes"}, - "CC": {POS: CCONJ, "ConjType": "coor"}, - "CD": {POS: NUM, "NumType": "card"}, + "$": {POS: SYM}, + "#": {POS: SYM}, + "AFX": {POS: ADJ}, + "CC": {POS: CCONJ}, + "CD": {POS: NUM}, "DT": {POS: DET}, - "EX": {POS: ADV, "AdvType": "ex"}, - "FW": {POS: X, "Foreign": "yes"}, - "HYPH": {POS: PUNCT, "PunctType": "dash"}, + "EX": {POS: ADV}, + "FW": {POS: X}, + "HYPH": {POS: PUNCT}, "IN": {POS: ADP}, - "JJ": {POS: ADJ, "Degree": "pos"}, - "JJR": {POS: ADJ, "Degree": "comp"}, - "JJS": {POS: ADJ, "Degree": "sup"}, - "LS": {POS: PUNCT, "NumType": "ord"}, - "MD": {POS: VERB, "VerbType": "mod"}, + "JJ": {POS: ADJ}, + "JJR": {POS: ADJ}, + "JJS": {POS: ADJ}, + "LS": {POS: PUNCT}, + "MD": {POS: VERB}, "NIL": {POS: ""}, - "NN": {POS: NOUN, "Number": "sing"}, - "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, - "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, - "NNS": {POS: NOUN, "Number": "plur"}, - "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, - "POS": {POS: PART, "Poss": "yes"}, - "PRP": {POS: PRON, "PronType": "prs"}, - "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, - "RB": {POS: ADV, "Degree": "pos"}, - "RBR": {POS: ADV, "Degree": "comp"}, - "RBS": {POS: ADV, "Degree": "sup"}, + "NN": {POS: NOUN}, + "NNP": {POS: PROPN}, + "NNPS": {POS: PROPN}, + "NNS": {POS: NOUN}, + "PDT": {POS: ADJ}, + "POS": {POS: PART}, + "PRP": {POS: PRON}, + "PRP$": {POS: ADJ}, + "RB": {POS: ADV}, + "RBR": {POS: ADV}, + "RBS": {POS: ADV}, "RP": {POS: PART}, "SP": {POS: SPACE}, "SYM": {POS: SYM}, - "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, + "TO": {POS: PART}, "UH": {POS: INTJ}, - "VB": {POS: VERB, "VerbForm": "inf"}, - "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, - "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, - "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, - "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, - "VBZ": { - POS: VERB, - "VerbForm": "fin", - "Tense": "pres", - "Number": "sing", - "Person": 3, - }, - "WDT": {POS: ADJ, "PronType": "int|rel"}, - "WP": {POS: NOUN, "PronType": "int|rel"}, - "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, - "WRB": {POS: ADV, "PronType": "int|rel"}, + "VB": {POS: VERB}, + "VBD": {POS: VERB}, + "VBG": {POS: VERB}, + "VBN": {POS: VERB}, + "VBP": {POS: VERB}, + "VBZ": {POS: VERB}, + "WDT": {POS: ADJ}, + "WP": {POS: NOUN}, + "WP$": {POS: ADJ}, + "WRB": {POS: ADV}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, From f329d5663a3caca726bf820307448c361e346016 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Mon, 6 Apr 2020 11:29:30 +0200 Subject: [PATCH 031/131] add "whatlies" to spaCy universe (#5252) * Add "whatlies" We're releasing it on our side officially on the 16th of April. 
If possible, let's announce around the same time :)

* sign contributor thing

* Added fancy gif as the image

* Update universe.json

Spelling error and spaCy clarification.
---
 .github/contributors/koaning.md | 106 ++++++++++++++++++++++++++++
 website/meta/universe.json      |  28 +++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 .github/contributors/koaning.md

diff --git a/.github/contributors/koaning.md b/.github/contributors/koaning.md
new file mode 100644
index 000000000..ddb28cab0
--- /dev/null
+++ b/.github/contributors/koaning.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+  * you hereby assign to us joint ownership, and to the extent that such
+  assignment is or becomes invalid, ineffective or unenforceable, you hereby
+  grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+  royalty-free, unrestricted license to exercise all rights under those
+  copyrights. This includes, at our option, the right to sublicense these same
+  rights to third parties through multiple levels of sublicensees or other
+  licensing arrangements;
+
+  * you agree that each of us can do all things in relation to your
+  contribution as if each of us were the sole owners, and if one of us makes
+  a derivative work of your contribution, the one who makes the derivative
+  work (or has it made) will be the sole owner of that derivative work;
+
+  * you agree that you will not assert any moral rights in your contribution
+  against us, our licensees or transferees;
+
+  * you agree that we may register a copyright in your contribution and
+  exercise all ownership rights associated with it; and
+
+  * you agree that neither of us has any duty to consult with, obtain the
+  consent of, pay or render an accounting to the other for any use or
+  distribution of your contribution.
+
+3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Vincent D. Warmerdam | +| Company name (if applicable) | | +| Title or role (if applicable) | Data Person | +| Date | 2020-03-01 | +| GitHub username | koaning | +| Website (optional) | https://koaning.io | diff --git a/website/meta/universe.json b/website/meta/universe.json index 613648d8c..bbd67e8a6 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,33 @@ { "resources": [ + { + "id": "whatlies", + "title": "whatlies", + "slogan": "Make interactive visualisations to figure out 'what lies' in word embeddings.", + "description": "This small library offers tools to make visualisation easier of both word embeddings as well as operations on them. It has support for spaCy prebuilt models as a first class citizen but also offers support for sense2vec. 
There's a convenient API to perform linear algebra as well as support for popular transformations like PCA/UMAP/etc.",
+      "github": "rasahq/whatlies",
+      "pip": "whatlies",
+      "thumb": "https://i.imgur.com/rOkOiLv.png",
+      "image": "https://raw.githubusercontent.com/RasaHQ/whatlies/master/docs/gif-two.gif",
+      "code_example": [
+        "from whatlies import EmbeddingSet",
+        "from whatlies.language import SpacyLanguage",
+        "",
+        "lang = SpacyLanguage('en_core_web_md')",
+        "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman',",
+        "         'king', 'queen', 'doctor', 'nurse']",
+        "",
+        "emb = lang[words]",
+        "emb.plot_interactive(x_axis='man', y_axis='woman')"
+      ],
+      "category": ["visualizers", "research"],
+      "author": "Vincent D. Warmerdam",
+      "author_links": {
+        "twitter": "fishnets88",
+        "github": "koaning",
+        "website": "https://koaning.io"
+      }
+    },
     {
       "id": "spacy-stanza",
       "title": "spacy-stanza",

From f4ef64a5264d4cd7f57059150cebeda388dd202d Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Mon, 6 Apr 2020 13:18:07 +0200
Subject: [PATCH 032/131] Improve tokenization for UD Dutch corpora (#5259)

* Improve tokenization for UD Dutch corpora

Improve tokenization for UD Dutch Alpino and LassySmall.

* Format Dutch tokenizer exceptions
---
 spacy/lang/nl/__init__.py             |    4 +-
 spacy/lang/nl/punctuation.py          |   41 +-
 spacy/lang/nl/tokenizer_exceptions.py | 1890 +++++++++++++++++++++----
 3 files changed, 1611 insertions(+), 324 deletions(-)

diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 074fd9133..407d23f73 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -5,7 +5,8 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -25,6 +26,7 @@ class DutchDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
 
diff --git a/spacy/lang/nl/punctuation.py b/spacy/lang/nl/punctuation.py
index a48ecc044..e7207038b 100644
--- a/spacy/lang/nl/punctuation.py
+++ b/spacy/lang/nl/punctuation.py
@@ -1,10 +1,14 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_UNITS, merge_chars
+from ..char_classes import LIST_PUNCT, LIST_QUOTES, CURRENCY, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 
-from ..punctuation import TOKENIZER_SUFFIXES as DEFAULT_TOKENIZER_SUFFIXES
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
+
+
+_prefixes = [",,"] + BASE_TOKENIZER_PREFIXES
 
 
 # Copied from `de` package. Main purpose is to ensure that hyphens are not
@@ -22,20 +26,33 @@ _infixes = (
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[0-9])-(?=[0-9])",
     ]
 )
 
-# Remove "'s" suffix from suffix list. In Dutch, "'s" is a plural ending when
-# it occurs as a suffix and a clitic for "eens" in standalone use. 
To avoid -# ambiguity it's better to just leave it attached when it occurs as a suffix. -default_suffix_blacklist = ("'s", "'S", "’s", "’S") -_suffixes = [ - suffix - for suffix in DEFAULT_TOKENIZER_SUFFIXES - if suffix not in default_suffix_blacklist -] +_list_units = [u for u in LIST_UNITS if u != "%"] +_units = merge_chars(" ".join(_list_units)) +_suffixes = ( + ["''"] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + ["—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) + + +TOKENIZER_PREFIXES = _prefixes TOKENIZER_INFIXES = _infixes TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py index dbdd104f3..c0915f127 100644 --- a/spacy/lang/nl/tokenizer_exceptions.py +++ b/spacy/lang/nl/tokenizer_exceptions.py @@ -16,317 +16,1585 @@ from ...symbols import ORTH # are extremely domain-specific. Tokenizer performance may benefit from some # slight pruning, although no performance regression has been observed so far. -# fmt: off -abbrevs = ['a.2d.', 'a.a.', 'a.a.j.b.', 'a.f.t.', 'a.g.j.b.', - 'a.h.v.', 'a.h.w.', 'a.hosp.', 'a.i.', 'a.j.b.', 'a.j.t.', - 'a.m.', 'a.m.r.', 'a.p.m.', 'a.p.r.', 'a.p.t.', 'a.s.', - 'a.t.d.f.', 'a.u.b.', 'a.v.a.', 'a.w.', 'aanbev.', - 'aanbev.comm.', 'aant.', 'aanv.st.', 'aanw.', 'vnw.', - 'aanw.vnw.', 'abd.', 'abm.', 'abs.', 'acc.act.', - 'acc.bedr.m.', 'acc.bedr.t.', 'achterv.', 'act.dr.', - 'act.dr.fam.', 'act.fisc.', 'act.soc.', 'adm.akk.', - 'adm.besl.', 'adm.lex.', 'adm.onderr.', 'adm.ov.', 'adv.', - 'adv.', 'gen.', 'adv.bl.', 'afd.', 'afl.', 'aggl.verord.', - 'agr.', 'al.', 'alg.', 'alg.richts.', 'amén.', 'ann.dr.', - 'ann.dr.lg.', 'ann.dr.sc.pol.', 'ann.ét.eur.', - 'ann.fac.dr.lg.', 'ann.jur.créd.', - 'ann.jur.créd.règl.coll.', 'ann.not.', 'ann.parl.', - 'ann.prat.comm.', 'app.', 'arb.', 'aud.', 'arbbl.', - 'arbh.', 'arbit.besl.', 'arbrb.', 'arr.', 'arr.cass.', - 'arr.r.v.st.', 'arr.verbr.', 'arrondrb.', 'art.', 'artw.', - 'aud.', 'b.', 'b.', 'b.&w.', 'b.a.', 'b.a.s.', 'b.b.o.', - 'b.best.dep.', 'b.br.ex.', 'b.coll.fr.gem.comm.', - 'b.coll.vl.gem.comm.', 'b.d.cult.r.', 'b.d.gem.ex.', - 'b.d.gem.reg.', 'b.dep.', 'b.e.b.', 'b.f.r.', - 'b.fr.gem.ex.', 'b.fr.gem.reg.', 'b.i.h.', 'b.inl.j.d.', - 'b.inl.s.reg.', 'b.j.', 'b.l.', 'b.o.z.', 'b.prov.r.', - 'b.r.h.', 'b.s.', 'b.sr.', 'b.stb.', 'b.t.i.r.', - 'b.t.s.z.', 'b.t.w.rev.', 'b.v.', - 'b.ver.coll.gem.gem.comm.', 'b.verg.r.b.', 'b.versl.', - 'b.vl.ex.', 'b.voorl.reg.', 'b.w.', 'b.w.gew.ex.', - 'b.z.d.g.', 'b.z.v.', 'bab.', 'bedr.org.', 'begins.', - 'beheersov.', 'bekendm.comm.', 'bel.', 'bel.besch.', - 'bel.w.p.', 'beleidsov.', 'belg.', 'grondw.', 'ber.', - 'ber.w.', 'besch.', 'besl.', 'beslagr.', 'bestuurswet.', - 'bet.', 'betr.', 'betr.', 'vnw.', 'bevest.', 'bew.', - 'bijbl.', 'ind.', 'eig.', 'bijbl.n.bijdr.', 'bijl.', - 'bijv.', 'bijw.', 'bijz.decr.', 'bin.b.', 'bkh.', 'bl.', - 'blz.', 'bm.', 'bn.', 'rh.', 'bnw.', 'bouwr.', 'br.parl.', - 'bs.', 'bull.', 'bull.adm.pénit.', 'bull.ass.', - 'bull.b.m.m.', 'bull.bel.', 'bull.best.strafinr.', - 'bull.bmm.', 'bull.c.b.n.', 'bull.c.n.c.', 'bull.cbn.', - 'bull.centr.arb.', 'bull.cnc.', 'bull.contr.', - 'bull.doc.min.fin.', 'bull.f.e.b.', 'bull.feb.', - 'bull.fisc.fin.r.', 'bull.i.u.m.', - 'bull.inf.ass.secr.soc.', 'bull.inf.i.e.c.', - 
'bull.inf.i.n.a.m.i.', 'bull.inf.i.r.e.', 'bull.inf.iec.', - 'bull.inf.inami.', 'bull.inf.ire.', 'bull.inst.arb.', - 'bull.ium.', 'bull.jur.imm.', 'bull.lég.b.', 'bull.off.', - 'bull.trim.b.dr.comp.', 'bull.us.', 'bull.v.b.o.', - 'bull.vbo.', 'bv.', 'bw.', 'bxh.', 'byz.', 'c.', 'c.a.', - 'c.a.-a.', 'c.a.b.g.', 'c.c.', 'c.c.i.', 'c.c.s.', - 'c.conc.jur.', 'c.d.e.', 'c.d.p.k.', 'c.e.', 'c.ex.', - 'c.f.', 'c.h.a.', 'c.i.f.', 'c.i.f.i.c.', 'c.j.', 'c.l.', - 'c.n.', 'c.o.d.', 'c.p.', 'c.pr.civ.', 'c.q.', 'c.r.', - 'c.r.a.', 'c.s.', 'c.s.a.', 'c.s.q.n.', 'c.v.', 'c.v.a.', - 'c.v.o.', 'ca.', 'cadeaust.', 'cah.const.', - 'cah.dr.europ.', 'cah.dr.immo.', 'cah.dr.jud.', 'cal.', - '2d.', 'cal.', '3e.', 'cal.', 'rprt.', 'cap.', 'carg.', - 'cass.', 'cass.', 'verw.', 'cert.', 'cf.', 'ch.', 'chron.', - 'chron.d.s.', 'chron.dr.not.', 'cie.', 'cie.', - 'verz.schr.', 'cir.', 'circ.', 'circ.z.', 'cit.', - 'cit.loc.', 'civ.', 'cl.et.b.', 'cmt.', 'co.', - 'cognoss.v.', 'coll.', 'v.', 'b.', 'colp.w.', 'com.', - 'com.', 'cas.', 'com.v.min.', 'comm.', 'comm.', 'v.', - 'comm.bijz.ov.', 'comm.erf.', 'comm.fin.', 'comm.ger.', - 'comm.handel.', 'comm.pers.', 'comm.pub.', 'comm.straf.', - 'comm.v.', 'comm.venn.', 'comm.verz.', 'comm.voor.', - 'comp.', 'compt.w.', 'computerr.', 'con.m.', 'concl.', - 'concr.', 'conf.', 'confl.w.', 'confl.w.huwbetr.', 'cons.', - 'conv.', 'coöp.', 'ver.', 'corr.', 'corr.bl.', - 'cour.fisc.', 'cour.immo.', 'cridon.', 'crim.', 'cur.', - 'cur.', 'crt.', 'curs.', 'd.', 'd.-g.', 'd.a.', 'd.a.v.', - 'd.b.f.', 'd.c.', 'd.c.c.r.', 'd.d.', 'd.d.p.', 'd.e.t.', - 'd.gem.r.', 'd.h.', 'd.h.z.', 'd.i.', 'd.i.t.', 'd.j.', - 'd.l.r.', 'd.m.', 'd.m.v.', 'd.o.v.', 'd.parl.', 'd.w.z.', - 'dact.', 'dat.', 'dbesch.', 'dbesl.', 'decr.', 'decr.d.', - 'decr.fr.', 'decr.vl.', 'decr.w.', 'def.', 'dep.opv.', - 'dep.rtl.', 'derg.', 'desp.', 'det.mag.', 'deurw.regl.', - 'dez.', 'dgl.', 'dhr.', 'disp.', 'diss.', 'div.', - 'div.act.', 'div.bel.', 'dl.', 'dln.', 'dnotz.', 'doc.', - 'hist.', 'doc.jur.b.', 'doc.min.fin.', 'doc.parl.', - 'doctr.', 'dpl.', 'dpl.besl.', 'dr.', 'dr.banc.fin.', - 'dr.circ.', 'dr.inform.', 'dr.mr.', 'dr.pén.entr.', - 'dr.q.m.', 'drs.', 'dtp.', 'dwz.', 'dyn.', 'e.', 'e.a.', - 'e.b.', 'tek.mod.', 'e.c.', 'e.c.a.', 'e.d.', 'e.e.', - 'e.e.a.', 'e.e.g.', 'e.g.', 'e.g.a.', 'e.h.a.', 'e.i.', - 'e.j.', 'e.m.a.', 'e.n.a.c.', 'e.o.', 'e.p.c.', 'e.r.c.', - 'e.r.f.', 'e.r.h.', 'e.r.o.', 'e.r.p.', 'e.r.v.', - 'e.s.r.a.', 'e.s.t.', 'e.v.', 'e.v.a.', 'e.w.', 'e&o.e.', - 'ec.pol.r.', 'econ.', 'ed.', 'ed(s).', 'eff.', 'eig.', - 'eig.mag.', 'eil.', 'elektr.', 'enmb.', 'enz.', 'err.', - 'etc.', 'etq.', 'eur.', 'parl.', 'eur.t.s.', 'ev.', 'evt.', - 'ex.', 'ex.crim.', 'exec.', 'f.', 'f.a.o.', 'f.a.q.', - 'f.a.s.', 'f.i.b.', 'f.j.f.', 'f.o.b.', 'f.o.r.', 'f.o.s.', - 'f.o.t.', 'f.r.', 'f.supp.', 'f.suppl.', 'fa.', 'facs.', - 'fasc.', 'fg.', 'fid.ber.', 'fig.', 'fin.verh.w.', 'fisc.', - 'fisc.', 'tijdschr.', 'fisc.act.', 'fisc.koer.', 'fl.', - 'form.', 'foro.', 'it.', 'fr.', 'fr.cult.r.', 'fr.gem.r.', - 'fr.parl.', 'fra.', 'ft.', 'g.', 'g.a.', 'g.a.v.', - 'g.a.w.v.', 'g.g.d.', 'g.m.t.', 'g.o.', 'g.omt.e.', 'g.p.', - 'g.s.', 'g.v.', 'g.w.w.', 'geb.', 'gebr.', 'gebrs.', - 'gec.', 'gec.decr.', 'ged.', 'ged.st.', 'gedipl.', - 'gedr.st.', 'geh.', 'gem.', 'gem.', 'gem.', - 'gem.gem.comm.', 'gem.st.', 'gem.stem.', 'gem.w.', - 'gemeensch.optr.', 'gemeensch.standp.', 'gemeensch.strat.', - 'gemeent.', 'gemeent.b.', 'gemeent.regl.', - 'gemeent.verord.', 'geol.', 'geopp.', 'gepubl.', - 'ger.deurw.', 'ger.w.', 'gerekw.', 'gereq.', 
'gesch.', - 'get.', 'getr.', 'gev.m.', 'gev.maatr.', 'gew.', 'ghert.', - 'gir.eff.verk.', 'gk.', 'gr.', 'gramm.', 'grat.w.', - 'grootb.w.', 'grs.', 'grvm.', 'grw.', 'gst.', 'gw.', - 'h.a.', 'h.a.v.o.', 'h.b.o.', 'h.e.a.o.', 'h.e.g.a.', - 'h.e.geb.', 'h.e.gestr.', 'h.l.', 'h.m.', 'h.o.', 'h.r.', - 'h.t.l.', 'h.t.m.', 'h.w.geb.', 'hand.', 'handelsn.w.', - 'handelspr.', 'handelsr.w.', 'handelsreg.w.', 'handv.', - 'harv.l.rev.', 'hc.', 'herald.', 'hert.', 'herz.', - 'hfdst.', 'hfst.', 'hgrw.', 'hhr.', 'hist.', 'hooggel.', - 'hoogl.', 'hosp.', 'hpw.', 'hr.', 'hr.', 'ms.', 'hr.ms.', - 'hregw.', 'hrg.', 'hst.', 'huis.just.', 'huisv.w.', - 'huurbl.', 'hv.vn.', 'hw.', 'hyp.w.', 'i.b.s.', 'i.c.', - 'i.c.m.h.', 'i.e.', 'i.f.', 'i.f.p.', 'i.g.v.', 'i.h.', - 'i.h.a.', 'i.h.b.', 'i.l.pr.', 'i.o.', 'i.p.o.', 'i.p.r.', - 'i.p.v.', 'i.pl.v.', 'i.r.d.i.', 'i.s.m.', 'i.t.t.', - 'i.v.', 'i.v.m.', 'i.v.s.', 'i.w.tr.', 'i.z.', 'ib.', - 'ibid.', 'icip-ing.cons.', 'iem.', 'indic.soc.', 'indiv.', - 'inf.', 'inf.i.d.a.c.', 'inf.idac.', 'inf.r.i.z.i.v.', - 'inf.riziv.', 'inf.soc.secr.', 'ing.', 'ing.', 'cons.', - 'ing.cons.', 'inst.', 'int.', 'int.', 'rechtsh.', - 'strafz.', 'interm.', 'intern.fisc.act.', - 'intern.vervoerr.', 'inv.', 'inv.', 'f.', 'inv.w.', - 'inv.wet.', 'invord.w.', 'inz.', 'ir.', 'irspr.', 'iwtr.', - 'j.', 'j.-cl.', 'j.c.b.', 'j.c.e.', 'j.c.fl.', 'j.c.j.', - 'j.c.p.', 'j.d.e.', 'j.d.f.', 'j.d.s.c.', 'j.dr.jeun.', - 'j.j.d.', 'j.j.p.', 'j.j.pol.', 'j.l.', 'j.l.m.b.', - 'j.l.o.', 'j.p.a.', 'j.r.s.', 'j.t.', 'j.t.d.e.', - 'j.t.dr.eur.', 'j.t.o.', 'j.t.t.', 'jaarl.', 'jb.hand.', - 'jb.kred.', 'jb.kred.c.s.', 'jb.l.r.b.', 'jb.lrb.', - 'jb.markt.', 'jb.mens.', 'jb.t.r.d.', 'jb.trd.', - 'jeugdrb.', 'jeugdwerkg.w.', 'jg.', 'jis.', 'jl.', - 'journ.jur.', 'journ.prat.dr.fisc.fin.', 'journ.proc.', - 'jrg.', 'jur.', 'jur.comm.fl.', 'jur.dr.soc.b.l.n.', - 'jur.f.p.e.', 'jur.fpe.', 'jur.niv.', 'jur.trav.brux.', - 'jurambt.', 'jv.cass.', 'jv.h.r.j.', 'jv.hrj.', 'jw.', - 'k.', 'k.', 'k.b.', 'k.g.', 'k.k.', 'k.m.b.o.', 'k.o.o.', - 'k.v.k.', 'k.v.v.v.', 'kadasterw.', 'kaderb.', 'kador.', - 'kbo-nr.', 'kg.', 'kh.', 'kiesw.', 'kind.bes.v.', 'kkr.', - 'koopv.', 'kr.', 'krankz.w.', 'ksbel.', 'kt.', 'ktg.', - 'ktr.', 'kvdm.', 'kw.r.', 'kymr.', 'kzr.', 'kzw.', 'l.', - 'l.b.', 'l.b.o.', 'l.bas.', 'l.c.', 'l.gew.', 'l.j.', - 'l.k.', 'l.l.', 'l.o.', 'l.r.b.', 'l.u.v.i.', 'l.v.r.', - 'l.v.w.', 'l.w.', "l'exp.-compt.b..", 'l’exp.-compt.b.', - 'landinr.w.', 'landscrt.', 'lat.', 'law.ed.', 'lett.', - 'levensverz.', 'lgrs.', 'lidw.', 'limb.rechtsl.', 'lit.', - 'litt.', 'liw.', 'liwet.', 'lk.', 'll.', 'll.(l.)l.r.', - 'loonw.', 'losbl.', 'ltd.', 'luchtv.', 'luchtv.w.', 'm.', - 'm.', 'not.', 'm.a.v.o.', 'm.a.w.', 'm.b.', 'm.b.o.', - 'm.b.r.', 'm.b.t.', 'm.d.g.o.', 'm.e.a.o.', 'm.e.r.', - 'm.h.', 'm.h.d.', 'm.i.v.', 'm.j.t.', 'm.k.', 'm.m.', - 'm.m.a.', 'm.m.h.h.', 'm.m.v.', 'm.n.', 'm.not.fisc.', - 'm.nt.', 'm.o.', 'm.r.', 'm.s.a.', 'm.u.p.', 'm.v.a.', - 'm.v.h.n.', 'm.v.t.', 'm.z.', 'maatr.teboekgest.luchtv.', - 'maced.', 'mand.', 'max.', 'mbl.not.', 'me.', 'med.', - 'med.', 'v.b.o.', 'med.b.u.f.r.', 'med.bufr.', 'med.vbo.', - 'meerv.', 'meetbr.w.', 'mém.adm.', 'mgr.', 'mgrs.', 'mhd.', - 'mi.verantw.', 'mil.', 'mil.bed.', 'mil.ger.', 'min.', - 'min.', 'aanbev.', 'min.', 'circ.', 'min.', 'fin.', - 'min.j.omz.', 'min.just.circ.', 'mitt.', 'mnd.', 'mod.', - 'mon.', 'mouv.comm.', 'mr.', 'ms.', 'muz.', 'mv.', 'n.', - 'chr.', 'n.a.', 'n.a.g.', 'n.a.v.', 'n.b.', 'n.c.', - 'n.chr.', 'n.d.', 'n.d.r.', 'n.e.a.', 'n.g.', 'n.h.b.c.', - 
'n.j.', 'n.j.b.', 'n.j.w.', 'n.l.', 'n.m.', 'n.m.m.', - 'n.n.', 'n.n.b.', 'n.n.g.', 'n.n.k.', 'n.o.m.', 'n.o.t.k.', - 'n.rapp.', 'n.tijd.pol.', 'n.v.', 'n.v.d.r.', 'n.v.d.v.', - 'n.v.o.b.', 'n.v.t.', 'nat.besch.w.', 'nat.omb.', - 'nat.pers.', 'ned.cult.r.', 'neg.verkl.', 'nhd.', 'wisk.', - 'njcm-bull.', 'nl.', 'nnd.', 'no.', 'not.fisc.m.', - 'not.w.', 'not.wet.', 'nr.', 'nrs.', 'nste.', 'nt.', - 'numism.', 'o.', 'o.a.', 'o.b.', 'o.c.', 'o.g.', 'o.g.v.', - 'o.i.', 'o.i.d.', 'o.m.', 'o.o.', 'o.o.d.', 'o.o.v.', - 'o.p.', 'o.r.', 'o.regl.', 'o.s.', 'o.t.s.', 'o.t.t.', - 'o.t.t.t.', 'o.t.t.z.', 'o.tk.t.', 'o.v.t.', 'o.v.t.t.', - 'o.v.tk.t.', 'o.v.v.', 'ob.', 'obsv.', 'octr.', - 'octr.gem.regl.', 'octr.regl.', 'oe.', 'off.pol.', 'ofra.', - 'ohd.', 'omb.', 'omnil.', 'omz.', 'on.ww.', 'onderr.', - 'onfrank.', 'onteig.w.', 'ontw.', 'b.w.', 'onuitg.', - 'onz.', 'oorl.w.', 'op.cit.', 'opin.pa.', 'opm.', 'or.', - 'ord.br.', 'ord.gem.', 'ors.', 'orth.', 'os.', 'osm.', - 'ov.', 'ov.w.i.', 'ov.w.ii.', 'ov.ww.', 'overg.w.', - 'overw.', 'ovkst.', 'oz.', 'p.', 'p.a.', 'p.a.o.', - 'p.b.o.', 'p.e.', 'p.g.', 'p.j.', 'p.m.', 'p.m.a.', 'p.o.', - 'p.o.j.t.', 'p.p.', 'p.v.', 'p.v.s.', 'pachtw.', 'pag.', - 'pan.', 'pand.b.', 'pand.pér.', 'parl.gesch.', - 'parl.gesch.', 'inv.', 'parl.st.', 'part.arb.', 'pas.', - 'pasin.', 'pat.', 'pb.c.', 'pb.l.', 'pens.', - 'pensioenverz.', 'per.ber.i.b.r.', 'per.ber.ibr.', 'pers.', - 'st.', 'pft.', 'pk.', 'pktg.', 'plv.', 'po.', 'pol.', - 'pol.off.', 'pol.r.', 'pol.w.', 'postbankw.', 'postw.', - 'pp.', 'pr.', 'preadv.', 'pres.', 'prf.', 'prft.', 'prg.', - 'prijz.w.', 'proc.', 'procesregl.', 'prof.', 'prot.', - 'prov.', 'prov.b.', 'prov.instr.h.m.g.', 'prov.regl.', - 'prov.verord.', 'prov.w.', 'publ.', 'pun.', 'pw.', - 'q.b.d.', 'q.e.d.', 'q.q.', 'q.r.', 'r.', 'r.a.b.g.', - 'r.a.c.e.', 'r.a.j.b.', 'r.b.d.c.', 'r.b.d.i.', 'r.b.s.s.', - 'r.c.', 'r.c.b.', 'r.c.d.c.', 'r.c.j.b.', 'r.c.s.j.', - 'r.cass.', 'r.d.c.', 'r.d.i.', 'r.d.i.d.c.', 'r.d.j.b.', - 'r.d.j.p.', 'r.d.p.c.', 'r.d.s.', 'r.d.t.i.', 'r.e.', - 'r.f.s.v.p.', 'r.g.a.r.', 'r.g.c.f.', 'r.g.d.c.', 'r.g.f.', - 'r.g.z.', 'r.h.a.', 'r.i.c.', 'r.i.d.a.', 'r.i.e.j.', - 'r.i.n.', 'r.i.s.a.', 'r.j.d.a.', 'r.j.i.', 'r.k.', 'r.l.', - 'r.l.g.b.', 'r.med.', 'r.med.rechtspr.', 'r.n.b.', 'r.o.', - 'r.ov.', 'r.p.', 'r.p.d.b.', 'r.p.o.t.', 'r.p.r.j.', - 'r.p.s.', 'r.r.d.', 'r.r.s.', 'r.s.', 'r.s.v.p.', - 'r.stvb.', 'r.t.d.f.', 'r.t.d.h.', 'r.t.l.', - 'r.trim.dr.eur.', 'r.v.a.', 'r.verkb.', 'r.w.', 'r.w.d.', - 'rap.ann.c.a.', 'rap.ann.c.c.', 'rap.ann.c.e.', - 'rap.ann.c.s.j.', 'rap.ann.ca.', 'rap.ann.cass.', - 'rap.ann.cc.', 'rap.ann.ce.', 'rap.ann.csj.', 'rapp.', - 'rb.', 'rb.kh.', 'rdn.', 'rdnr.', 're.pers.', 'rec.', - 'rec.c.i.j.', 'rec.c.j.c.e.', 'rec.cij.', 'rec.cjce.', - 'rec.gén.enr.not.', 'rechtsk.t.', 'rechtspl.zeem.', - 'rechtspr.arb.br.', 'rechtspr.b.f.e.', 'rechtspr.bfe.', - 'rechtspr.soc.r.b.l.n.', 'recl.reg.', 'rect.', 'red.', - 'reg.', 'reg.huiz.bew.', 'reg.w.', 'registr.w.', 'regl.', - 'regl.', 'r.v.k.', 'regl.besl.', 'regl.onderr.', - 'regl.r.t.', 'rep.', 'rép.fisc.', 'rép.not.', 'rep.r.j.', - 'rep.rj.', 'req.', 'res.', 'resp.', 'rev.', 'rev.', - 'comp.', 'rev.', 'trim.', 'civ.', 'rev.', 'trim.', 'comm.', - 'rev.acc.trav.', 'rev.adm.', 'rev.b.compt.', - 'rev.b.dr.const.', 'rev.b.dr.intern.', 'rev.b.séc.soc.', - 'rev.banc.fin.', 'rev.comm.', 'rev.cons.prud.', - 'rev.dr.b.', 'rev.dr.commun.', 'rev.dr.étr.', - 'rev.dr.fam.', 'rev.dr.intern.comp.', 'rev.dr.mil.', - 'rev.dr.min.', 'rev.dr.pén.', 'rev.dr.pén.mil.', - 'rev.dr.rur.', 
'rev.dr.u.l.b.', 'rev.dr.ulb.', 'rev.exp.', - 'rev.faill.', 'rev.fisc.', 'rev.gd.', 'rev.hist.dr.', - 'rev.i.p.c.', 'rev.ipc.', 'rev.not.b.', - 'rev.prat.dr.comm.', 'rev.prat.not.b.', 'rev.prat.soc.', - 'rev.rec.', 'rev.rw.', 'rev.trav.', 'rev.trim.d.h.', - 'rev.trim.dr.fam.', 'rev.urb.', 'richtl.', 'riv.dir.int.', - 'riv.dir.int.priv.proc.', 'rk.', 'rln.', 'roln.', 'rom.', - 'rondz.', 'rov.', 'rtl.', 'rubr.', 'ruilv.wet.', - 'rv.verdr.', 'rvkb.', 's.', 's.', 's.a.', 's.b.n.', - 's.ct.', 's.d.', 's.e.c.', 's.e.et.o.', 's.e.w.', - 's.exec.rept.', 's.hrg.', 's.j.b.', 's.l.', 's.l.e.a.', - 's.l.n.d.', 's.p.a.', 's.s.', 's.t.', 's.t.b.', 's.v.', - 's.v.p.', 'samenw.', 'sc.', 'sch.', 'scheidsr.uitspr.', - 'schepel.besl.', 'secr.comm.', 'secr.gen.', 'sect.soc.', - 'sess.', 'cas.', 'sir.', 'soc.', 'best.', 'soc.', 'handv.', - 'soc.', 'verz.', 'soc.act.', 'soc.best.', 'soc.kron.', - 'soc.r.', 'soc.sw.', 'soc.weg.', 'sofi-nr.', 'somm.', - 'somm.ann.', 'sp.c.c.', 'sr.', 'ss.', 'st.doc.b.c.n.a.r.', - 'st.doc.bcnar.', 'st.vw.', 'stagever.', 'stas.', 'stat.', - 'stb.', 'stbl.', 'stcrt.', 'stud.dipl.', 'su.', 'subs.', - 'subst.', 'succ.w.', 'suppl.', 'sv.', 'sw.', 't.', 't.a.', - 't.a.a.', 't.a.n.', 't.a.p.', 't.a.s.n.', 't.a.v.', - 't.a.v.w.', 't.aann.', 't.acc.', 't.agr.r.', 't.app.', - 't.b.b.r.', 't.b.h.', 't.b.m.', 't.b.o.', 't.b.p.', - 't.b.r.', 't.b.s.', 't.b.v.', 't.bankw.', 't.belg.not.', - 't.desk.', 't.e.m.', 't.e.p.', 't.f.r.', 't.fam.', - 't.fin.r.', 't.g.r.', 't.g.t.', 't.g.v.', 't.gem.', - 't.gez.', 't.huur.', 't.i.n.', 't.j.k.', 't.l.l.', - 't.l.v.', 't.m.', 't.m.r.', 't.m.w.', 't.mil.r.', - 't.mil.strafr.', 't.not.', 't.o.', 't.o.r.b.', 't.o.v.', - 't.ontv.', 't.p.r.', 't.pol.', 't.r.', 't.r.g.', - 't.r.o.s.', 't.r.v.', 't.s.r.', 't.strafr.', 't.t.', - 't.u.', 't.v.c.', 't.v.g.', 't.v.m.r.', 't.v.o.', 't.v.v.', - 't.v.v.d.b.', 't.v.w.', 't.verz.', 't.vred.', 't.vreemd.', - 't.w.', 't.w.k.', 't.w.v.', 't.w.v.r.', 't.wrr.', 't.z.', - 't.z.t.', 't.z.v.', 'taalk.', 'tar.burg.z.', 'td.', - 'techn.', 'telecomm.', 'toel.', 'toel.st.v.w.', 'toep.', - 'toep.regl.', 'tom.', 'top.', 'trans.b.', 'transp.r.', - 'trb.', 'trib.', 'trib.civ.', 'trib.gr.inst.', 'ts.', - 'ts.', 'best.', 'ts.', 'verv.', 'turnh.rechtsl.', 'tvpol.', - 'tvpr.', 'tvrechtsgesch.', 'tw.', 'u.', 'u.a.', 'u.a.r.', - 'u.a.v.', 'u.c.', 'u.c.c.', 'u.g.', 'u.p.', 'u.s.', - 'u.s.d.c.', 'uitdr.', 'uitl.w.', 'uitv.besch.div.b.', - 'uitv.besl.', 'uitv.besl.', 'succ.w.', 'uitv.besl.bel.rv.', - 'uitv.besl.l.b.', 'uitv.reg.', 'inv.w.', 'uitv.reg.bel.d.', - 'uitv.reg.afd.verm.', 'uitv.reg.lb.', 'uitv.reg.succ.w.', - 'univ.', 'univ.verkl.', 'v.', 'v.', 'chr.', 'v.a.', - 'v.a.v.', 'v.c.', 'v.chr.', 'v.h.', 'v.huw.verm.', 'v.i.', - 'v.i.o.', 'v.k.a.', 'v.m.', 'v.o.f.', 'v.o.n.', - 'v.onderh.verpl.', 'v.p.', 'v.r.', 'v.s.o.', 'v.t.t.', - 'v.t.t.t.', 'v.tk.t.', 'v.toep.r.vert.', 'v.v.b.', - 'v.v.g.', 'v.v.t.', 'v.v.t.t.', 'v.v.tk.t.', 'v.w.b.', - 'v.z.m.', 'vb.', 'vb.bo.', 'vbb.', 'vc.', 'vd.', 'veldw.', - 'ver.k.', 'ver.verg.gem.', 'gem.comm.', 'verbr.', 'verd.', - 'verdr.', 'verdr.v.', 'tek.mod.', 'verenw.', 'verg.', - 'verg.fr.gem.', 'comm.', 'verkl.', 'verkl.herz.gw.', - 'verl.', 'deelw.', 'vern.', 'verord.', 'vers.r.', - 'versch.', 'versl.c.s.w.', 'versl.csw.', 'vert.', 'verw.', - 'verz.', 'verz.w.', 'verz.wett.besl.', - 'verz.wett.decr.besl.', 'vgl.', 'vid.', 'viss.w.', - 'vl.parl.', 'vl.r.', 'vl.t.gez.', 'vl.w.reg.', - 'vl.w.succ.', 'vlg.', 'vn.', 'vnl.', 'vnw.', 'vo.', - 'vo.bl.', 'voegw.', 'vol.', 'volg.', 'volt.', 'deelw.', - 'voorl.', 
'voorz.', 'vord.w.', 'vorst.d.', 'vr.', 'vred.', - 'vrg.', 'vnw.', 'vrijgrs.', 'vs.', 'vt.', 'vw.', 'vz.', - 'vzngr.', 'vzr.', 'w.', 'w.a.', 'w.b.r.', 'w.c.h.', - 'w.conf.huw.', 'w.conf.huwelijksb.', 'w.consum.kr.', - 'w.f.r.', 'w.g.', 'w.gew.r.', 'w.ident.pl.', 'w.just.doc.', - 'w.kh.', 'w.l.r.', 'w.l.v.', 'w.mil.straf.spr.', 'w.n.', - 'w.not.ambt.', 'w.o.', 'w.o.d.huurcomm.', 'w.o.d.k.', - 'w.openb.manif.', 'w.parl.', 'w.r.', 'w.reg.', 'w.succ.', - 'w.u.b.', 'w.uitv.pl.verord.', 'w.v.', 'w.v.k.', - 'w.v.m.s.', 'w.v.r.', 'w.v.w.', 'w.venn.', 'wac.', 'wd.', - 'wetb.', 'n.v.h.', 'wgb.', 'winkelt.w.', 'wisk.', - 'wka-verkl.', 'wnd.', 'won.w.', 'woningw.', 'woonr.w.', - 'wrr.', 'wrr.ber.', 'wrsch.', 'ws.', 'wsch.', 'wsr.', - 'wtvb.', 'ww.', 'x.d.', 'z.a.', 'z.g.', 'z.i.', 'z.j.', - 'z.o.z.', 'z.p.', 'z.s.m.', 'zg.', 'zgn.', 'zn.', 'znw.', - 'zr.', 'zr.', 'ms.', 'zr.ms.'] -# fmt: on +abbrevs = [ + "a.2d.", + "a.a.", + "a.a.j.b.", + "a.f.t.", + "a.g.j.b.", + "a.h.v.", + "a.h.w.", + "a.hosp.", + "a.i.", + "a.j.b.", + "a.j.t.", + "a.m.", + "a.m.r.", + "a.p.m.", + "a.p.r.", + "a.p.t.", + "a.s.", + "a.t.d.f.", + "a.u.b.", + "a.v.a.", + "a.w.", + "aanbev.", + "aanbev.comm.", + "aant.", + "aanv.st.", + "aanw.", + "vnw.", + "aanw.vnw.", + "abd.", + "abm.", + "abs.", + "acc.act.", + "acc.bedr.m.", + "acc.bedr.t.", + "achterv.", + "act.dr.", + "act.dr.fam.", + "act.fisc.", + "act.soc.", + "adm.akk.", + "adm.besl.", + "adm.lex.", + "adm.onderr.", + "adm.ov.", + "adv.", + "adv.", + "gen.", + "adv.bl.", + "afd.", + "afl.", + "aggl.verord.", + "agr.", + "al.", + "alg.", + "alg.richts.", + "amén.", + "ann.dr.", + "ann.dr.lg.", + "ann.dr.sc.pol.", + "ann.ét.eur.", + "ann.fac.dr.lg.", + "ann.jur.créd.", + "ann.jur.créd.règl.coll.", + "ann.not.", + "ann.parl.", + "ann.prat.comm.", + "app.", + "arb.", + "aud.", + "arbbl.", + "arbh.", + "arbit.besl.", + "arbrb.", + "arr.", + "arr.cass.", + "arr.r.v.st.", + "arr.verbr.", + "arrondrb.", + "art.", + "artw.", + "aud.", + "b.", + "b.", + "b.&w.", + "b.a.", + "b.a.s.", + "b.b.o.", + "b.best.dep.", + "b.br.ex.", + "b.coll.fr.gem.comm.", + "b.coll.vl.gem.comm.", + "b.d.cult.r.", + "b.d.gem.ex.", + "b.d.gem.reg.", + "b.dep.", + "b.e.b.", + "b.f.r.", + "b.fr.gem.ex.", + "b.fr.gem.reg.", + "b.i.h.", + "b.inl.j.d.", + "b.inl.s.reg.", + "b.j.", + "b.l.", + "b.o.z.", + "b.prov.r.", + "b.r.h.", + "b.s.", + "b.sr.", + "b.stb.", + "b.t.i.r.", + "b.t.s.z.", + "b.t.w.rev.", + "b.v.", + "b.ver.coll.gem.gem.comm.", + "b.verg.r.b.", + "b.versl.", + "b.vl.ex.", + "b.voorl.reg.", + "b.w.", + "b.w.gew.ex.", + "b.z.d.g.", + "b.z.v.", + "bab.", + "bedr.org.", + "begins.", + "beheersov.", + "bekendm.comm.", + "bel.", + "bel.besch.", + "bel.w.p.", + "beleidsov.", + "belg.", + "grondw.", + "ber.", + "ber.w.", + "besch.", + "besl.", + "beslagr.", + "bestuurswet.", + "bet.", + "betr.", + "betr.", + "vnw.", + "bevest.", + "bew.", + "bijbl.", + "ind.", + "eig.", + "bijbl.n.bijdr.", + "bijl.", + "bijv.", + "bijw.", + "bijz.decr.", + "bin.b.", + "bkh.", + "bl.", + "blz.", + "bm.", + "bn.", + "rh.", + "bnw.", + "bouwr.", + "br.parl.", + "bs.", + "bull.", + "bull.adm.pénit.", + "bull.ass.", + "bull.b.m.m.", + "bull.bel.", + "bull.best.strafinr.", + "bull.bmm.", + "bull.c.b.n.", + "bull.c.n.c.", + "bull.cbn.", + "bull.centr.arb.", + "bull.cnc.", + "bull.contr.", + "bull.doc.min.fin.", + "bull.f.e.b.", + "bull.feb.", + "bull.fisc.fin.r.", + "bull.i.u.m.", + "bull.inf.ass.secr.soc.", + "bull.inf.i.e.c.", + "bull.inf.i.n.a.m.i.", + "bull.inf.i.r.e.", + "bull.inf.iec.", + "bull.inf.inami.", + 
"bull.inf.ire.", + "bull.inst.arb.", + "bull.ium.", + "bull.jur.imm.", + "bull.lég.b.", + "bull.off.", + "bull.trim.b.dr.comp.", + "bull.us.", + "bull.v.b.o.", + "bull.vbo.", + "bv.", + "bw.", + "bxh.", + "byz.", + "c.", + "c.a.", + "c.a.-a.", + "c.a.b.g.", + "c.c.", + "c.c.i.", + "c.c.s.", + "c.conc.jur.", + "c.d.e.", + "c.d.p.k.", + "c.e.", + "c.ex.", + "c.f.", + "c.h.a.", + "c.i.f.", + "c.i.f.i.c.", + "c.j.", + "c.l.", + "c.n.", + "c.o.d.", + "c.p.", + "c.pr.civ.", + "c.q.", + "c.r.", + "c.r.a.", + "c.s.", + "c.s.a.", + "c.s.q.n.", + "c.v.", + "c.v.a.", + "c.v.o.", + "ca.", + "cadeaust.", + "cah.const.", + "cah.dr.europ.", + "cah.dr.immo.", + "cah.dr.jud.", + "cal.", + "2d.", + "cal.", + "3e.", + "cal.", + "rprt.", + "cap.", + "carg.", + "cass.", + "cass.", + "verw.", + "cert.", + "cf.", + "ch.", + "chron.", + "chron.d.s.", + "chron.dr.not.", + "cie.", + "cie.", + "verz.schr.", + "cir.", + "circ.", + "circ.z.", + "cit.", + "cit.loc.", + "civ.", + "cl.et.b.", + "cmt.", + "co.", + "cognoss.v.", + "coll.", + "v.", + "b.", + "colp.w.", + "com.", + "com.", + "cas.", + "com.v.min.", + "comm.", + "comm.", + "v.", + "comm.bijz.ov.", + "comm.erf.", + "comm.fin.", + "comm.ger.", + "comm.handel.", + "comm.pers.", + "comm.pub.", + "comm.straf.", + "comm.v.", + "comm.venn.", + "comm.verz.", + "comm.voor.", + "comp.", + "compt.w.", + "computerr.", + "con.m.", + "concl.", + "concr.", + "conf.", + "confl.w.", + "confl.w.huwbetr.", + "cons.", + "conv.", + "coöp.", + "ver.", + "corr.", + "corr.bl.", + "cour.fisc.", + "cour.immo.", + "cridon.", + "crim.", + "cur.", + "cur.", + "crt.", + "curs.", + "d.", + "d.-g.", + "d.a.", + "d.a.v.", + "d.b.f.", + "d.c.", + "d.c.c.r.", + "d.d.", + "d.d.p.", + "d.e.t.", + "d.gem.r.", + "d.h.", + "d.h.z.", + "d.i.", + "d.i.t.", + "d.j.", + "d.l.r.", + "d.m.", + "d.m.v.", + "d.o.v.", + "d.parl.", + "d.w.z.", + "dact.", + "dat.", + "dbesch.", + "dbesl.", + "dec.", + "decr.", + "decr.d.", + "decr.fr.", + "decr.vl.", + "decr.w.", + "def.", + "dep.opv.", + "dep.rtl.", + "derg.", + "desp.", + "det.mag.", + "deurw.regl.", + "dez.", + "dgl.", + "dhr.", + "disp.", + "diss.", + "div.", + "div.act.", + "div.bel.", + "dl.", + "dln.", + "dnotz.", + "doc.", + "hist.", + "doc.jur.b.", + "doc.min.fin.", + "doc.parl.", + "doctr.", + "dpl.", + "dpl.besl.", + "dr.", + "dr.banc.fin.", + "dr.circ.", + "dr.inform.", + "dr.mr.", + "dr.pén.entr.", + "dr.q.m.", + "drs.", + "ds.", + "dtp.", + "dwz.", + "dyn.", + "e.", + "e.a.", + "e.b.", + "tek.mod.", + "e.c.", + "e.c.a.", + "e.d.", + "e.e.", + "e.e.a.", + "e.e.g.", + "e.g.", + "e.g.a.", + "e.h.a.", + "e.i.", + "e.j.", + "e.m.a.", + "e.n.a.c.", + "e.o.", + "e.p.c.", + "e.r.c.", + "e.r.f.", + "e.r.h.", + "e.r.o.", + "e.r.p.", + "e.r.v.", + "e.s.r.a.", + "e.s.t.", + "e.v.", + "e.v.a.", + "e.w.", + "e&o.e.", + "ec.pol.r.", + "econ.", + "ed.", + "ed(s).", + "eff.", + "eig.", + "eig.mag.", + "eil.", + "elektr.", + "enmb.", + "enz.", + "err.", + "etc.", + "etq.", + "eur.", + "parl.", + "eur.t.s.", + "ev.", + "evt.", + "ex.", + "ex.crim.", + "exec.", + "f.", + "f.a.o.", + "f.a.q.", + "f.a.s.", + "f.i.b.", + "f.j.f.", + "f.o.b.", + "f.o.r.", + "f.o.s.", + "f.o.t.", + "f.r.", + "f.supp.", + "f.suppl.", + "fa.", + "facs.", + "fasc.", + "fg.", + "fid.ber.", + "fig.", + "fin.verh.w.", + "fisc.", + "fisc.", + "tijdschr.", + "fisc.act.", + "fisc.koer.", + "fl.", + "form.", + "foro.", + "it.", + "fr.", + "fr.cult.r.", + "fr.gem.r.", + "fr.parl.", + "fra.", + "ft.", + "g.", + "g.a.", + "g.a.v.", + "g.a.w.v.", + "g.g.d.", + "g.m.t.", + "g.o.", + "g.omt.e.", + 
"g.p.", + "g.s.", + "g.v.", + "g.w.w.", + "geb.", + "gebr.", + "gebrs.", + "gec.", + "gec.decr.", + "ged.", + "ged.st.", + "gedipl.", + "gedr.st.", + "geh.", + "gem.", + "gem.", + "gem.", + "gem.gem.comm.", + "gem.st.", + "gem.stem.", + "gem.w.", + "gemeensch.optr.", + "gemeensch.standp.", + "gemeensch.strat.", + "gemeent.", + "gemeent.b.", + "gemeent.regl.", + "gemeent.verord.", + "geol.", + "geopp.", + "gepubl.", + "ger.deurw.", + "ger.w.", + "gerekw.", + "gereq.", + "gesch.", + "get.", + "getr.", + "gev.m.", + "gev.maatr.", + "gew.", + "ghert.", + "gir.eff.verk.", + "gk.", + "gr.", + "gramm.", + "grat.w.", + "grootb.w.", + "grs.", + "grvm.", + "grw.", + "gst.", + "gw.", + "h.a.", + "h.a.v.o.", + "h.b.o.", + "h.e.a.o.", + "h.e.g.a.", + "h.e.geb.", + "h.e.gestr.", + "h.l.", + "h.m.", + "h.o.", + "h.r.", + "h.t.l.", + "h.t.m.", + "h.w.geb.", + "hand.", + "handelsn.w.", + "handelspr.", + "handelsr.w.", + "handelsreg.w.", + "handv.", + "harv.l.rev.", + "hc.", + "herald.", + "hert.", + "herz.", + "hfdst.", + "hfst.", + "hgrw.", + "hhr.", + "hist.", + "hooggel.", + "hoogl.", + "hosp.", + "hpw.", + "hr.", + "hr.", + "ms.", + "hr.ms.", + "hregw.", + "hrg.", + "hst.", + "huis.just.", + "huisv.w.", + "huurbl.", + "hv.vn.", + "hw.", + "hyp.w.", + "i.b.s.", + "i.c.", + "i.c.m.h.", + "i.e.", + "i.f.", + "i.f.p.", + "i.g.v.", + "i.h.", + "i.h.a.", + "i.h.b.", + "i.l.pr.", + "i.o.", + "i.p.o.", + "i.p.r.", + "i.p.v.", + "i.pl.v.", + "i.r.d.i.", + "i.s.m.", + "i.t.t.", + "i.v.", + "i.v.m.", + "i.v.s.", + "i.w.tr.", + "i.z.", + "ib.", + "ibid.", + "icip-ing.cons.", + "iem.", + "inc.", + "indic.soc.", + "indiv.", + "inf.", + "inf.i.d.a.c.", + "inf.idac.", + "inf.r.i.z.i.v.", + "inf.riziv.", + "inf.soc.secr.", + "ing.", + "ing.", + "cons.", + "ing.cons.", + "inst.", + "int.", + "int.", + "rechtsh.", + "strafz.", + "interm.", + "intern.fisc.act.", + "intern.vervoerr.", + "inv.", + "inv.", + "f.", + "inv.w.", + "inv.wet.", + "invord.w.", + "inz.", + "ir.", + "irspr.", + "iwtr.", + "j.", + "j.-cl.", + "j.c.b.", + "j.c.e.", + "j.c.fl.", + "j.c.j.", + "j.c.p.", + "j.d.e.", + "j.d.f.", + "j.d.s.c.", + "j.dr.jeun.", + "j.j.d.", + "j.j.p.", + "j.j.pol.", + "j.l.", + "j.l.m.b.", + "j.l.o.", + "j.p.a.", + "j.r.s.", + "j.t.", + "j.t.d.e.", + "j.t.dr.eur.", + "j.t.o.", + "j.t.t.", + "jaarl.", + "jb.hand.", + "jb.kred.", + "jb.kred.c.s.", + "jb.l.r.b.", + "jb.lrb.", + "jb.markt.", + "jb.mens.", + "jb.t.r.d.", + "jb.trd.", + "jeugdrb.", + "jeugdwerkg.w.", + "jhr.", + "jg.", + "jis.", + "jl.", + "journ.jur.", + "journ.prat.dr.fisc.fin.", + "journ.proc.", + "jr.", + "jrg.", + "jur.", + "jur.comm.fl.", + "jur.dr.soc.b.l.n.", + "jur.f.p.e.", + "jur.fpe.", + "jur.niv.", + "jur.trav.brux.", + "jurambt.", + "jv.cass.", + "jv.h.r.j.", + "jv.hrj.", + "jw.", + "k.", + "k.", + "k.b.", + "k.g.", + "k.k.", + "k.m.b.o.", + "k.o.o.", + "k.v.k.", + "k.v.v.v.", + "kadasterw.", + "kaderb.", + "kador.", + "kbo-nr.", + "kg.", + "kh.", + "kiesw.", + "kind.bes.v.", + "kkr.", + "kon.", + "koopv.", + "kr.", + "krankz.w.", + "ksbel.", + "kt.", + "ktg.", + "ktr.", + "kvdm.", + "kw.r.", + "kymr.", + "kzr.", + "kzw.", + "l.", + "l.b.", + "l.b.o.", + "l.bas.", + "l.c.", + "l.gew.", + "l.j.", + "l.k.", + "l.l.", + "l.o.", + "l.p.", + "l.r.b.", + "l.u.v.i.", + "l.v.r.", + "l.v.w.", + "l.w.", + "l'exp.-compt.b..", + "l’exp.-compt.b.", + "landinr.w.", + "landscrt.", + "lat.", + "law.ed.", + "lett.", + "levensverz.", + "lgrs.", + "lidw.", + "limb.rechtsl.", + "lit.", + "litt.", + "liw.", + "liwet.", + "lk.", + "ll.", + "ll.(l.)l.r.", + "loonw.", + 
"losbl.", + "ltd.", + "luchtv.", + "luchtv.w.", + "m.", + "m.", + "not.", + "m.a.v.o.", + "m.a.w.", + "m.b.", + "m.b.o.", + "m.b.r.", + "m.b.t.", + "m.d.g.o.", + "m.e.a.o.", + "m.e.r.", + "m.h.", + "m.h.d.", + "m.i.v.", + "m.j.t.", + "m.k.", + "m.m.", + "m.m.a.", + "m.m.h.h.", + "m.m.v.", + "m.n.", + "m.not.fisc.", + "m.nt.", + "m.o.", + "m.r.", + "m.s.a.", + "m.u.p.", + "m.v.a.", + "m.v.h.n.", + "m.v.t.", + "m.z.", + "maatr.teboekgest.luchtv.", + "maced.", + "mand.", + "max.", + "mbl.not.", + "me.", + "med.", + "med.", + "v.b.o.", + "med.b.u.f.r.", + "med.bufr.", + "med.vbo.", + "meerv.", + "meetbr.w.", + "mej.", + "mevr.", + "mém.adm.", + "mgr.", + "mgrs.", + "mhd.", + "mi.verantw.", + "mil.", + "mil.bed.", + "mil.ger.", + "min.", + "min.", + "aanbev.", + "min.", + "circ.", + "min.", + "fin.", + "min.j.omz.", + "min.just.circ.", + "mitt.", + "mln.", + "mnd.", + "mod.", + "mon.", + "mouv.comm.", + "mr.", + "ms.", + "muz.", + "mv.", + "n.", + "chr.", + "n.a.", + "n.a.g.", + "n.a.v.", + "n.b.", + "n.c.", + "n.chr.", + "n.d.", + "n.d.r.", + "n.e.a.", + "n.g.", + "n.h.b.c.", + "n.j.", + "n.j.b.", + "n.j.w.", + "n.l.", + "n.m.", + "n.m.m.", + "n.n.", + "n.n.b.", + "n.n.g.", + "n.n.k.", + "n.o.m.", + "n.o.t.k.", + "n.rapp.", + "n.tijd.pol.", + "n.v.", + "n.v.d.r.", + "n.v.d.v.", + "n.v.o.b.", + "n.v.t.", + "nat.besch.w.", + "nat.omb.", + "nat.pers.", + "ned.", + "ned.cult.r.", + "neg.verkl.", + "nhd.", + "wisk.", + "njcm-bull.", + "nl.", + "nnd.", + "no.", + "not.fisc.m.", + "not.w.", + "not.wet.", + "nr.", + "nrs.", + "nste.", + "nt.", + "numism.", + "o.", + "o.a.", + "o.b.", + "o.c.", + "o.g.", + "o.g.v.", + "o.i.", + "o.i.d.", + "o.m.", + "o.o.", + "o.o.d.", + "o.o.v.", + "o.p.", + "o.r.", + "o.regl.", + "o.s.", + "o.t.s.", + "o.t.t.", + "o.t.t.t.", + "o.t.t.z.", + "o.tk.t.", + "o.v.t.", + "o.v.t.t.", + "o.v.tk.t.", + "o.v.v.", + "ob.", + "obsv.", + "octr.", + "octr.gem.regl.", + "octr.regl.", + "oe.", + "off.pol.", + "ofra.", + "ohd.", + "omb.", + "omnil.", + "omz.", + "on.ww.", + "onderr.", + "onfrank.", + "onteig.w.", + "ontw.", + "b.w.", + "onuitg.", + "onz.", + "oorl.w.", + "op.cit.", + "opin.pa.", + "opm.", + "or.", + "ord.br.", + "ord.gem.", + "ors.", + "orth.", + "os.", + "osm.", + "ov.", + "ov.w.i.", + "ov.w.ii.", + "ov.ww.", + "overg.w.", + "overw.", + "ovkst.", + "oz.", + "p.", + "p.a.", + "p.a.o.", + "p.b.o.", + "p.e.", + "p.g.", + "p.j.", + "p.m.", + "p.m.a.", + "p.o.", + "p.o.j.t.", + "p.p.", + "p.v.", + "p.v.s.", + "pachtw.", + "pag.", + "pan.", + "pand.b.", + "pand.pér.", + "parl.gesch.", + "parl.gesch.", + "inv.", + "parl.st.", + "part.arb.", + "pas.", + "pasin.", + "pat.", + "pb.c.", + "pb.l.", + "pct.", + "pens.", + "pensioenverz.", + "per.ber.i.b.r.", + "per.ber.ibr.", + "pers.", + "st.", + "pft.", + "pk.", + "pktg.", + "plv.", + "po.", + "pol.", + "pol.off.", + "pol.r.", + "pol.w.", + "postbankw.", + "postw.", + "pp.", + "pr.", + "preadv.", + "pres.", + "prf.", + "prft.", + "prg.", + "prijz.w.", + "proc.", + "procesregl.", + "prof.", + "prot.", + "prov.", + "prov.b.", + "prov.instr.h.m.g.", + "prov.regl.", + "prov.verord.", + "prov.w.", + "publ.", + "pun.", + "pw.", + "q.b.d.", + "q.e.d.", + "q.q.", + "q.r.", + "r.", + "r.a.b.g.", + "r.a.c.e.", + "r.a.j.b.", + "r.b.d.c.", + "r.b.d.i.", + "r.b.s.s.", + "r.c.", + "r.c.b.", + "r.c.d.c.", + "r.c.j.b.", + "r.c.s.j.", + "r.cass.", + "r.d.c.", + "r.d.i.", + "r.d.i.d.c.", + "r.d.j.b.", + "r.d.j.p.", + "r.d.p.c.", + "r.d.s.", + "r.d.t.i.", + "r.e.", + "r.f.s.v.p.", + "r.g.a.r.", + "r.g.c.f.", + "r.g.d.c.", + "r.g.f.", + "r.g.z.", 
+ "r.h.a.", + "r.i.c.", + "r.i.d.a.", + "r.i.e.j.", + "r.i.n.", + "r.i.s.a.", + "r.j.d.a.", + "r.j.i.", + "r.k.", + "r.l.", + "r.l.g.b.", + "r.med.", + "r.med.rechtspr.", + "r.n.b.", + "r.o.", + "r.ov.", + "r.p.", + "r.p.d.b.", + "r.p.o.t.", + "r.p.r.j.", + "r.p.s.", + "r.r.d.", + "r.r.s.", + "r.s.", + "r.s.v.p.", + "r.stvb.", + "r.t.d.f.", + "r.t.d.h.", + "r.t.l.", + "r.trim.dr.eur.", + "r.v.a.", + "r.verkb.", + "r.w.", + "r.w.d.", + "rap.ann.c.a.", + "rap.ann.c.c.", + "rap.ann.c.e.", + "rap.ann.c.s.j.", + "rap.ann.ca.", + "rap.ann.cass.", + "rap.ann.cc.", + "rap.ann.ce.", + "rap.ann.csj.", + "rapp.", + "rb.", + "rb.kh.", + "rdn.", + "rdnr.", + "re.pers.", + "rec.", + "rec.c.i.j.", + "rec.c.j.c.e.", + "rec.cij.", + "rec.cjce.", + "rec.gén.enr.not.", + "rechtsk.t.", + "rechtspl.zeem.", + "rechtspr.arb.br.", + "rechtspr.b.f.e.", + "rechtspr.bfe.", + "rechtspr.soc.r.b.l.n.", + "recl.reg.", + "rect.", + "red.", + "reg.", + "reg.huiz.bew.", + "reg.w.", + "registr.w.", + "regl.", + "regl.", + "r.v.k.", + "regl.besl.", + "regl.onderr.", + "regl.r.t.", + "rep.", + "rép.fisc.", + "rép.not.", + "rep.r.j.", + "rep.rj.", + "req.", + "res.", + "resp.", + "rev.", + "rev.", + "comp.", + "rev.", + "trim.", + "civ.", + "rev.", + "trim.", + "comm.", + "rev.acc.trav.", + "rev.adm.", + "rev.b.compt.", + "rev.b.dr.const.", + "rev.b.dr.intern.", + "rev.b.séc.soc.", + "rev.banc.fin.", + "rev.comm.", + "rev.cons.prud.", + "rev.dr.b.", + "rev.dr.commun.", + "rev.dr.étr.", + "rev.dr.fam.", + "rev.dr.intern.comp.", + "rev.dr.mil.", + "rev.dr.min.", + "rev.dr.pén.", + "rev.dr.pén.mil.", + "rev.dr.rur.", + "rev.dr.u.l.b.", + "rev.dr.ulb.", + "rev.exp.", + "rev.faill.", + "rev.fisc.", + "rev.gd.", + "rev.hist.dr.", + "rev.i.p.c.", + "rev.ipc.", + "rev.not.b.", + "rev.prat.dr.comm.", + "rev.prat.not.b.", + "rev.prat.soc.", + "rev.rec.", + "rev.rw.", + "rev.trav.", + "rev.trim.d.h.", + "rev.trim.dr.fam.", + "rev.urb.", + "richtl.", + "riv.dir.int.", + "riv.dir.int.priv.proc.", + "rk.", + "rln.", + "roln.", + "rom.", + "rondz.", + "rov.", + "rtl.", + "rubr.", + "ruilv.wet.", + "rv.verdr.", + "rvkb.", + "s.", + "s.", + "s.a.", + "s.b.n.", + "s.ct.", + "s.d.", + "s.e.c.", + "s.e.et.o.", + "s.e.w.", + "s.exec.rept.", + "s.hrg.", + "s.j.b.", + "s.l.", + "s.l.e.a.", + "s.l.n.d.", + "s.p.a.", + "s.s.", + "s.t.", + "s.t.b.", + "s.v.", + "s.v.p.", + "samenw.", + "sc.", + "sch.", + "scheidsr.uitspr.", + "schepel.besl.", + "sec.", + "secr.comm.", + "secr.gen.", + "sect.soc.", + "sess.", + "cas.", + "sir.", + "soc.", + "best.", + "soc.", + "handv.", + "soc.", + "verz.", + "soc.act.", + "soc.best.", + "soc.kron.", + "soc.r.", + "soc.sw.", + "soc.weg.", + "sofi-nr.", + "somm.", + "somm.ann.", + "sp.c.c.", + "sr.", + "ss.", + "st.doc.b.c.n.a.r.", + "st.doc.bcnar.", + "st.vw.", + "stagever.", + "stas.", + "stat.", + "stb.", + "stbl.", + "stcrt.", + "stud.dipl.", + "su.", + "subs.", + "subst.", + "succ.w.", + "suppl.", + "sv.", + "sw.", + "t.", + "t.a.", + "t.a.a.", + "t.a.n.", + "t.a.p.", + "t.a.s.n.", + "t.a.v.", + "t.a.v.w.", + "t.aann.", + "t.acc.", + "t.agr.r.", + "t.app.", + "t.b.b.r.", + "t.b.h.", + "t.b.m.", + "t.b.o.", + "t.b.p.", + "t.b.r.", + "t.b.s.", + "t.b.v.", + "t.bankw.", + "t.belg.not.", + "t.desk.", + "t.e.m.", + "t.e.p.", + "t.f.r.", + "t.fam.", + "t.fin.r.", + "t.g.r.", + "t.g.t.", + "t.g.v.", + "t.gem.", + "t.gez.", + "t.huur.", + "t.i.n.", + "t.j.k.", + "t.l.l.", + "t.l.v.", + "t.m.", + "t.m.r.", + "t.m.w.", + "t.mil.r.", + "t.mil.strafr.", + "t.not.", + "t.o.", + "t.o.r.b.", + "t.o.v.", + "t.ontv.", + "t.p.r.", + 
"t.pol.", + "t.r.", + "t.r.g.", + "t.r.o.s.", + "t.r.v.", + "t.s.r.", + "t.strafr.", + "t.t.", + "t.u.", + "t.v.c.", + "t.v.g.", + "t.v.m.r.", + "t.v.o.", + "t.v.v.", + "t.v.v.d.b.", + "t.v.w.", + "t.verz.", + "t.vred.", + "t.vreemd.", + "t.w.", + "t.w.k.", + "t.w.v.", + "t.w.v.r.", + "t.wrr.", + "t.z.", + "t.z.t.", + "t.z.v.", + "taalk.", + "tar.burg.z.", + "td.", + "techn.", + "telecomm.", + "th.", + "toel.", + "toel.st.v.w.", + "toep.", + "toep.regl.", + "tom.", + "top.", + "trans.b.", + "transp.r.", + "trb.", + "trib.", + "trib.civ.", + "trib.gr.inst.", + "ts.", + "ts.", + "best.", + "ts.", + "verv.", + "turnh.rechtsl.", + "tvpol.", + "tvpr.", + "tvrechtsgesch.", + "tw.", + "u.", + "u.a.", + "u.a.r.", + "u.a.v.", + "u.c.", + "u.c.c.", + "u.g.", + "u.p.", + "u.s.", + "u.s.d.c.", + "uitdr.", + "uitl.w.", + "uitv.besch.div.b.", + "uitv.besl.", + "uitv.besl.", + "succ.w.", + "uitv.besl.bel.rv.", + "uitv.besl.l.b.", + "uitv.reg.", + "inv.w.", + "uitv.reg.bel.d.", + "uitv.reg.afd.verm.", + "uitv.reg.lb.", + "uitv.reg.succ.w.", + "univ.", + "univ.verkl.", + "v.", + "v.", + "chr.", + "v.a.", + "v.a.v.", + "v.c.", + "v.C.", + "v.Chr.", + "v.chr.", + "v.d.", + "v.h.", + "v.huw.verm.", + "v.i.", + "v.i.o.", + "v.j.", + "v.k.a.", + "v.m.", + "v.o.f.", + "v.o.n.", + "v.onderh.verpl.", + "v.p.", + "v.r.", + "v.s.o.", + "v.t.t.", + "v.t.t.t.", + "v.tk.t.", + "v.toep.r.vert.", + "v.v.b.", + "v.v.g.", + "v.v.t.", + "v.v.t.t.", + "v.v.tk.t.", + "v.w.b.", + "v.z.m.", + "vb.", + "vb.bo.", + "vbb.", + "vc.", + "vd.", + "veldw.", + "ver.k.", + "ver.verg.gem.", + "gem.comm.", + "verbr.", + "verd.", + "verdr.", + "verdr.v.", + "tek.mod.", + "verenw.", + "verg.", + "verg.fr.gem.", + "comm.", + "verkl.", + "verkl.herz.gw.", + "verl.", + "deelw.", + "vern.", + "verord.", + "vers.r.", + "versch.", + "versl.c.s.w.", + "versl.csw.", + "vert.", + "verw.", + "verz.", + "verz.w.", + "verz.wett.besl.", + "verz.wett.decr.besl.", + "vgl.", + "vid.", + "viss.w.", + "vl.parl.", + "vl.r.", + "vl.t.gez.", + "vl.w.reg.", + "vl.w.succ.", + "vlg.", + "vn.", + "vnl.", + "vnw.", + "vo.", + "vo.bl.", + "voegw.", + "vol.", + "volg.", + "volt.", + "deelw.", + "voorl.", + "voorz.", + "vord.w.", + "vorst.d.", + "vr.", + "vred.", + "vrg.", + "vnw.", + "vrijgrs.", + "vs.", + "vt.", + "vw.", + "vz.", + "vzngr.", + "vzr.", + "w.", + "w.a.", + "w.b.r.", + "w.c.h.", + "w.conf.huw.", + "w.conf.huwelijksb.", + "w.consum.kr.", + "w.f.r.", + "w.g.", + "w.gew.r.", + "w.ident.pl.", + "w.just.doc.", + "w.kh.", + "w.l.r.", + "w.l.v.", + "w.mil.straf.spr.", + "w.n.", + "w.not.ambt.", + "w.o.", + "w.o.d.huurcomm.", + "w.o.d.k.", + "w.openb.manif.", + "w.parl.", + "w.r.", + "w.reg.", + "w.succ.", + "w.u.b.", + "w.uitv.pl.verord.", + "w.v.", + "w.v.k.", + "w.v.m.s.", + "w.v.r.", + "w.v.w.", + "w.venn.", + "wac.", + "wd.", + "wetb.", + "n.v.h.", + "wgb.", + "winkelt.w.", + "wisk.", + "wka-verkl.", + "wnd.", + "won.w.", + "woningw.", + "woonr.w.", + "wrr.", + "wrr.ber.", + "wrsch.", + "ws.", + "wsch.", + "wsr.", + "wtvb.", + "ww.", + "x.d.", + "z.a.", + "z.g.", + "z.i.", + "z.j.", + "z.o.z.", + "z.p.", + "z.s.m.", + "zg.", + "zgn.", + "zn.", + "znw.", + "zr.", + "zr.", + "ms.", + "zr.ms.", + "'m", + "'n", + "'ns", + "'s", + "'t", +] _exc = {} for orth in abbrevs: From e8be15e9b79ba66497b59947c31604b48793bde0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 6 Apr 2020 13:18:23 +0200 Subject: [PATCH 033/131] Improve tokenization for UD Spanish AnCora (#5253) --- spacy/lang/es/__init__.py | 3 ++ spacy/lang/es/punctuation.py | 48 
+++++++++++++++++++++++++++ spacy/lang/es/tokenizer_exceptions.py | 4 ++- 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 spacy/lang/es/punctuation.py diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 80cc1727c..249748a17 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -6,6 +6,7 @@ from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -23,6 +24,8 @@ class SpanishDefaults(Language.Defaults): ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS syntax_iterators = SYNTAX_ITERATORS diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py new file mode 100644 index 000000000..42335237c --- /dev/null +++ b/spacy/lang/es/punctuation.py @@ -0,0 +1,48 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES +from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA +from ..char_classes import merge_chars +from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES + + +_list_units = [u for u in LIST_UNITS if u != "%"] +_units = merge_chars(" ".join(_list_units)) +_concat_quotes = CONCAT_QUOTES + "—–" + + +_suffixes = ( + ["—", "–"] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=_concat_quotes, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=_concat_quotes + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + +TOKENIZER_SUFFIXES = _suffixes +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 9109d658b..2c2631086 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -43,14 +43,16 @@ for orth in [ "Av.", "Avda.", "Cía.", + "EE.UU.", "etc.", + "fig.", "Gob.", "Gral.", "Ing.", "J.C.", + "km/h", "Lic.", "m.n.", - "no.", "núm.", "P.D.", "Prof.", From c981aa66849f0e19688be746f8ecbe344e7578b7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 6 Apr 2020 13:19:04 +0200 Subject: [PATCH 034/131] Use inline flags in token_match patterns (#5257) * Use inline flags in token_match patterns Use inline flags in `token_match` patterns so that serializing does not lose the flag information. 
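For context, the snippet below is a minimal standalone sketch (not part of the patch) of the behavior this change addresses: only the pattern string survives a serialization round trip, so flags written inline are preserved while flags passed to `re.compile()` are dropped.

```python
import re

# Compile-time flags live on the compiled object, not in the pattern string.
flagged = re.compile(r"foo", re.IGNORECASE | re.UNICODE)
inline = re.compile(r"(?iu)foo")
assert flagged.match("FOO") and inline.match("FOO")

# Recompiling from the pattern string alone, which is roughly what a
# serialize/deserialize round trip of token_match does:
assert re.compile(flagged.pattern).match("FOO") is None  # flags lost
assert re.compile(inline.pattern).match("FOO")           # flags kept
```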
* Modify inline flag * Modify inline flag --- spacy/lang/fr/tokenizer_exceptions.py | 2 +- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/tokenizer.pyx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index dfcb2756e..cb1702300 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN) TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( - "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE + "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp) ).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 385afb8bd..29ce75442 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -58,7 +58,7 @@ URL_PATTERN = ( # fmt: on ).strip() -TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match +TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4da081259..62b8bbf4a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -567,7 +567,7 @@ cdef class Tokenizer: )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): From fa760010a556bb15c76bb6a9bf77b6439de3adf0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 7 Apr 2020 12:04:51 +0200 Subject: [PATCH 035/131] Set rank for new vector in Vocab.set_vector (#5266) Set `Lexeme.rank` for vectors added with `Vocab.set_vector` so that the lexeme `ID` accessed by a model points the right row for the new vector. --- spacy/vocab.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3cf0095ee..8f95c567c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -406,9 +406,9 @@ cdef class Vocab: else: width = self.vectors.shape[1] self.vectors.resize((new_rows, width)) - lex = self[orth] # Adds words to vocab - self.vectors.add(orth, vector=vector) - self.vectors.add(orth, vector=vector) + lex = self[orth] # Add word to vocab if necessary + row = self.vectors.add(orth, vector=vector) + lex.rank = row def has_vector(self, orth): """Check whether a word has a vector. 
Returns False if no vectors have From 7ad0fcf01dd2ebc9fbf9ce5963d16f1f71d2b572 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 8 Apr 2020 12:58:09 +0200 Subject: [PATCH 036/131] fix json (#5267) --- website/meta/universe.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index bbd67e8a6..b5e1dbde0 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -14,8 +14,7 @@ "from whatlies.language import SpacyLanguage", "", "lang = SpacyLanguage('en_core_web_md')", - "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', ', - 'king', 'queen', 'doctor', 'nurse']", + "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', 'king', 'queen', 'doctor', 'nurse']", "", "emb = lang[words]", "emb.plot_interactive(x_axis='man', y_axis='woman')" From ae4af52ce7dd9dda0eb0f1b8eeb0cba7d20facdf Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 8 Apr 2020 12:58:39 +0200 Subject: [PATCH 037/131] Add ideographic stops to sentencizer (#5263) Add ideographic half- and fullwidth full stops to default sentencizer punctuation. --- spacy/pipeline/pipes.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a20c9b6df..f2a86d56e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1444,7 +1444,8 @@ class Sentencizer(object): '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', - '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈'] + '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', + '。', '。'] def __init__(self, punct_chars=None, **kwargs): """Initialize the sentencizer. From cf579a398d121617c0ab684d414af5f067677078 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 8 Apr 2020 20:03:06 +0200 Subject: [PATCH 038/131] Add __init__.py to eu and hy tests (#5278) --- spacy/tests/lang/eu/__init__.py | 0 spacy/tests/lang/hy/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 spacy/tests/lang/eu/__init__.py create mode 100644 spacy/tests/lang/hy/__init__.py diff --git a/spacy/tests/lang/eu/__init__.py b/spacy/tests/lang/eu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/hy/__init__.py b/spacy/tests/lang/hy/__init__.py new file mode 100644 index 000000000..e69de29bb From 8952effcc43b1694a4c0377904667a45b4ed1318 Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Thu, 9 Apr 2020 23:46:15 +1000 Subject: [PATCH 039/131] Fixed Typo in Warning (#5284) * Fixed typo in cli warning Fixed a typo in the warning for the provision of exactly two labels, which have not been designated as binary, to textcat. * Create and signed contributor form --- .github/contributors/umarbutler.md | 106 +++++++++++++++++++++++++++++ spacy/cli/train.py | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/umarbutler.md diff --git a/.github/contributors/umarbutler.md b/.github/contributors/umarbutler.md new file mode 100644 index 000000000..8df825152 --- /dev/null +++ b/.github/contributors/umarbutler.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Umar Butler | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-04-09 | +| GitHub username | umarbutler | +| Website (optional) | https://umarbutler.com | diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c94c26b62..8fc475d24 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -363,7 +363,7 @@ def train( if len(textcat_labels) == 2: msg.warn( "If the textcat component is a binary classifier with " - "exclusive classes, provide '--textcat_positive_label' for " + "exclusive classes, provide '--textcat-positive-label' for " "an evaluation on the positive class." ) msg.text( From 6a8a52650fcb3108f534480651c56e83b0f608fd Mon Sep 17 00:00:00 2001 From: Marek Grzenkowicz Date: Sat, 11 Apr 2020 23:35:01 +0200 Subject: [PATCH 040/131] [Closes #5292] Fix typo in option name "--n-save_every" (#5293) * Sign contributor agreement for chopeen * Fix typo in option name and close #5292 --- .github/contributors/chopeen.md | 106 ++++++++++++++++++++++++++++++++ website/docs/api/cli.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/chopeen.md diff --git a/.github/contributors/chopeen.md b/.github/contributors/chopeen.md new file mode 100644 index 000000000..d293c9845 --- /dev/null +++ b/.github/contributors/chopeen.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Marek Grzenkowicz | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020.04.10 | +| GitHub username | chopeen | +| Website (optional) | | diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index f067ba5a7..7101e3ddc 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -457,7 +457,7 @@ improvement. $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] -[--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save_every] +[--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save-every] [--init-tok2vec] [--epoch-start] ``` From a3965ec13da0b470cc45dfa12708a9eb327d6a94 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 14 Apr 2020 14:53:47 +0200 Subject: [PATCH 041/131] tag-map-path since 2.2.4 instead of 2.2.3 (#5289) --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7101e3ddc..15691c4f8 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -189,7 +189,7 @@ $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pi | `lang` | positional | Model language. | | `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | | `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--tag-map-path`, `-tm` 2.2.3 | option | Location of JSON-formatted tag map. | +| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | | `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | | `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | | `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. 
| From 8ce408d2e1b48ae6bea96934496bcd7507f4d75e Mon Sep 17 00:00:00 2001 From: Paolo Arduin Date: Tue, 14 Apr 2020 19:14:15 +0200 Subject: [PATCH 042/131] Comparison predicate handling for `!=` (#5282) * Fix #5281 * Optim test --- .github/contributors/paoloq.md | 106 ++++++++++++++++++++++++ spacy/matcher/matcher.pyx | 1 + spacy/tests/matcher/test_matcher_api.py | 19 ++++- 3 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 .github/contributors/paoloq.md diff --git a/.github/contributors/paoloq.md b/.github/contributors/paoloq.md new file mode 100644 index 000000000..84b28c8ef --- /dev/null +++ b/.github/contributors/paoloq.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Paolo Arduin | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 9 April 2020 | +| GitHub username | paoloq | +| Website (optional) | | diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 11461afb8..43480b46e 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -782,6 +782,7 @@ def _get_extra_predicates(spec, extra_predicates): "IN": _SetMemberPredicate, "NOT_IN": _SetMemberPredicate, "==": _ComparisonPredicate, + "!=": _ComparisonPredicate, ">=": _ComparisonPredicate, "<=": _ComparisonPredicate, ">": _ComparisonPredicate, diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c0314f3c3..2e5e64aac 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -265,14 +265,25 @@ def test_matcher_regex_shape(en_vocab): assert len(matches) == 0 -def test_matcher_compare_length(en_vocab): +@pytest.mark.parametrize( + "cmp, bad", + [ + ("==", ["a", "aaa"]), + ("!=", ["aa"]), + (">=", ["a"]), + ("<=", ["aaa"]), + (">", ["a", "aa"]), + ("<", ["aa", "aaa"]) + ] +) +def test_matcher_compare_length(en_vocab, cmp, bad): matcher = Matcher(en_vocab) - pattern = [{"LENGTH": {">=": 2}}] + pattern = [{"LENGTH": {cmp: 2}}] matcher.add("LENGTH_COMPARE", [pattern]) doc = Doc(en_vocab, words=["a", "aa", "aaa"]) matches = matcher(doc) - assert len(matches) == 2 - doc = Doc(en_vocab, words=["a"]) + assert len(matches) == len(doc) - len(bad) + doc = Doc(en_vocab, words=bad) matches = matcher(doc) assert len(matches) == 0 From 3d2c308906e2bde7ca57d2e8213252530b944502 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 14 Apr 2020 19:15:52 +0200 Subject: [PATCH 043/131] Add Doc init from list of words and text (#5251) * Add Doc init from list of words and text Add an option to initialize a `Doc` from a text and list of words where the words may or may not include all whitespace tokens. If the text and words are mismatched, raise an error. 
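The snippet below is an illustrative usage sketch (not part of the patch), assuming the `get_words_and_spaces` helper added in this commit is available as `spacy.util.get_words_and_spaces`:

```python
import spacy
from spacy import util
from spacy.tokens import Doc

vocab = spacy.blank("en").vocab
text = " 'dogs'\n\nrun "
words = ["'", "dogs", "'", "run"]  # word list without whitespace tokens

# Infer the whitespace tokens and trailing spaces from the original text.
norm_words, spaces = util.get_words_and_spaces(words, text)
doc = Doc(vocab, words=norm_words, spaces=spaces)
assert doc.text == text

# Words that cannot be aligned with the text raise a ValueError (E194),
# e.g. util.get_words_and_spaces(words + ["away"], text)
```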
* Fix error code * Remove all whitespace before aligning words/text * Move words/text init to util function * Update error message * Rename to get_words_and_spaces * Fix formatting --- spacy/errors.py | 1 + spacy/tests/doc/test_creation.py | 39 ++++++++++++++++++++++++++++++++ spacy/util.py | 30 ++++++++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index e0ddc86c5..ce26e63a4 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -555,6 +555,7 @@ class Errors(object): E193 = ("Unable to resize vectors in place if the resized vector dimension " "({new_dim}) is not the same as the current vector dimension " "({curr_dim}).") + E194 = ("Unable to align mismatched text '{text}' and words '{words}'.") @add_codes diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 120fb6e28..8f543e86a 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -6,6 +6,7 @@ from spacy.vocab import Vocab from spacy.tokens import Doc from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups +from spacy import util @pytest.fixture @@ -38,3 +39,41 @@ def test_lookup_lemmatization(vocab): assert doc[0].lemma_ == "dog" assert doc[1].text == "dogses" assert doc[1].lemma_ == "dogses" + + +def test_create_from_words_and_text(vocab): + # no whitespace in words + words = ["'", "dogs", "'", "run"] + text = " 'dogs'\n\nrun " + (words, spaces) = util.get_words_and_spaces(words, text) + doc = Doc(vocab, words=words, spaces=spaces) + assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "] + assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""] + assert doc.text == text + assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()] + + # partial whitespace in words + words = [" ", "'", "dogs", "'", "\n\n", "run", " "] + text = " 'dogs'\n\nrun " + (words, spaces) = util.get_words_and_spaces(words, text) + doc = Doc(vocab, words=words, spaces=spaces) + assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "] + assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""] + assert doc.text == text + assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()] + + # non-standard whitespace tokens + words = [" ", " ", "'", "dogs", "'", "\n\n", "run"] + text = " 'dogs'\n\nrun " + (words, spaces) = util.get_words_and_spaces(words, text) + doc = Doc(vocab, words=words, spaces=spaces) + assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "] + assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""] + assert doc.text == text + assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()] + + # mismatch between words and text + with pytest.raises(ValueError): + words = [" ", " ", "'", "dogs", "'", "\n\n", "run"] + text = " 'dogs'\n\nrun " + (words, spaces) = util.get_words_and_spaces(words + ["away"], text) diff --git a/spacy/util.py b/spacy/util.py index 9b96b2f5e..706fe303d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -755,6 +755,36 @@ def get_serialization_exclude(serializers, exclude, kwargs): return exclude + +def get_words_and_spaces(words, text): + if "".join("".join(words).split()) != "".join(text.split()): + raise ValueError(Errors.E194.format(text=text, words=words)) + text_words = [] + text_spaces = [] + text_pos = 0 + # normalize words to remove all whitespace tokens + norm_words = 
[word for word in words if not word.isspace()] + # align words with text + for word in norm_words: + try: + word_start = text[text_pos:].index(word) + except ValueError: + raise ValueError(Errors.E194.format(text=text, words=words)) + if word_start > 0: + text_words.append(text[text_pos:text_pos+word_start]) + text_spaces.append(False) + text_pos += word_start + text_words.append(word) + text_spaces.append(False) + text_pos += len(word) + if text_pos < len(text) and text[text_pos] == " ": + text_spaces[-1] = True + text_pos += 1 + if text_pos < len(text): + text_words.append(text[text_pos:]) + text_spaces.append(False) + return (text_words, text_spaces) + + class SimpleFrozenDict(dict): """Simplified implementation of a frozen dict, mainly used as default function or method argument (for arguments that should default to empty From 98c59027ed12131272b0aa46cdd89e378a13944b Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 15 Apr 2020 13:49:47 +0200 Subject: [PATCH 044/131] Use max(uint64) for OOV lexeme rank (#5303) * Use max(uint64) for OOV lexeme rank * Add test for default OOV rank * Revert back to thinc==7.4.0 Requiring the updated version of thinc was unnecessary. * Define OOV_RANK in one place Define OOV_RANK in one place in `util`. * Fix formatting [ci skip] * Switch to external definitions of max(uint64) Switch to external defintions of max(uint64) and confirm that they are equal. --- spacy/_ml.py | 2 +- spacy/cli/init_model.py | 4 ++-- spacy/lexeme.pxd | 1 + spacy/lexeme.pyx | 3 +++ spacy/tests/vocab_vectors/test_lexeme.py | 9 +++++++++ spacy/util.py | 2 ++ spacy/vocab.pyx | 6 +++--- 7 files changed, 21 insertions(+), 6 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index ee7e59218..2a758accc 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -289,7 +289,7 @@ def link_vectors_to_models(vocab): if word.orth in vectors.key2row: word.rank = vectors.key2row[word.orth] else: - word.rank = 0 + word.rank = util.OOV_RANK data = ops.asarray(vectors.data) # Set an entry here, so that vectors are accessed by StaticVectors # (unideal, I know) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 3fa0cc890..0bdd4000e 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -16,7 +16,7 @@ from wasabi import msg from ..vectors import Vectors from ..errors import Errors, Warnings, user_warning -from ..util import ensure_path, get_lang_class +from ..util import ensure_path, get_lang_class, OOV_RANK try: import ftfy @@ -148,7 +148,7 @@ def create_model(lang, lex_attrs, name=None): lang_class = get_lang_class(lang) nlp = lang_class() for lexeme in nlp.vocab: - lexeme.rank = 0 + lexeme.rank = OOV_RANK lex_added = 0 for attrs in lex_attrs: if "settings" in attrs: diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 048f8016e..f31733374 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -10,6 +10,7 @@ from numpy cimport ndarray cdef LexemeC EMPTY_LEXEME +cdef attr_t OOV_RANK cdef class Lexeme: cdef LexemeC* c diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 5c981bc25..21644e37b 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -11,6 +11,7 @@ np.import_array() import numpy from thinc.neural.util import get_array_module +from libc.stdint cimport UINT64_MAX from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP @@ -21,7 +22,9 @@ from .attrs import intify_attrs from .errors import Errors, Warnings, user_warning 
+OOV_RANK = UINT64_MAX memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) +EMPTY_LEXEME.id = OOV_RANK cdef class Lexeme: diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index d84a56981..b57c6705a 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals import pytest +import numpy from spacy.attrs import IS_ALPHA, IS_DIGIT +from spacy.util import OOV_RANK @pytest.mark.parametrize("text1,prob1,text2,prob2", [("NOUN", -1, "opera", -2)]) @@ -69,3 +71,10 @@ def test_lexeme_bytes_roundtrip(en_vocab): assert one.orth == alpha.orth assert one.lower == alpha.lower assert one.lower_ == alpha.lower_ + + +def test_vocab_lexeme_oov_rank(en_vocab): + """Test that default rank is OOV_RANK.""" + lex = en_vocab["word"] + assert OOV_RANK == numpy.iinfo(numpy.uint64).max + assert lex.rank == OOV_RANK diff --git a/spacy/util.py b/spacy/util.py index 706fe303d..1c627af46 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -12,6 +12,7 @@ from thinc.neural.ops import NumpyOps import functools import itertools import numpy.random +import numpy import srsly import catalogue import sys @@ -34,6 +35,7 @@ from .errors import Errors, Warnings, deprecation_warning _data_path = Path(__file__).parent / "data" _PRINT_ENV = False +OOV_RANK = numpy.iinfo(numpy.uint64).max class registry(object): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 8f95c567c..0f3223025 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -7,7 +7,7 @@ import srsly from collections import OrderedDict from thinc.neural.util import get_array_module -from .lexeme cimport EMPTY_LEXEME +from .lexeme cimport EMPTY_LEXEME, OOV_RANK from .lexeme cimport Lexeme from .typedefs cimport attr_t from .tokens.token cimport Token @@ -165,9 +165,9 @@ cdef class Vocab: lex.orth = self.strings.add(string) lex.length = len(string) if self.vectors is not None: - lex.id = self.vectors.key2row.get(lex.orth, 0) + lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK) else: - lex.id = 0 + lex.id = OOV_RANK if self.lex_attr_getters is not None: for attr, func in self.lex_attr_getters.items(): value = func(string) From 1eef60c658e4aaa7b3ddb4dab2dac170ceea2c2c Mon Sep 17 00:00:00 2001 From: Thomas Thiebaud Date: Wed, 15 Apr 2020 13:50:46 +0200 Subject: [PATCH 045/131] Add spacy_fastlang to universe (#5271) * Add spacy_fastlang to universe * Sign SCA --- .github/contributors/thomasthiebaud.md | 106 +++++++++++++++++++++++++ website/meta/universe.json | 24 ++++++ 2 files changed, 130 insertions(+) create mode 100644 .github/contributors/thomasthiebaud.md diff --git a/.github/contributors/thomasthiebaud.md b/.github/contributors/thomasthiebaud.md new file mode 100644 index 000000000..bdbf0ec50 --- /dev/null +++ b/.github/contributors/thomasthiebaud.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, + object code, patch, tool, sample, graphic, specification, manual, + documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and + registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment + to any third party, you hereby grant to us a perpetual, irrevocable, + non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your + contribution. The rights that you grant to us under these terms are effective + on the date you first submitted a contribution to us, even if your submission + took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + - Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + - to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + - each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you
+  become aware of any circumstance which would make any of the foregoing
+  representations inaccurate in any respect. We may publicly disclose your
+  participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+   mark both statements:
+
+   * [x] I am signing on behalf of myself as an individual and no other person
+   or entity, including my employer, has or will have rights with respect to my
+   contributions.
+
+   * [ ] I am signing on behalf of my employer or a legal entity and I have the
+   actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                         | Entry           |
+| ----------------------------- | --------------- |
+| Name                          | Thomas Thiebaud |
+| Company name (if applicable)  |                 |
+| Title or role (if applicable) |                 |
+| Date                          | 2020-04-07      |
+| GitHub username               | thomasthiebaud  |
+| Website (optional)            |                 |

diff --git a/website/meta/universe.json b/website/meta/universe.json
index b5e1dbde0..8da96a026 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2093,6 +2093,30 @@
             "predict_output = clf.predict(predict_input)"
         ],
         "category": ["standalone"]
+    },
+    {
+        "id": "spacy_fastlang",
+        "title": "Spacy FastLang",
+        "slogan": "Language detection done fast",
+        "description": "Fast language detection using FastText and Spacy.",
+        "github": "thomasthiebaud/spacy-fastlang",
+        "pip": "spacy_fastlang",
+        "code_example": [
+            "import spacy",
+            "from spacy_fastlang import LanguageDetector",
+            "",
+            "nlp = spacy.load('en_core_web_sm')",
+            "nlp.add_pipe(LanguageDetector())",
+            "doc = nlp(\"Life is like a box of chocolates. You never know what you're gonna get.\")",
+            "",
+            "assert doc._.language == 'en'",
+            "assert doc._.language_score >= 0.8"
+        ],
+        "author": "Thomas Thiebaud",
+        "author_links": {
+            "github": "thomasthiebaud"
+        },
+        "category": ["pipeline"]
     }
 ],

From 1ca32d8f9c800eb36e912dc1fa7b173edf7f2c3c Mon Sep 17 00:00:00 2001
From: Paolo Arduin
Date: Wed, 15 Apr 2020 13:51:33 +0200
Subject: [PATCH 046/131] Matcher support for Span as well as Doc (#5113)

* Matcher support for Span, as well as Doc #5056

* Remove an unused import

* Signed contributors agreement

* Code optimization and better test

* Add error message for bad Matcher call argument

* Fix merging
---
 .github/contributors/paoloq.md          |  2 +-
 spacy/errors.py                         |  1 +
 spacy/matcher/matcher.pyx               | 36 ++++++++++++++-----------
 spacy/tests/matcher/test_matcher_api.py | 11 +++++++-
 4 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/.github/contributors/paoloq.md b/.github/contributors/paoloq.md
index 84b28c8ef..0fac70c9a 100644
--- a/.github/contributors/paoloq.md
+++ b/.github/contributors/paoloq.md
@@ -5,7 +5,7 @@ This spaCy Contributor Agreement (**"SCA"**) is based on the
 The SCA applies to any contribution that you make to any product or project
 managed by us (the **"project"**), and sets out the intellectual property rights
 you grant to us in the contributed materials. The term **"us"** shall mean
-[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
 **"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested diff --git a/spacy/errors.py b/spacy/errors.py index ce26e63a4..b1cdb89ec 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -556,6 +556,7 @@ class Errors(object): "({new_dim}) is not the same as the current vector dimension " "({curr_dim}).") E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.") + E195 = ("Matcher can be called on {good} only, got {got}.") @add_codes diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 43480b46e..9e0fe2812 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -14,6 +14,7 @@ from ..typedefs cimport attr_t from ..structs cimport TokenC from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr +from ..tokens.span cimport Span from ..tokens.token cimport Token from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA @@ -211,22 +212,29 @@ cdef class Matcher: else: yield doc - def __call__(self, Doc doc): + def __call__(self, object doc_or_span): """Find all token sequences matching the supplied pattern. - doc (Doc): The document to match over. + doc_or_span (Doc or Span): The document to match over. RETURNS (list): A list of `(key, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `label_id` and `key` are both integers. """ + if isinstance(doc_or_span, Doc): + doc = doc_or_span + length = len(doc) + elif isinstance(doc_or_span, Span): + doc = doc_or_span.doc + length = doc_or_span.end - doc_or_span.start + else: + raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__)) if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ and not doc.is_tagged: raise ValueError(Errors.E155.format()) if DEP in self._seen_attrs and not doc.is_parsed: raise ValueError(Errors.E156.format()) - matches = find_matches(&self.patterns[0], self.patterns.size(), doc, - extensions=self._extensions, - predicates=self._extra_predicates) + matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length, + extensions=self._extensions, predicates=self._extra_predicates) for i, (key, start, end) in enumerate(matches): on_match = self._callbacks.get(key, None) if on_match is not None: @@ -248,9 +256,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher - -cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, - predicates=tuple()): +cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()): """Find matches in a doc, with a compiled array of patterns. Matches are returned as a list of (id, start, end) tuples. 
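In user code, the change means a `Matcher` can now be called on a `Span` exactly like on a `Doc`. A minimal sketch, assuming a blank English pipeline (the text and pattern mirror the regression test added further down):

```python
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
matcher.add("JAVA", None, [{"LOWER": "java"}])

doc = nlp("JavaScript is good but Java is better")
assert len(matcher(doc)) == 1      # match over the whole Doc
assert len(matcher(doc[4:])) == 1  # the Span "Java is better" also works
```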
@@ -268,18 +274,18 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, cdef int i, j, nr_extra_attr cdef Pool mem = Pool() output = [] - if doc.length == 0: + if length == 0: # avoid any processing or mem alloc if the document is empty return output if len(predicates) > 0: - predicate_cache = mem.alloc(doc.length * len(predicates), sizeof(char)) + predicate_cache = mem.alloc(length * len(predicates), sizeof(char)) if extensions is not None and len(extensions) >= 1: nr_extra_attr = max(extensions.values()) + 1 - extra_attr_values = mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t)) + extra_attr_values = mem.alloc(length * nr_extra_attr, sizeof(attr_t)) else: nr_extra_attr = 0 - extra_attr_values = mem.alloc(doc.length, sizeof(attr_t)) - for i, token in enumerate(doc): + extra_attr_values = mem.alloc(length, sizeof(attr_t)) + for i, token in enumerate(doc_or_span): for name, index in extensions.items(): value = token._.get(name) if isinstance(value, basestring): @@ -287,11 +293,11 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, extra_attr_values[i * nr_extra_attr + index] = value # Main loop cdef int nr_predicate = len(predicates) - for i in range(doc.length): + for i in range(length): for j in range(n): states.push_back(PatternStateC(patterns[j], i, 0)) transition_states(states, matches, predicate_cache, - doc[i], extra_attr_values, predicates) + doc_or_span[i], extra_attr_values, predicates) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 2e5e64aac..0295ada82 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,7 +6,6 @@ import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token - from ..doc.test_underscore import clean_underscore # noqa: F401 @@ -470,3 +469,13 @@ def test_matcher_callback(en_vocab): doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) matches = matcher(doc) mock.assert_called_once_with(matcher, doc, 0, matches) + + +def test_matcher_span(matcher): + text = "JavaScript is good but Java is better" + doc = Doc(matcher.vocab, words=text.split()) + span_js = doc[:3] + span_java = doc[4:] + assert len(matcher(doc)) == 2 + assert len(matcher(span_js)) == 1 + assert len(matcher(span_java)) == 1 From dac70f29eb3b1f21ae9e2c6346666bf6a46307b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Harinck?= Date: Thu, 16 Apr 2020 11:32:09 +0200 Subject: [PATCH 047/131] contrib: add contributor agreement for user sebastienharinck (#5316) --- .github/contributors/sebastienharinck.md | 106 +++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/sebastienharinck.md diff --git a/.github/contributors/sebastienharinck.md b/.github/contributors/sebastienharinck.md new file mode 100644 index 000000000..e0fddeba5 --- /dev/null +++ b/.github/contributors/sebastienharinck.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. 
The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of
+  authorship and you can legally grant the rights set out in this SCA;
+
+  * to the best of your knowledge, each contribution will not violate any
+  third party's copyrights, trademarks, patents, or other intellectual
+  property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and
+  other applicable export and import laws. You agree to notify us if you
+  become aware of any circumstance which would make any of the foregoing
+  representations inaccurate in any respect. We may publicly disclose your
+  participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+  * [ ] I am signing on behalf of myself as an individual and no other person
+  or entity, including my employer, has or will have rights with respect to my
+  contributions.
+
+  * [x] I am signing on behalf of my employer or a legal entity and I have the
+  actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                         | Entry                                      |
+|------------------------------ | ------------------------------------------ |
+| Name                          | Sébastien Harinck                          |
+| Company name (if applicable)  | Odaxiom                                    |
+| Title or role (if applicable) | ML Engineer                                |
+| Date                          | 2020-04-15                                 |
+| GitHub username               | sebastienharinck                           |
+| Website (optional)            | [https://odaxiom.com](https://odaxiom.com) |
\ No newline at end of file

From 663333c3b2bad90915d1a48a626ca1275b7ef077 Mon Sep 17 00:00:00 2001
From: Jakob Jul Elben
Date: Thu, 16 Apr 2020 13:29:02 +0200
Subject: [PATCH 048/131] Fixes #5314 (#5315)

* Fix 5314

* Add contributor

* Resolve requested changes

Co-authored-by: Jakob Jul Elben
---
 .github/contributors/elben10             | 106 ++++++++++++++++++
 .../wikipedia_processor.py               |   6 +-
 spacy/tests/regression/test_issue5314.py |  18 +++
 3 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 .github/contributors/elben10
 create mode 100644 spacy/tests/regression/test_issue5314.py

diff --git a/.github/contributors/elben10 b/.github/contributors/elben10
new file mode 100644
index 000000000..1eb4656dc
--- /dev/null
+++ b/.github/contributors/elben10
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1.
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+  or entity, including my employer, has or will have rights with respect to my
+  contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+  actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                         | Entry            |
+|------------------------------ | ---------------- |
+| Name                          | Jakob Jul Elben  |
+| Company name (if applicable)  | N/A              |
+| Title or role (if applicable) | N/A              |
+| Date                          | April 16th, 2020 |
+| GitHub username               | elben10          |
+| Website (optional)            | N/A              |

diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py
index ed3c35c43..649d48fe5 100644
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ b/bin/wiki_entity_linking/wikipedia_processor.py
@@ -30,7 +30,8 @@ logger = logging.getLogger(__name__)

 title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
 id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
-text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")

diff --git a/spacy/tests/regression/test_issue5314.py b/spacy/tests/regression/test_issue5314.py
new file mode 100644
--- /dev/null
+++ b/spacy/tests/regression/test_issue5314.py
@@ -0,0 +1,18 @@
+old_format_text = """[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn."""
+new_format_text = """[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn."""
+potential_future_format = """[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn."""
+
+
+@pytest.mark.parametrize(
+    "text", [old_format_text, new_format_text, potential_future_format]
+)
+def test_issue5314(text):
+    title = "Arkæologi"
+    clean_text, _ = _process_wp_text(title, text, {})
+
+    expected_text = "Arkæologi er studiet af tidligere tiders menneskelige aktivitet, primært gennem studiet af menneskets materielle levn."
+ assert clean_text.strip() == expected_text From 068146d4ca2506a5d9a9f60ec8ad7e983d554ff9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 16 Apr 2020 14:45:25 +0200 Subject: [PATCH 049/131] Update netlify.toml [ci skip] --- netlify.toml | 62 ++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/netlify.toml b/netlify.toml index 45bd2c3b6..be809f1d4 100644 --- a/netlify.toml +++ b/netlify.toml @@ -7,42 +7,42 @@ redirects = [ {from = "https://alpha.spacy.io/*", to = "https://spacy.io", force = true}, {from = "http://alpha.spacy.io/*", to = "https://spacy.io", force = true}, # Old demos - {from = "/demos/*", to = "https://explosion.ai/demos/:splat"}, + {from = "/demos/*", to = "https://explosion.ai/demos/:splat", force = true}, # Old blog - {from = "/blog/*", to = "https://explosion.ai/blog/:splat"}, - {from = "/feed", to = "https://explosion.ai/feed"}, - {from = "/feed.xml", to = "https://explosion.ai/feed"}, + {from = "/blog/*", to = "https://explosion.ai/blog/:splat", force = true}, + {from = "/feed", to = "https://explosion.ai/feed", force = true}, + {from = "/feed.xml", to = "https://explosion.ai/feed", force = true}, # Old documentation pages (1.x) - {from = "/docs/usage/processing-text", to = "/usage/linguistic-features"}, - {from = "/docs/usage/deep-learning", to = "/usage/training"}, - {from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging"}, - {from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse"}, - {from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities"}, - {from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity"}, - {from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization"}, - {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines"}, - {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines"}, - {from = "/docs/usage/training-ner", to = "/usage/training#ner"}, - {from = "/docs/usage/tutorials", to = "/usage/examples"}, - {from = "/docs/usage/data-model", to = "/api"}, - {from = "/docs/usage/cli", to = "/api/cli"}, - {from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"}, - {from = "/docs/api/language-models", to = "/usage/models#languages"}, - {from = "/docs/api/spacy", to = "/docs/api/top-level"}, - {from = "/docs/api/displacy", to = "/api/top-level#displacy"}, - {from = "/docs/api/util", to = "/api/top-level#util"}, - {from = "/docs/api/features", to = "/models/#architecture"}, - {from = "/docs/api/philosophy", to = "/usage/spacy-101"}, - {from = "/docs/usage/showcase", to = "/universe"}, - {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom"}, - {from = "/tutorials", to = "/usage/examples"}, + {from = "/docs/usage/processing-text", to = "/usage/linguistic-features", force = true}, + {from = "/docs/usage/deep-learning", to = "/usage/training", force = true}, + {from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging", force = true}, + {from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse", force = true}, + {from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities", force = true}, + {from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity", force = true}, + {from = "/docs/usage/customizing-tokenizer", to = 
"/usage/linguistic-features#tokenization", force = true}, + {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true}, + {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true}, + {from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true}, + {from = "/docs/usage/tutorials", to = "/usage/examples", force = true}, + {from = "/docs/usage/data-model", to = "/api", force = true}, + {from = "/docs/usage/cli", to = "/api/cli", force = true}, + {from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true}, + {from = "/docs/api/language-models", to = "/usage/models#languages", force = true}, + {from = "/docs/api/spacy", to = "/docs/api/top-level", force = true}, + {from = "/docs/api/displacy", to = "/api/top-level#displacy", force = true}, + {from = "/docs/api/util", to = "/api/top-level#util", force = true}, + {from = "/docs/api/features", to = "/models/#architecture", force = true}, + {from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true}, + {from = "/docs/usage/showcase", to = "/universe", force = true}, + {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true}, + {from = "/tutorials", to = "/usage/examples", force = true}, # Rewrite all other docs pages to / {from = "/docs/*", to = "/:splat"}, # Updated documentation pages - {from = "/usage/resources", to = "/universe"}, - {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"}, - {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"}, - {from = "/models/comparison", to = "/models"}, + {from = "/usage/resources", to = "/universe", force = true}, + {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true}, + {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching", force = true}, + {from = "/models/comparison", to = "/models", force = true}, {from = "/api/#section-cython", to = "/api/cython", force = true}, {from = "/api/#cython", to = "/api/cython", force = true}, {from = "/api/sentencesegmenter", to="/api/sentencizer"}, From fb73d4943a91d18cd36ded98994a932515f4bf05 Mon Sep 17 00:00:00 2001 From: laszabine Date: Thu, 16 Apr 2020 20:00:18 +0200 Subject: [PATCH 050/131] Amend documentation to Language.evaluate (#5319) * Specified usage of arguments to Language.evaluate * Created contributor agreement --- .github/contributors/laszabine.md | 106 ++++++++++++++++++++++++++++++ website/docs/api/language.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/laszabine.md diff --git a/.github/contributors/laszabine.md b/.github/contributors/laszabine.md new file mode 100644 index 000000000..c1a4a3a6b --- /dev/null +++ b/.github/contributors/laszabine.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Sabine Laszakovits | +| Company name (if applicable) | Austrian Academy of Sciences | +| Title or role (if applicable) | Data analyst | +| Date | 2020-04-16 | +| GitHub username | laszabine | +| Website (optional) | https://sabine.laszakovits.net | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index d548a1f64..97dfbf100 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -136,7 +136,7 @@ Evaluate a model's pipeline components. | Name | Type | Description | | -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs_golds` | iterable | Tuples of `Doc` and `GoldParse` objects or `(text, annotations)` of raw text and a dict (see [simple training style](/usage/training#training-simple-style)). | +| `docs_golds` | iterable | Tuples of `Doc` and `GoldParse` objects, such that the `Doc` objects contain the predictions and the `GoldParse` objects the correct annotations. Alternatively, `(text, annotations)` tuples of raw text and a dict (see [simple training style](/usage/training#training-simple-style)). | | `verbose` | bool | Print debugging information. | | `batch_size` | int | The batch size to use. | | `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. 
| From f7471abd82c1cbf6dc42f299eb0237a174f86da5 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sat, 18 Apr 2020 17:01:53 +0200 Subject: [PATCH 051/131] Add pkuseg and serialization support for Chinese (#5308) * Add pkuseg and serialization support for Chinese Add support for pkuseg alongside jieba * Specify model through `Language` meta: * split on characters (if no word segmentation packages are installed) ``` Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": False}}}) ``` * jieba (remains the default tokenizer if installed) ``` Chinese() Chinese(meta={"tokenizer": {"config": {"use_jieba": True}}}) # explicit ``` * pkuseg ``` Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}}}) ``` * The new tokenizer setting `require_pkuseg` is used to override `use_jieba` default, which is intended for models that provide a pkuseg model: ``` nlp_pkuseg = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "default", "require_pkuseg": True}}}) nlp = Chinese() # has `use_jieba` as `True` by default nlp.from_bytes(nlp_pkuseg.to_bytes()) # `require_pkuseg` overrides `use_jieba` when calling the tokenizer ``` Add support for serialization of tokenizer settings and pkuseg model, if loaded * Add sorting for `Language.to_bytes()` serialization of `Language.meta` so that the (emptied, but still present) tokenizer metadata is in a consistent position in the serialized data Extend tests to cover all three tokenizer configurations and serialization * Fix from_disk and tests without jieba or pkuseg * Load cfg first and only show error if `use_pkuseg` * Fix blank/default initialization in serialization tests * Explicitly initialize jieba's cache on init * Add serialization for pkuseg pre/postprocessors * Reformat pkuseg install message --- spacy/lang/zh/__init__.py | 297 ++++++++++++++++++++++---- spacy/language.py | 2 +- spacy/tests/conftest.py | 16 +- spacy/tests/lang/zh/test_serialize.py | 38 ++++ spacy/tests/lang/zh/test_text.py | 4 +- spacy/tests/lang/zh/test_tokenizer.py | 34 ++- 6 files changed, 329 insertions(+), 62 deletions(-) create mode 100644 spacy/tests/lang/zh/test_serialize.py diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 8179b4551..2cf00d389 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,6 +1,10 @@ # coding: utf8 from __future__ import unicode_literals +import tempfile +import srsly +from pathlib import Path +from collections import OrderedDict from ...attrs import LANG from ...language import Language from ...tokens import Doc @@ -9,12 +13,19 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from .tag_map import TAG_MAP +from ... import util + + +_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python" def try_jieba_import(use_jieba): try: import jieba + # segment a short text to have jieba initialize its cache in advance + list(jieba.cut("作为", cut_all=False)) + return jieba except ImportError: if use_jieba: @@ -25,59 +36,241 @@ def try_jieba_import(use_jieba): raise ImportError(msg) +def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict): + try: + import pkuseg + + if pkuseg_model: + return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) + elif use_pkuseg: + msg = ( + "Chinese.use_pkuseg is True but no pkuseg model was specified. 
" + "Please provide the name of a pretrained model " + "or the path to a model with " + '`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).' + ) + raise ValueError(msg) + except ImportError: + if use_pkuseg: + msg = ( + "pkuseg not installed. Either set Chinese.use_pkuseg = False, " + "or " + _PKUSEG_INSTALL_MSG + ) + raise ImportError(msg) + except FileNotFoundError: + if use_pkuseg: + msg = "Unable to load pkuseg model from: " + pkuseg_model + raise FileNotFoundError(msg) + + class ChineseTokenizer(DummyTokenizer): - def __init__(self, cls, nlp=None): + def __init__(self, cls, nlp=None, config={}): + self.use_jieba = config.get("use_jieba", cls.use_jieba) + self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg) + self.require_pkuseg = config.get("require_pkuseg", False) self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.use_jieba = cls.use_jieba self.jieba_seg = try_jieba_import(self.use_jieba) + self.pkuseg_seg = try_pkuseg_import( + self.use_pkuseg, + pkuseg_model=config.get("pkuseg_model", None), + pkuseg_user_dict=config.get("pkuseg_user_dict", "default"), + ) + # remove relevant settings from config so they're not also saved in + # Language.meta + for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]: + if key in config: + del config[key] self.tokenizer = Language.Defaults().create_tokenizer(nlp) def __call__(self, text): - # use jieba - if self.use_jieba: - jieba_words = list( - [x for x in self.jieba_seg.cut(text, cut_all=False) if x] - ) - words = [jieba_words[0]] - spaces = [False] - for i in range(1, len(jieba_words)): - word = jieba_words[i] - if word.isspace(): - # second token in adjacent whitespace following a - # non-space token - if spaces[-1]: - words.append(word) - spaces.append(False) - # first space token following non-space token - elif word == " " and not words[-1].isspace(): - spaces[-1] = True - # token is non-space whitespace or any whitespace following - # a whitespace token - else: - # extend previous whitespace token with more whitespace - if words[-1].isspace(): - words[-1] += word - # otherwise it's a new whitespace token - else: - words.append(word) - spaces.append(False) - else: - words.append(word) - spaces.append(False) + use_jieba = self.use_jieba + use_pkuseg = self.use_pkuseg + if self.require_pkuseg: + use_jieba = False + use_pkuseg = True + if use_jieba: + words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) + (words, spaces) = util.get_words_and_spaces(words, text) + return Doc(self.vocab, words=words, spaces=spaces) + elif use_pkuseg: + words = self.pkuseg_seg.cut(text) + (words, spaces) = util.get_words_and_spaces(words, text) + return Doc(self.vocab, words=words, spaces=spaces) + else: + # split into individual characters + words = list(text) + (words, spaces) = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) - # split into individual characters - words = [] - spaces = [] - for token in self.tokenizer(text): - if token.text.isspace(): - words.append(token.text) - spaces.append(False) - else: - words.extend(list(token.text)) - spaces.extend([False] * len(token.text)) - spaces[-1] = bool(token.whitespace_) - return Doc(self.vocab, words=words, spaces=spaces) + def _get_config(self): + config = OrderedDict( + ( + ("use_jieba", self.use_jieba), + ("use_pkuseg", self.use_pkuseg), + ("require_pkuseg", self.require_pkuseg), + ) + ) + return config + + def _set_config(self, config={}): + self.use_jieba = 
config.get("use_jieba", False) + self.use_pkuseg = config.get("use_pkuseg", False) + self.require_pkuseg = config.get("require_pkuseg", False) + + def to_bytes(self, **kwargs): + pkuseg_features_b = b"" + pkuseg_weights_b = b"" + pkuseg_processors_data = None + if self.pkuseg_seg: + with tempfile.TemporaryDirectory() as tempdir: + self.pkuseg_seg.feature_extractor.save(tempdir) + self.pkuseg_seg.model.save(tempdir) + tempdir = Path(tempdir) + with open(tempdir / "features.pkl", "rb") as fileh: + pkuseg_features_b = fileh.read() + with open(tempdir / "weights.npz", "rb") as fileh: + pkuseg_weights_b = fileh.read() + pkuseg_processors_data = ( + _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie), + self.pkuseg_seg.postprocesser.do_process, + sorted(list(self.pkuseg_seg.postprocesser.common_words)), + sorted(list(self.pkuseg_seg.postprocesser.other_words)), + ) + serializers = OrderedDict( + ( + ("cfg", lambda: srsly.json_dumps(self._get_config())), + ("pkuseg_features", lambda: pkuseg_features_b), + ("pkuseg_weights", lambda: pkuseg_weights_b), + ( + "pkuseg_processors", + lambda: srsly.msgpack_dumps(pkuseg_processors_data), + ), + ) + ) + return util.to_bytes(serializers, []) + + def from_bytes(self, data, **kwargs): + pkuseg_features_b = b"" + pkuseg_weights_b = b"" + pkuseg_processors_data = None + + def deserialize_pkuseg_features(b): + nonlocal pkuseg_features_b + pkuseg_features_b = b + + def deserialize_pkuseg_weights(b): + nonlocal pkuseg_weights_b + pkuseg_weights_b = b + + def deserialize_pkuseg_processors(b): + nonlocal pkuseg_processors_data + pkuseg_processors_data = srsly.msgpack_loads(b) + + deserializers = OrderedDict( + ( + ("cfg", lambda b: self._set_config(srsly.json_loads(b))), + ("pkuseg_features", deserialize_pkuseg_features), + ("pkuseg_weights", deserialize_pkuseg_weights), + ("pkuseg_processors", deserialize_pkuseg_processors), + ) + ) + util.from_bytes(data, deserializers, []) + + if pkuseg_features_b and pkuseg_weights_b: + with tempfile.TemporaryDirectory() as tempdir: + tempdir = Path(tempdir) + with open(tempdir / "features.pkl", "wb") as fileh: + fileh.write(pkuseg_features_b) + with open(tempdir / "weights.npz", "wb") as fileh: + fileh.write(pkuseg_weights_b) + try: + import pkuseg + except ImportError: + raise ImportError( + "pkuseg not installed. 
To use this model, " + + _PKUSEG_INSTALL_MSG + ) + self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) + if pkuseg_processors_data: + ( + user_dict, + do_process, + common_words, + other_words, + ) = pkuseg_processors_data + self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) + self.pkuseg_seg.postprocesser.do_process = do_process + self.pkuseg_seg.postprocesser.common_words = set(common_words) + self.pkuseg_seg.postprocesser.other_words = set(other_words) + + return self + + def to_disk(self, path, **kwargs): + path = util.ensure_path(path) + + def save_pkuseg_model(path): + if self.pkuseg_seg: + if not path.exists(): + path.mkdir(parents=True) + self.pkuseg_seg.model.save(path) + self.pkuseg_seg.feature_extractor.save(path) + + def save_pkuseg_processors(path): + if self.pkuseg_seg: + data = ( + _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie), + self.pkuseg_seg.postprocesser.do_process, + sorted(list(self.pkuseg_seg.postprocesser.common_words)), + sorted(list(self.pkuseg_seg.postprocesser.other_words)), + ) + srsly.write_msgpack(path, data) + + serializers = OrderedDict( + ( + ("cfg", lambda p: srsly.write_json(p, self._get_config())), + ("pkuseg_model", lambda p: save_pkuseg_model(p)), + ("pkuseg_processors", lambda p: save_pkuseg_processors(p)), + ) + ) + return util.to_disk(path, serializers, []) + + def from_disk(self, path, **kwargs): + path = util.ensure_path(path) + + def load_pkuseg_model(path): + try: + import pkuseg + except ImportError: + if self.use_pkuseg: + raise ImportError( + "pkuseg not installed. To use this model, " + + _PKUSEG_INSTALL_MSG + ) + if path.exists(): + self.pkuseg_seg = pkuseg.pkuseg(path) + + def load_pkuseg_processors(path): + try: + import pkuseg + except ImportError: + if self.use_pkuseg: + raise ImportError(self._pkuseg_install_msg) + if self.pkuseg_seg: + data = srsly.read_msgpack(path) + (user_dict, do_process, common_words, other_words) = data + self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) + self.pkuseg_seg.postprocesser.do_process = do_process + self.pkuseg_seg.postprocesser.common_words = set(common_words) + self.pkuseg_seg.postprocesser.other_words = set(other_words) + + serializers = OrderedDict( + ( + ("cfg", lambda p: self._set_config(srsly.read_json(p))), + ("pkuseg_model", lambda p: load_pkuseg_model(p)), + ("pkuseg_processors", lambda p: load_pkuseg_processors(p)), + ) + ) + util.from_disk(path, serializers, []) class ChineseDefaults(Language.Defaults): @@ -89,10 +282,11 @@ class ChineseDefaults(Language.Defaults): tag_map = TAG_MAP writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} use_jieba = True + use_pkuseg = False @classmethod - def create_tokenizer(cls, nlp=None): - return ChineseTokenizer(cls, nlp) + def create_tokenizer(cls, nlp=None, config={}): + return ChineseTokenizer(cls, nlp, config=config) class Chinese(Language): @@ -103,4 +297,13 @@ class Chinese(Language): return self.tokenizer(text) +def _get_pkuseg_trie_data(node, path=""): + data = [] + for c, child_node in sorted(node.children.items()): + data.extend(_get_pkuseg_trie_data(child_node, path + c)) + if node.isword: + data.append((path, node.usertag)) + return data + + __all__ = ["Chinese"] diff --git a/spacy/language.py b/spacy/language.py index 56619080d..f5eff2ae9 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -969,7 +969,7 @@ class Language(object): serializers = OrderedDict() serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["tokenizer"] = lambda: 
self.tokenizer.to_bytes(exclude=["vocab"]) - serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) + serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items()))) for name, proc in self.pipeline: if name in exclude: continue diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 43c3152a0..0f14f0a27 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -231,10 +231,22 @@ def yo_tokenizer(): @pytest.fixture(scope="session") -def zh_tokenizer(): +def zh_tokenizer_char(): + return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False}) + + +@pytest.fixture(scope="session") +def zh_tokenizer_jieba(): pytest.importorskip("jieba") return get_lang_class("zh").Defaults.create_tokenizer() + +@pytest.fixture(scope="session") +def zh_tokenizer_pkuseg(): + pytest.importorskip("pkuseg") + return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}) + + @pytest.fixture(scope="session") def hy_tokenizer(): - return get_lang_class("hy").Defaults.create_tokenizer() \ No newline at end of file + return get_lang_class("hy").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py new file mode 100644 index 000000000..58133a88e --- /dev/null +++ b/spacy/tests/lang/zh/test_serialize.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.zh import Chinese +from ...util import make_tempdir + + +def zh_tokenizer_serialize(zh_tokenizer): + tokenizer_bytes = zh_tokenizer.to_bytes() + nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}}) + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + with make_tempdir() as d: + file_path = d / "tokenizer" + zh_tokenizer.to_disk(file_path) + nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}}) + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + +def test_zh_tokenizer_serialize_char(zh_tokenizer_char): + zh_tokenizer_serialize(zh_tokenizer_char) + + +def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba): + zh_tokenizer_serialize(zh_tokenizer_jieba) + + +def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg): + zh_tokenizer_serialize(zh_tokenizer_pkuseg) + + +@pytest.mark.slow +def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): + nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}}) + zh_tokenizer_serialize(nlp.tokenizer) diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py index 235f597a5..3a3ccbdde 100644 --- a/spacy/tests/lang/zh/test_text.py +++ b/spacy/tests/lang/zh/test_text.py @@ -19,7 +19,7 @@ import pytest (",", False), ], ) -def test_lex_attrs_like_number(zh_tokenizer, text, match): - tokens = zh_tokenizer(text) +def test_lex_attrs_like_number(zh_tokenizer_jieba, text, match): + tokens = zh_tokenizer_jieba(text) assert len(tokens) == 1 assert tokens[0].like_num == match diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index 36d94beb5..bff7b1ed1 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -5,27 +5,41 @@ import pytest # fmt: off -TOKENIZER_TESTS = [ - ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。", +TEXTS = ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",) 
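Putting the serialization pieces together, the round-trip exercised by `test_serialize.py` above looks like this in user code. A minimal sketch, assuming jieba is installed (the meta keys are the ones introduced in this patch):

```python
from spacy.lang.zh import Chinese

nlp = Chinese()  # default config uses jieba
tokenizer_bytes = nlp.tokenizer.to_bytes()

# restore the settings (and any pkuseg model data) into a fresh pipeline
nlp2 = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
nlp2.tokenizer.from_bytes(tokenizer_bytes)
assert nlp2.tokenizer.to_bytes() == tokenizer_bytes
```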
+JIEBA_TOKENIZER_TESTS = [ + (TEXTS[0], ['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多', '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做', '为', '母语', '。']), ] +PKUSEG_TOKENIZER_TESTS = [ + (TEXTS[0], + ['作为', '语言', '而言', ',', '为', '世界', '使用', '人数', '最多', + '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做为', + '母语', '。']), +] # fmt: on -@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) -def test_zh_tokenizer(zh_tokenizer, text, expected_tokens): - zh_tokenizer.use_jieba = False - tokens = [token.text for token in zh_tokenizer(text)] +@pytest.mark.parametrize("text", TEXTS) +def test_zh_tokenizer_char(zh_tokenizer_char, text): + tokens = [token.text for token in zh_tokenizer_char(text)] assert tokens == list(text) - zh_tokenizer.use_jieba = True - tokens = [token.text for token in zh_tokenizer(text)] + +@pytest.mark.parametrize("text,expected_tokens", JIEBA_TOKENIZER_TESTS) +def test_zh_tokenizer_jieba(zh_tokenizer_jieba, text, expected_tokens): + tokens = [token.text for token in zh_tokenizer_jieba(text)] assert tokens == expected_tokens -def test_extra_spaces(zh_tokenizer): +@pytest.mark.parametrize("text,expected_tokens", PKUSEG_TOKENIZER_TESTS) +def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens): + tokens = [token.text for token in zh_tokenizer_pkuseg(text)] + assert tokens == expected_tokens + + +def test_extra_spaces(zh_tokenizer_char): # note: three spaces after "I" - tokens = zh_tokenizer("I like cheese.") + tokens = zh_tokenizer_char("I like cheese.") assert tokens[1].orth_ == " " From b919844fce1fd3b02e69ff2f3d6cc786b12f74b0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 20 Apr 2020 20:33:13 +0200 Subject: [PATCH 052/131] Tidy up and fix alignment of landing cards (#5317) --- website/src/components/landing.js | 13 ++++++-- website/src/styles/landing.module.sass | 5 +++ website/src/widgets/landing.js | 44 +++++++++++--------------- 3 files changed, 34 insertions(+), 28 deletions(-) diff --git a/website/src/components/landing.js b/website/src/components/landing.js index 16c342e3f..fb03d2845 100644 --- a/website/src/components/landing.js +++ b/website/src/components/landing.js @@ -46,10 +46,17 @@ export const LandingGrid = ({ cols = 3, blocks = false, children }) => ( export const LandingCol = ({ children }) =>
<div className={classes.col}>{children}</div>

-export const LandingCard = ({ title, children }) => (
+export const LandingCard = ({ title, button, url, children }) => (
     <div className={classes.card}>
-        {title && <H3>{title}</H3>}
-        {children}
+        <div className={classes.cardText}>
+            {title && <H3>{title}</H3>}
+            {children}
+        </div>
+        {button && url && (
+            <LandingButton to={url}>{button}</LandingButton>
+        )}
     </div>
 )

diff --git a/website/src/styles/landing.module.sass b/website/src/styles/landing.module.sass
index d7340229b..e36e36c0a 100644
--- a/website/src/styles/landing.module.sass
+++ b/website/src/styles/landing.module.sass
@@ -49,12 +49,17 @@
     margin-bottom: -25rem

 .card
+    display: flex
+    flex-direction: column
     padding: 3rem 2.5rem
     background: var(--color-back)
     border-radius: var(--border-radius)
     box-shadow: var(--box-shadow)
     margin-bottom: 3rem

+.card-text
+    flex: 100%
+
 .button
     width: 100%

diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 2dc5d40dc..9aeec0cdc 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -79,34 +79,28 @@ const Landing = ({ data }) => {
                     in Python
-                <LandingCard title="Get things done">
-                    <p>
-                        spaCy is designed to help you do real work — to build real products, or
-                        gather real insights. The library respects your time, and tries to avoid
-                        wasting it. It's easy to install, and its API is simple and productive. We
-                        like to think of spaCy as the Ruby on Rails of Natural Language Processing.
-                    </p>
-                    <LandingButton to="/usage/spacy-101">
-                        Get started
-                    </LandingButton>
-                </LandingCard>
+                <LandingCard
+                    title="Get things done"
+                    button="Get started"
+                    url="/usage/spacy-101"
+                >
+                    spaCy is designed to help you do real work — to build real products, or gather
+                    real insights. The library respects your time, and tries to avoid wasting it.
+                    It's easy to install, and its API is simple and productive. We like to think of
+                    spaCy as the Ruby on Rails of Natural Language Processing.
+                </LandingCard>
-                <LandingCard title="Fastest in the world">
-                    <p>
-                        spaCy excels at large-scale information extraction tasks. It's written from
-                        the ground up in carefully memory-managed Cython. Independent research in
-                        2015 found spaCy to be the fastest in the world. If your application needs
-                        to process entire web dumps, spaCy is the library you want to be using.
-                    </p>
-                    <LandingButton to="/usage/facts-figures">
-                        Facts & Figures
-                    </LandingButton>
-                </LandingCard>
+                <LandingCard
+                    title="Fastest in the world"
+                    button="Facts & Figures"
+                    url="/usage/facts-figures"
+                >
+                    spaCy excels at large-scale information extraction tasks. It's written from the
+                    ground up in carefully memory-managed Cython. Independent research in 2015 found
+                    spaCy to be the fastest in the world. If your application needs to process
+                    entire web dumps, spaCy is the library you want to be using.
+                </LandingCard>
-                <LandingCard title="Deep learning">
-                    <p>
-                        spaCy is the best way to prepare text for deep learning. It interoperates
-                        seamlessly with TensorFlow, PyTorch, scikit-learn, Gensim and the rest of
-                        Python's awesome AI ecosystem. With spaCy, you can easily construct
-                        linguistically sophisticated statistical models for a variety of NLP
-                        problems.
-                    </p>
-                    <LandingButton to="/usage/training">
-                        Read more
-                    </LandingButton>
-                </LandingCard>
+                <LandingCard
+                    title="Deep learning"
+                    button="Read more"
+                    url="/usage/training"
+                >
+                    spaCy is the best way to prepare text for deep learning. It interoperates
+                    seamlessly with TensorFlow, PyTorch, scikit-learn, Gensim and the rest of
+                    Python's awesome AI ecosystem. With spaCy, you can easily construct
+                    linguistically sophisticated statistical models for a variety of NLP problems.
+                </LandingCard>
From bf5c13d17021540bd30fbbb1c251984b5d8f1fc0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 20 Apr 2020 22:06:53 +0200 Subject: [PATCH 053/131] Modify jieba install message (#5328) Modify jieba install message to instruct the user to use `ChineseDefaults.use_jieba = False` so that it's possible to load pkuseg-only models without jieba installed. --- spacy/lang/zh/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 2cf00d389..701e696a4 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -30,8 +30,10 @@ def try_jieba_import(use_jieba): except ImportError: if use_jieba: msg = ( - "Jieba not installed. Either set Chinese.use_jieba = False, " - "or install it https://github.com/fxsjy/jieba" + "Jieba not installed. Either set the default to False with " + "`from spacy.lang.zh import ChineseDefaults; ChineseDefaults.use_jieba = False`, " + "or install it with `pip install jieba` or from " + "https://github.com/fxsjy/jieba" ) raise ImportError(msg) From 521f3610527998e3ccbd7591f1df95e66ed56350 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 21 Apr 2020 19:31:03 +0200 Subject: [PATCH 054/131] Switch to new gold.align method (#5334) * Switch from original `_align` to new simpler alignment algorithm from #4526 * Remove alignment normalizations beyond whitespace and lowercasing --- setup.py | 1 - spacy/_align.pyx | 255 -------------------------------------- spacy/gold.pyx | 54 +------- spacy/tests/test_align.py | 79 ------------ spacy/tests/test_gold.py | 3 +- 5 files changed, 2 insertions(+), 390 deletions(-) delete mode 100644 spacy/_align.pyx delete mode 100644 spacy/tests/test_align.py diff --git a/setup.py b/setup.py index 1156e7cde..62a09aa73 100755 --- a/setup.py +++ b/setup.py @@ -31,7 +31,6 @@ PACKAGES = find_packages() MOD_NAMES = [ - "spacy._align", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", diff --git a/spacy/_align.pyx b/spacy/_align.pyx deleted file mode 100644 index 6786ec7ba..000000000 --- a/spacy/_align.pyx +++ /dev/null @@ -1,255 +0,0 @@ -# cython: infer_types=True -'''Do Levenshtein alignment, for evaluation of tokenized input. - -Random notes: - - r i n g - 0 1 2 3 4 -r 1 0 1 2 3 -a 2 1 1 2 3 -n 3 2 2 1 2 -g 4 3 3 2 1 - -0,0: (1,1)=min(0+0,1+1,1+1)=0 S -1,0: (2,1)=min(1+1,0+1,2+1)=1 D -2,0: (3,1)=min(2+1,3+1,1+1)=2 D -3,0: (4,1)=min(3+1,4+1,2+1)=3 D -0,1: (1,2)=min(1+1,2+1,0+1)=1 D -1,1: (2,2)=min(0+1,1+1,1+1)=1 S -2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I -3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I -0,2: (1,3)=min(2+1,3+1,1+1)=2 I -1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I -2,2: (3,3) -3,2: (4,3) -At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?" - -We know the costs to transition: - -S[:i] -> T[:j] (at D[i,j]) -S[:i+1] -> T[:j] (at D[i+1,j]) -S[:i] -> T[:j+1] (at D[i,j+1]) - -Further, now we can transform: -S[:i+1] -> S[:i] (DEL) for 1, -T[:j+1] -> T[:j] (INS) for 1. -S[i+1] -> T[j+1] (SUB) for 0 or 1 - -Therefore we have the costs: -SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->S[j]) -i.e. D[i, j] + S[i+1] != T[j+1] -INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j]) -i.e. D[i+1,j] + 1 -DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) -i.e. 
D[i,j+1] + 1 - - Source string S has length m, with index i - Target string T has length n, with index j - - Output two alignment vectors: i2j (length m) and j2i (length n) - # function LevenshteinDistance(char s[1..m], char t[1..n]): - # for all i and j, d[i,j] will hold the Levenshtein distance between - # the first i characters of s and the first j characters of t - # note that d has (m+1)*(n+1) values - # set each element in d to zero - ring rang - - r i n g - - 0 0 0 0 0 - r 0 0 0 0 0 - a 0 0 0 0 0 - n 0 0 0 0 0 - g 0 0 0 0 0 - - # source prefixes can be transformed into empty string by - # dropping all characters - # d[i, 0] := i - ring rang - - r i n g - - 0 0 0 0 0 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - - # target prefixes can be reached from empty source prefix - # by inserting every character - # d[0, j] := j - - r i n g - - 0 1 2 3 4 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - -''' -from __future__ import unicode_literals -from libc.stdint cimport uint32_t -import numpy -cimport numpy as np -from .compat import unicode_ -from murmurhash.mrmr cimport hash32 - - -def align(S, T): - cdef int m = len(S) - cdef int n = len(T) - cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32') - cdef np.ndarray i2j = numpy.zeros((m,), dtype='i') - cdef np.ndarray j2i = numpy.zeros((n,), dtype='i') - - cdef np.ndarray S_arr = _convert_sequence(S) - cdef np.ndarray T_arr = _convert_sequence(T) - - fill_matrix(matrix.data, - S_arr.data, m, T_arr.data, n) - fill_i2j(i2j, matrix) - fill_j2i(j2i, matrix) - for i in range(i2j.shape[0]): - if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]): - i2j[i] = -1 - for j in range(j2i.shape[0]): - if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]): - j2i[j] = -1 - return matrix[-1,-1], i2j, j2i, matrix - - -def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths): - '''Let's say we had: - - Guess: [aa bb cc dd] - Truth: [aa bbcc dd] - i2j: [0, None, -2, 2] - j2i: [0, -2, 3] - - We want: - - i2j_multi: {1: 1, 2: 1} - j2i_multi: {} - ''' - i2j_miss = _get_regions(i2j, i_lengths) - j2i_miss = _get_regions(j2i, j_lengths) - - i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths) - return i2j_multi, j2i_multi - - -def _get_regions(alignment, lengths): - regions = {} - start = None - offset = 0 - for i in range(len(alignment)): - if alignment[i] < 0: - if start is None: - start = offset - regions.setdefault(start, []) - regions[start].append(i) - else: - start = None - offset += lengths[i] - return regions - - -def _get_mapping(miss1, miss2, lengths1, lengths2): - i2j = {} - j2i = {} - for start, region1 in miss1.items(): - if not region1 or start not in miss2: - continue - region2 = miss2[start] - if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2): - j = region2.pop(0) - buff = [] - # Consume tokens from region 1, until we meet the length of the - # first token in region2. If we do, align the tokens. If - # we exceed the length, break. 
- while region1: - buff.append(region1.pop(0)) - if sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - j += 1 - buff = [] - elif sum(lengths1[i] for i in buff) > lengths2[j]: - break - else: - if buff and sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - return i2j, j2i - - -def _convert_sequence(seq): - if isinstance(seq, numpy.ndarray): - return numpy.ascontiguousarray(seq, dtype='uint32_t') - cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32') - cdef bytes item_bytes - for i, item in enumerate(seq): - if item == "``": - item = '"' - elif item == "''": - item = '"' - if isinstance(item, unicode): - item_bytes = item.encode('utf8') - else: - item_bytes = item - output[i] = hash32(item_bytes, len(item_bytes), 0) - return output - - -cdef void fill_matrix(int* D, - const int* S, int m, const int* T, int n) nogil: - m1 = m+1 - n1 = n+1 - for i in range(m1*n1): - D[i] = 0 - - for i in range(m1): - D[i*n1] = i - - for j in range(n1): - D[j] = j - - cdef int sub_cost, ins_cost, del_cost - for j in range(n): - for i in range(m): - i_j = i*n1 + j - i1_j1 = (i+1)*n1 + j+1 - i1_j = (i+1)*n1 + j - i_j1 = i*n1 + j+1 - if S[i] != T[j]: - sub_cost = D[i_j] + 1 - else: - sub_cost = D[i_j] - del_cost = D[i_j1] + 1 - ins_cost = D[i1_j] + 1 - best = min(min(sub_cost, ins_cost), del_cost) - D[i1_j1] = best - - -cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *: - j = D.shape[1]-2 - cdef int i = D.shape[0]-2 - while i >= 0: - while D[i+1, j] < D[i+1, j+1]: - j -= 1 - if D[i, j+1] < D[i+1, j+1]: - i2j[i] = -1 - else: - i2j[i] = j - j -= 1 - i -= 1 - -cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *: - i = D.shape[0]-2 - cdef int j = D.shape[1]-2 - while j >= 0: - while D[i, j+1] < D[i+1, j+1]: - i -= 1 - if D[i+1, j] < D[i+1, j+1]: - j2i[j] = -1 - else: - j2i[j] = i - i -= 1 - j -= 1 diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 07fd3bdd0..a41f06898 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -21,7 +21,6 @@ from .util import minibatch, itershuffle from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek -USE_NEW_ALIGN = False punct_re = re.compile(r"\W") @@ -73,57 +72,8 @@ def merge_sents(sents): return [(m_deps, (m_cats, m_brackets))] -_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")] - - def _normalize_for_alignment(tokens): - tokens = [w.replace(" ", "").lower() for w in tokens] - output = [] - for token in tokens: - token = token.replace(" ", "").lower() - for before, after in _ALIGNMENT_NORM_MAP: - token = token.replace(before, after) - output.append(token) - return output - - -def _align_before_v2_2_2(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations, using the Levenshtein - algorithm. The alignment is case-insensitive. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. 
- * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - from . import _align - if tokens_a == tokens_b: - alignment = numpy.arange(len(tokens_a)) - return 0, alignment, alignment, {}, {} - tokens_a = [w.replace(" ", "").lower() for w in tokens_a] - tokens_b = [w.replace(" ", "").lower() for w in tokens_b] - cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b) - i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a], - [len(w) for w in tokens_b]) - for i, j in list(i2j_multi.items()): - if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j: - i2j[i] = j - i2j_multi.pop(i) - for j, i in list(j2i_multi.items()): - if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i: - j2i[j] = i - j2i_multi.pop(j) - return cost, i2j, j2i, i2j_multi, j2i_multi + return [w.replace(" ", "").lower() for w in tokens] def align(tokens_a, tokens_b): @@ -144,8 +94,6 @@ def align(tokens_a, tokens_b): * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other direction. """ - if not USE_NEW_ALIGN: - return _align_before_v2_2_2(tokens_a, tokens_b) tokens_a = _normalize_for_alignment(tokens_a) tokens_b = _normalize_for_alignment(tokens_b) cost = 0 diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py deleted file mode 100644 index d6bbab04e..000000000 --- a/spacy/tests/test_align.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy._align import align, multi_align - - -@pytest.mark.parametrize( - "string1,string2,cost", - [ - ("hello", "hell", 1), - ("rat", "cat", 1), - ("rat", "rat", 0), - ("rat", "catsie", 4), - ("t", "catsie", 5), - ], -) -def test_align_costs(string1, string2, cost): - output_cost, i2j, j2i, matrix = align(string1, string2) - assert output_cost == cost - - -@pytest.mark.parametrize( - "string1,string2,i2j", - [ - ("hello", "hell", [0, 1, 2, 3, -1]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2]), - ("t", "catsie", [2]), - ], -) -def test_align_i2j(string1, string2, i2j): - output_cost, output_i2j, j2i, matrix = align(string1, string2) - assert list(output_i2j) == i2j - - -@pytest.mark.parametrize( - "string1,string2,j2i", - [ - ("hello", "hell", [0, 1, 2, 3]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2, -1, -1, -1]), - ("t", "catsie", [-1, -1, 0, -1, -1, -1]), - ], -) -def test_align_i2j_2(string1, string2, j2i): - output_cost, output_i2j, output_j2i, matrix = align(string1, string2) - assert list(output_j2i) == j2i - - -def test_align_strings(): - words1 = ["hello", "this", "is", "test!"] - words2 = ["hellothis", "is", "test", "!"] - cost, i2j, j2i, matrix = align(words1, words2) - assert cost == 4 - assert list(i2j) == [-1, -1, 1, -1] - assert list(j2i) == [-1, 2, -1, -1] - - -def test_align_many_to_one(): - words1 = ["a", "b", "c", "d", "e", "f", "g", "h"] - words2 = ["ab", "bc", "e", "fg", "h"] - cost, i2j, j2i, matrix = align(words1, words2) - assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4] - lengths1 = [len(w) for w in words1] - lengths2 = [len(w) for w in words2] - i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2) - assert i2j_multi[0] == 0 - assert i2j_multi[1] == 0 - assert i2j_multi[2] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[3] == 1 - 
assert i2j_multi[5] == 3 - assert i2j_multi[6] == 3 - - assert j2i_multi[0] == 1 - assert j2i_multi[1] == 3 diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index fbdb3155b..b546e079b 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -177,13 +177,12 @@ def test_roundtrip_docs_to_json(): assert cats["BAKING"] == goldparse.cats["BAKING"] -@pytest.mark.skip(reason="skip while we have backwards-compatible alignment") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), ( - ["a", "b", "``", "c"], + ["a", "b", '"', "c"], ['ab"', "c"], (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), ), From 481574cbc865211e35faf6e36f5ece203ee59e60 Mon Sep 17 00:00:00 2001 From: Mike <34043825+Mlawrence95@users.noreply.github.com> Date: Tue, 21 Apr 2020 11:35:12 -0700 Subject: [PATCH 055/131] [minor doc change] embedding vis. link is broken in `website/docs/usage/examples.md` (#5325) * The embedding vis. link is broken The first link seems to be reasonable for now unless someone has an updated embedding vis they want to share? * contributor agreement * Update Mlawrence95.md * Update website/docs/usage/examples.md Co-Authored-By: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- .github/contributors/Mlawrence95.md | 106 ++++++++++++++++++++++++++++ website/docs/usage/examples.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/Mlawrence95.md diff --git a/.github/contributors/Mlawrence95.md b/.github/contributors/Mlawrence95.md new file mode 100644 index 000000000..505d6c16f --- /dev/null +++ b/.github/contributors/Mlawrence95.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ x ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Mike Lawrence | +| Company name (if applicable) | NA | +| Title or role (if applicable) | NA | +| Date | April 17, 2020 | +| GitHub username | Mlawrence95 | +| Website (optional) | | diff --git a/website/docs/usage/examples.md b/website/docs/usage/examples.md index 180b02ff4..96dc7627d 100644 --- a/website/docs/usage/examples.md +++ b/website/docs/usage/examples.md @@ -162,7 +162,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.p This script lets you load any spaCy model containing word vectors into [TensorBoard](https://projector.tensorflow.org/) to create an -[embedding visualization](https://www.tensorflow.org/versions/r1.1/get_started/embedding_viz). +[embedding visualization](https://github.com/tensorflow/tensorboard/blob/master/docs/tensorboard_projector_plugin.ipynb). ```python https://github.com/explosion/spaCy/tree/master/examples/vectors_tensorboard.py From 84e06f9fb767910011ffeff69d5895ac6eeebf23 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 23 Apr 2020 16:58:23 +0200 Subject: [PATCH 056/131] Improve GoldParse NER alignment (#5335) Improve GoldParse NER alignment by including all cases where the start and end of the NER span can be aligned, regardless of internal tokenization differences. To do this, convert BILUO tags to character offsets, check start/end alignment with `doc.char_span()`, and assign the BILUO tags for the aligned spans. Alignment for `O/-` tags is handled through the one-to-one and multi alignments. --- spacy/errors.py | 2 + spacy/gold.pyx | 86 ++++++++++++++++++++++++++++------------ spacy/tests/test_gold.py | 70 ++++++++++++++++++++++++++++++++ spacy/util.py | 2 +- 4 files changed, 133 insertions(+), 27 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b1cdb89ec..e52241be1 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -110,6 +110,8 @@ class Warnings(object): W028 = ("Doc.from_array was called with a vector of type '{type}', " "but is expecting one of type 'uint64' instead. This may result " "in problems with the vocab further on in the pipeline.") + W029 = ("Unable to align tokens with entities from character offsets. " + "Discarding entity annotation for the text: {text}.") @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index a41f06898..8b61de683 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -648,6 +648,9 @@ cdef class GoldParse: # if self.lenght > 0, this is modified latter. self.orig_annot = [] + # temporary doc for aligning entity annotation + entdoc = None + # avoid allocating memory if the doc does not contain any tokens if self.length > 0: if words is None: @@ -670,7 +673,25 @@ cdef class GoldParse: entities = [(ent if ent is not None else "-") for ent in entities] if not isinstance(entities[0], basestring): # Assume we have entities specified by character offset. - entities = biluo_tags_from_offsets(doc, entities) + # Create a temporary Doc corresponding to provided words + # (to preserve gold tokenization) and text (to preserve + # character offsets). + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + entdoc_entities = biluo_tags_from_offsets(entdoc, entities) + # There may be some additional whitespace tokens in the + # temporary doc, so check that the annotations align with + # the provided words while building a list of BILUO labels. 
+ entities = [] + words_offset = 0 + for i in range(len(entdoc_words)): + if words[i + words_offset] == entdoc_words[i]: + entities.append(entdoc_entities[i]) + else: + words_offset -= 1 + if len(entities) != len(words): + user_warning(Warnings.W029.format(text=doc.text)) + entities = ["-" for _ in words] # These are filled by the tagger/parser/entity recogniser self.c.tags = self.mem.alloc(len(doc), sizeof(int)) @@ -697,7 +718,8 @@ cdef class GoldParse: # If we under-segment, we'll have one predicted word that covers a # sequence of gold words. # If we "mis-segment", we'll have a sequence of predicted words covering - # a sequence of gold words. That's many-to-many -- we don't do that. + # a sequence of gold words. That's many-to-many -- we don't do that + # except for NER spans where the start and end can be aligned. cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] @@ -720,7 +742,6 @@ cdef class GoldParse: self.tags[i] = tags[i2j_multi[i]] self.morphology[i] = morphology[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) - is_first = i2j_multi[i] != i2j_multi.get(i-1) # Set next word in multi-token span as head, until last if not is_last: self.heads[i] = i+1 @@ -730,30 +751,10 @@ cdef class GoldParse: if head_i: self.heads[i] = self.gold_to_cand[head_i] self.labels[i] = deps[i2j_multi[i]] - # Now set NER...This is annoying because if we've split - # got an entity word split into two, we need to adjust the - # BILUO tags. We can't have BB or LL etc. - # Case 1: O -- easy. ner_tag = entities[i2j_multi[i]] - if ner_tag == "O": - self.ner[i] = "O" - # Case 2: U. This has to become a B I* L sequence. - elif ner_tag.startswith("U-"): - if is_first: - self.ner[i] = ner_tag.replace("U-", "B-", 1) - elif is_last: - self.ner[i] = ner_tag.replace("U-", "L-", 1) - else: - self.ner[i] = ner_tag.replace("U-", "I-", 1) - # Case 3: L. If not last, change to I. - elif ner_tag.startswith("L-"): - if is_last: - self.ner[i] = ner_tag - else: - self.ner[i] = ner_tag.replace("L-", "I-", 1) - # Case 4: I. 
Stays correct - elif ner_tag.startswith("I-"): - self.ner[i] = ner_tag + # Assign O/- for many-to-one O/- NER tags + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] @@ -764,6 +765,39 @@ cdef class GoldParse: self.heads[i] = self.gold_to_cand[heads[gold_i]] self.labels[i] = deps[gold_i] self.ner[i] = entities[gold_i] + # Assign O/- for one-to-many O/- NER tags + for j, cand_j in enumerate(self.gold_to_cand): + if cand_j is None: + if j in j2i_multi: + i = j2i_multi[j] + ner_tag = entities[j] + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag + + # If there is entity annotation and some tokens remain unaligned, + # align all entities at the character level to account for all + # possible token misalignments within the entity spans + if any([e not in ("O", "-") for e in entities]) and None in self.ner: + # If the temporary entdoc wasn't created above, initialize it + if not entdoc: + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + # Get offsets based on gold words and BILUO entities + entdoc_offsets = offsets_from_biluo_tags(entdoc, entities) + aligned_offsets = [] + aligned_spans = [] + # Filter offsets to identify those that align with doc tokens + for offset in entdoc_offsets: + span = doc.char_span(offset[0], offset[1]) + if span and not span.text.isspace(): + aligned_offsets.append(offset) + aligned_spans.append(span) + # Convert back to BILUO for doc tokens and assign NER for all + # aligned spans + biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None) + for span in aligned_spans: + for i in range(span.start, span.end): + self.ner[i] = biluo_tags[i] # Prevent whitespace that isn't within entities from being tagged as # an entity. 
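As a quick illustration of the offset-based alignment described in this commit
message, the following sketch uses only public helpers
(`spacy.gold.biluo_tags_from_offsets` and `Doc.char_span`). It is an editorial
example, not part of the patch; the sentence and character offsets are
assumptions chosen to mirror the new tests below.

```python
import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp("I flew to San Francisco Valley.")

# Character offsets for "San Francisco Valley" (an assumed example span).
entities = [(10, 30, "LOC")]

# Doc.char_span() returns None unless both offsets fall on token
# boundaries; this is the same start/end check the alignment relies on
# to decide whether an entity span survives a tokenization mismatch.
assert doc.char_span(10, 30) is not None

print(biluo_tags_from_offsets(doc, entities))
# ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']
```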
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index b546e079b..fc9e624eb 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -6,6 +6,7 @@ from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo from spacy.gold import GoldCorpus, docs_to_json, align from spacy.lang.en import English from spacy.tokens import Doc +from spacy.util import get_words_and_spaces from .util import make_tempdir import pytest import srsly @@ -59,6 +60,75 @@ def test_gold_biluo_misalign(en_vocab): assert tags == ["O", "O", "O", "-", "-", "-"] +def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): + # one-to-many + words = ["I", "flew to", "San Francisco Valley", "."] + spaces = [True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, + words=["I", "flew", "to", "San", "Francisco", "Valley", "."], + entities=entities, + ) + assert gp.ner == ["O", "O", "U-LOC", "O"] + + # many-to-one + words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + spaces = [True, True, True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities + ) + assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + + # misaligned + words = ["I flew", "to", "San Francisco", "Valley", "."] + spaces = [True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, words=["I", "flew to", "San", "Francisco Valley", "."], entities=entities, + ) + assert gp.ner == ["O", "O", "B-LOC", "L-LOC", "O"] + + # additional whitespace tokens in GoldParse words + words, spaces = get_words_and_spaces( + ["I", "flew", "to", "San Francisco", "Valley", "."], + "I flew to San Francisco Valley.", + ) + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, + words=["I", "flew", " ", "to", "San Francisco Valley", "."], + entities=entities, + ) + assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] + + # from issue #4791 + data = ( + "I'll return the ₹54 amount", + { + "words": ["I", "'ll", "return", "the", "₹", "54", "amount",], + "entities": [(16, 19, "MONEY")], + }, + ) + gp = GoldParse(en_tokenizer(data[0]), **data[1]) + assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"] + + data = ( + "I'll return the $54 amount", + { + "words": ["I", "'ll", "return", "the", "$", "54", "amount",], + "entities": [(16, 19, "MONEY")], + }, + ) + gp = GoldParse(en_tokenizer(data[0]), **data[1]) + assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"] + + def test_roundtrip_offsets_biluo_conversion(en_tokenizer): text = "I flew to Silicon Valley via London." 
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] diff --git a/spacy/util.py b/spacy/util.py index 1c627af46..a5e27a210 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -758,7 +758,7 @@ def get_serialization_exclude(serializers, exclude, kwargs): def get_words_and_spaces(words, text): - if "".join("".join(words).split())!= "".join(text.split()): + if "".join("".join(words).split()) != "".join(text.split()): raise ValueError(Errors.E194.format(text=text, words=words)) text_words = [] text_spaces = [] From fc91660aa289f4e8f2e809a8179e13aa55799afd Mon Sep 17 00:00:00 2001 From: sabiqueqb Date: Mon, 27 Apr 2020 13:15:08 +0530 Subject: [PATCH 057/131] Gh 5339 language class for malayalam (#5342) * Initialize Malayalam Language class * Add lex_attrs and examples for Malayalam * Add spaCy Contributor Agreement * Add test for ml tokenizer --- .github/contributors/sabiqueqb.md | 106 ++++++++++++++++++++++++++++++ spacy/lang/ml/__init__.py | 18 +++++ spacy/lang/ml/examples.py | 19 ++++++ spacy/lang/ml/lex_attrs.py | 80 ++++++++++++++++++++++ spacy/lang/ml/stop_words.py | 18 +++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/ml/test_text.py | 16 +++++ 7 files changed, 262 insertions(+) create mode 100644 .github/contributors/sabiqueqb.md create mode 100644 spacy/lang/ml/__init__.py create mode 100644 spacy/lang/ml/examples.py create mode 100644 spacy/lang/ml/lex_attrs.py create mode 100644 spacy/lang/ml/stop_words.py create mode 100644 spacy/tests/lang/ml/test_text.py diff --git a/.github/contributors/sabiqueqb.md b/.github/contributors/sabiqueqb.md new file mode 100644 index 000000000..da0f2f2a2 --- /dev/null +++ b/.github/contributors/sabiqueqb.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Sabique Ahammed Lava | +| Company name (if applicable) | QBurst | +| Title or role (if applicable) | Senior Engineer | +| Date | 24 Apr 2020 | +| GitHub username | sabiqueqb | +| Website (optional) | | diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py new file mode 100644 index 000000000..d052ded1b --- /dev/null +++ b/spacy/lang/ml/__init__.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + +from ...language import Language + + +class MalayalamDefaults(Language.Defaults): + stop_words = STOP_WORDS + + +class Malayalam(Language): + lang = "ml" + Defaults = MalayalamDefaults + + +__all__ = ["Malayalam"] diff --git a/spacy/lang/ml/examples.py b/spacy/lang/ml/examples.py new file mode 100644 index 000000000..a2a0ed10e --- /dev/null +++ b/spacy/lang/ml/examples.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ml.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക", + "പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി", + "എന്താണ്‌ കവാടങ്ങൾ?", + "ചുരുക്കത്തിൽ വിക്കിപീഡിയയുടെ ഉള്ളടക്കത്തിലേക്കുള്ള പടിപ്പുരകളാണ്‌‌ കവാടങ്ങൾ. അവ ലളിതവും വായനക്കാരനെ ആകർഷിക്കുന്നതുമായിരിക്കും", + "പതിനൊന്നുപേർ വീതമുള്ള രണ്ടു ടീമുകൾ കളിക്കുന്ന സംഘകായിക വിനോദമാണു ക്രിക്കറ്റ്", +] diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py new file mode 100644 index 000000000..345da8126 --- /dev/null +++ b/spacy/lang/ml/lex_attrs.py @@ -0,0 +1,80 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + + +# reference 2: https://www.omniglot.com/language/numbers/malayalam.htm + +_num_words = [ + "പൂജ്യം ", + "ഒന്ന് ", + "രണ്ട് ", + "മൂന്ന് ", + "നാല്‌ ", + "അഞ്ച് ", + "ആറ് ", + "ഏഴ് ", + "എട്ട് ", + "ഒന്‍പത് ", + "പത്ത് ", + "പതിനൊന്ന്", + "പന്ത്രണ്ട്", + "പതി മൂന്നു", + "പതിനാല്", + "പതിനഞ്ച്", + "പതിനാറ്", + "പതിനേഴ്", + "പതിനെട്ട്", + "പത്തൊമ്പതു", + "ഇരുപത്", + "ഇരുപത്തിഒന്ന്", + "ഇരുപത്തിരണ്ട്‌", + "ഇരുപത്തിമൂന്ന്", + "ഇരുപത്തിനാല്", + "ഇരുപത്തിഅഞ്ചു", + "ഇരുപത്തിആറ്", + "ഇരുപത്തിഏഴ്", + "ഇരുപത്തിഎട്ടു", + "ഇരുപത്തിഒന്‍പത്", + "മുപ്പത്", + "മുപ്പത്തിഒന്ന്", + "മുപ്പത്തിരണ്ട്", + "മുപ്പത്തിമൂന്ന്", + "മുപ്പത്തിനാല്", + "മുപ്പത്തിഅഞ്ചു", + "മുപ്പത്തിആറ്", + "മുപ്പത്തിഏഴ്", + "മുപ്പത്തിഎട്ട്", + "മുപ്പത്തിഒന്‍പതു", + "നാല്‍പത്‌ ", + "അന്‍പത് ", + "അറുപത് ", + "എഴുപത് ", + "എണ്‍പത് ", + "തൊണ്ണൂറ് ", + "നുറ് ", + "ആയിരം ", + "പത്തുലക്ഷം" +] + + +def like_num(text): + """ + Check if text resembles a number + """ + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py new file mode 100644 index 000000000..4012571bc --- /dev/null +++ b/spacy/lang/ml/stop_words.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set( + + """ +അത് +ഇത് +ആയിരുന്നു +ആകുന്നു +വരെ +അന്നേരം +അന്ന് +ഇന്ന് +ആണ് 
+""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 0f14f0a27..2ba759a29 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -150,6 +150,11 @@ def lt_tokenizer(): return get_lang_class("lt").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ml_tokenizer(): + return get_lang_class("ml").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def nb_tokenizer(): return get_lang_class("nb").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/ml/test_text.py b/spacy/tests/lang/ml/test_text.py new file mode 100644 index 000000000..92eca6b21 --- /dev/null +++ b/spacy/tests/lang/ml/test_text.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_ml_tokenizer_handles_long_text(ml_tokenizer): + text = """അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക""" + tokens = ml_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)]) +def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length): + tokens = ml_tokenizer(text) + assert len(tokens) == length From 90c754024f079e0b7842acb826cc253db17c3cb3 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 27 Apr 2020 10:53:05 +0200 Subject: [PATCH 058/131] Update nlp.vectors to nlp.vocab.vectors (#5357) --- website/docs/api/vectors.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 93e747c1e..a4c36f8cd 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -111,7 +111,7 @@ Check whether a key has been mapped to a vector entry in the table. > > ```python > cat_id = nlp.vocab.strings["cat"] -> nlp.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,))) +> nlp.vocab.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,))) > assert cat_id in vectors > ``` @@ -315,7 +315,7 @@ performed in chunks, to avoid consuming too much memory. 
You can set the > > ```python > queries = numpy.asarray([numpy.random.uniform(-1, 1, (300,))]) -> most_similar = nlp.vectors.most_similar(queries, n=10) +> most_similar = nlp.vocab.vectors.most_similar(queries, n=10) > ``` | Name | Type | Description | From b2b7e1f37a1c9e9312006b39bfd3051ba83e1750 Mon Sep 17 00:00:00 2001 From: Punitvara Date: Mon, 27 Apr 2020 14:37:37 +0530 Subject: [PATCH 059/131] This PR adds Gujarati Language class along with (#5355) * This PR adds Gujarati Language class along with - stop words * Add test for gu tokenizer --- .github/contributors/punitvara.md | 107 ++++++++++++++++++++++++++++++ spacy/lang/gu/__init__.py | 18 +++++ spacy/lang/gu/examples.py | 22 ++++++ spacy/lang/gu/stop_words.py | 91 +++++++++++++++++++++++++ spacy/tests/conftest.py | 4 ++ spacy/tests/lang/gu/test_text.py | 20 ++++++ 6 files changed, 262 insertions(+) create mode 100644 .github/contributors/punitvara.md create mode 100644 spacy/lang/gu/__init__.py create mode 100644 spacy/lang/gu/examples.py create mode 100644 spacy/lang/gu/stop_words.py create mode 100644 spacy/tests/lang/gu/test_text.py diff --git a/.github/contributors/punitvara.md b/.github/contributors/punitvara.md new file mode 100644 index 000000000..dde810453 --- /dev/null +++ b/.github/contributors/punitvara.md @@ -0,0 +1,107 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Punit Vara | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-04-26 | +| GitHub username | punitvara | +| Website (optional) | https://punitvara.com | + diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py new file mode 100644 index 000000000..1f080c7c2 --- /dev/null +++ b/spacy/lang/gu/__init__.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + +from ...language import Language + + +class GujaratiDefaults(Language.Defaults): + stop_words = STOP_WORDS + + +class Gujarati(Language): + lang = "gu" + Defaults = GujaratiDefaults + + +__all__ = ["Gujarati"] diff --git a/spacy/lang/gu/examples.py b/spacy/lang/gu/examples.py new file mode 100644 index 000000000..202a8d022 --- /dev/null +++ b/spacy/lang/gu/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.gu.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.", + "તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું", + "કર્ણદેવ પહેલો સોલંકી વંશનો રાજા હતો", + "તેજપાળને બે પત્ની હતી", + "ગુજરાતમાં ભારતીય જનતા પક્ષનો ઉદય આ સમયગાળા દરમિયાન થયો", + "આંદોલનકારીઓએ ચીમનભાઇ પટેલના રાજીનામાની માંગણી કરી.", + "અહિયાં શું જોડાય છે?", + "મંદિરનો પૂર્વાભિમુખ ભાગ નાના મંડપ સાથે થોડો લંબચોરસ આકારનો છે.", +] diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py new file mode 100644 index 000000000..f641b5720 --- /dev/null +++ b/spacy/lang/gu/stop_words.py @@ -0,0 +1,91 @@ +# coding: utf8 +from __future__ import unicode_literals + +STOP_WORDS = set( + """ +એમ +આ +એ +રહી +છે +છો +હતા +હતું +હતી +હોય +હતો +શકે +તે +તેના +તેનું +તેને +તેની +તેઓ +તેમને +તેમના +તેમણે +તેમનું +તેમાં +અને +અહીં +થી +થઈ +થાય +જે + ને +કે +ના +ની +નો +ને +નું +શું +માં +પણ +પર +જેવા +જેવું +જાય +જેમ +જેથી +માત્ર +માટે +પરથી +આવ્યું +એવી +આવી +રીતે +સુધી +થાય +થઈ +સાથે +લાગે +હોવા +છતાં +રહેલા +કરી +કરે +કેટલા +કોઈ +કેમ +કર્યો +કર્યુ +કરે +સૌથી +ત્યારબાદ +તથા +દ્વારા +જુઓ +જાઓ +જ્યારે +ત્યારે +શકો +નથી +હવે +અથવા +થતો +દર +એટલો +પરંતુ +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 2ba759a29..e52c5155f 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -103,6 +103,10 @@ def ga_tokenizer(): return get_lang_class("ga").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def gu_tokenizer(): + return get_lang_class("gu").Defaults.create_tokenizer() + @pytest.fixture(scope="session") def he_tokenizer(): return get_lang_class("he").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/gu/test_text.py b/spacy/tests/lang/gu/test_text.py new file mode 100644 index 000000000..9f3ae45a4 --- /dev/null +++ b/spacy/tests/lang/gu/test_text.py @@ -0,0 +1,20 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +def test_gu_tokenizer_handlers_long_text(gu_tokenizer): + text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે""" + tokens = gu_tokenizer(text) + assert len(tokens) == 9 + +@pytest.mark.parametrize( + "text,length", + [ + ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), + ("ખેતરની ખેડ કરવામાં આવે છે.", 5), + ], +) +def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length): + tokens = gu_tokenizer(text) + 
assert len(tokens) == length From 9203d821ae798b67d84e42b319a310b876f3dc93 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 27 Apr 2020 13:01:54 +0200 Subject: [PATCH 060/131] Add 2 ini files in tests/lang (#5359) --- spacy/tests/lang/gu/__init__.py | 0 spacy/tests/lang/ml/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 spacy/tests/lang/gu/__init__.py create mode 100644 spacy/tests/lang/ml/__init__.py diff --git a/spacy/tests/lang/gu/__init__.py b/spacy/tests/lang/gu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ml/__init__.py b/spacy/tests/lang/ml/__init__.py new file mode 100644 index 000000000..e69de29bb From f8ac5b9f563050472aedc719950b4888c65ca4cc Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 27 Apr 2020 16:51:27 +0200 Subject: [PATCH 061/131] bugfix in span similarity (#5155) (#5358) * bugfix in span similarity * also rewrite doc.pyx for clarity * formatting Co-authored-by: Sofie Van Landeghem --- spacy/tests/regression/test_issue5152.py | 18 ++++++++++++++++++ spacy/tokens/doc.pyx | 15 ++++++++------- spacy/tokens/span.pyx | 6 ++++-- 3 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 spacy/tests/regression/test_issue5152.py diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py new file mode 100644 index 000000000..a9a57746d --- /dev/null +++ b/spacy/tests/regression/test_issue5152.py @@ -0,0 +1,18 @@ +from spacy.lang.en import English + + +def test_issue5152(): + # Test that the comparison between a Span and a Token, goes well + # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) + nlp = English() + text = nlp("Talk about being boring!") + text_var = nlp("Talk of being boring!") + y = nlp("Let") + + span = text[0:3] # Talk about being + span_2 = text[0:3] # Talk about being + span_3 = text_var[0:3] # Talk of being + token = y[0] # Let + assert span.similarity(token) == 0.0 + assert span.similarity(span_2) == 1.0 + assert span_2.similarity(span_3) < 1.0 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ec0cd66b8..f27115e6f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -387,13 +387,14 @@ cdef class Doc: if isinstance(other, (Lexeme, Token)) and self.length == 1: if self.c[0].lex.orth == other.orth: return 1.0 - elif isinstance(other, (Span, Doc)): - if len(self) == len(other): - for i in range(self.length): - if self[i].orth != other[i].orth: - break - else: - return 1.0 + elif isinstance(other, (Span, Doc)) and len(self) == len(other): + similar = True + for i in range(self.length): + if self[i].orth != other[i].orth: + similar = False + break + if similar: + return 1.0 if self.vocab.vectors.n_keys == 0: models_warning(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 35c70f236..9269700b0 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -324,11 +324,13 @@ cdef class Span: if len(self) == 1 and hasattr(other, "orth"): if self[0].orth == other.orth: return 1.0 - elif hasattr(other, "__len__") and len(self) == len(other): + elif isinstance(other, (Doc, Span)) and len(self) == len(other): + similar = True for i in range(len(self)): if self[i].orth != getattr(other[i], "orth", None): + similar = False break - else: + if similar: return 1.0 if self.vocab.vectors.n_keys == 0: models_warning(Warnings.W007.format(obj="Span")) From 
792aa7b6ab48ad40254102e5730c420e36822a70 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 27 Apr 2020 18:01:12 +0200 Subject: [PATCH 062/131] Remove references to textcat spans (#5360) Remove references to unimplemented `TextCategorizer` span labels in `GoldParse` and `Doc`. --- website/docs/api/doc.md | 2 +- website/docs/api/goldparse.md | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index ab85c1deb..7decc2278 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -653,7 +653,7 @@ The L2 norm of the document's vector representation. | `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `vocab` | `Vocab` | The store of lexical types. | | `tensor` 2 | `ndarray` | Container for dense vector representations. | -| `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | +| `cats` 2 | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. | | `user_data` | - | A generic storage area, for user custom data. | | `lang` 2.1 | int | Language of the document's vocabulary. | | `lang_` 2.1 | unicode | Language of the document's vocabulary. | diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 1ef6f0362..443913311 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -7,12 +7,10 @@ source: spacy/gold.pyx ## GoldParse.\_\_init\_\_ {#init tag="method"} -Create a `GoldParse`. Unlike annotations in `entities`, label annotations in -`cats` can overlap, i.e. a single word can be covered by multiple labelled -spans. The [`TextCategorizer`](/api/textcategorizer) component expects true -examples of a label to have the value `1.0`, and negative examples of a label to -have the value `0.0`. Labels not in the dictionary are treated as missing – the -gradient for those labels will be zero. +Create a `GoldParse`. The [`TextCategorizer`](/api/textcategorizer) component +expects true examples of a label to have the value `1.0`, and negative examples +of a label to have the value `0.0`. Labels not in the dictionary are treated as +missing – the gradient for those labels will be zero. | Name | Type | Description | | ----------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -22,8 +20,8 @@ gradient for those labels will be zero. | `heads` | iterable | A sequence of integers, representing syntactic head offsets. | | `deps` | iterable | A sequence of strings, representing the syntactic relation types. | | `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | -| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). 
| -| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). | +| `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). | +| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | | **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} @@ -53,7 +51,7 @@ Whether the provided syntactic annotations form a projective dependency tree. | `ner` | list | The named entity annotations as BILUO tags. | | `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | | `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. | -| `cats` 2 | list | Entries in the list should be either a label, or a `(start, end, label)` triple. The tuple form is used for categories applied to spans of the document. | +| `cats` 2 | dict | Keys in the dictionary are string category labels with values `1.0` or `0.0`. | | `links` 2.2 | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. | ## Utilities {#util} From 5b5528ff2edb8aad2c133a1a2473a279a27e8b8a Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 27 Apr 2020 20:02:09 +0000 Subject: [PATCH 063/131] Add `!=3.4.*` to python_requires (#5344) Missed in 80d554f2e2813aea41b0889b39d8f30f648af1ad --- .github/contributors/michael-k.md | 106 ++++++++++++++++++++++++++++++ setup.cfg | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/michael-k.md diff --git a/.github/contributors/michael-k.md b/.github/contributors/michael-k.md new file mode 100644 index 000000000..4ecc5be85 --- /dev/null +++ b/.github/contributors/michael-k.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Michael Käufl | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-04-23 | +| GitHub username | michael-k | +| Website (optional) | | diff --git a/setup.cfg b/setup.cfg index 465367ff6..722adc0e2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ zip_safe = false include_package_data = true scripts = bin/spacy -python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* +python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.* setup_requires = wheel cython>=0.25 From bc39f97e11a150b77f54b36b0e862aee2555380e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 28 Apr 2020 13:37:37 +0200 Subject: [PATCH 064/131] Simplify warnings --- spacy/__init__.py | 4 +- spacy/_ml.py | 7 ++-- spacy/analysis.py | 6 ++- spacy/cli/init_model.py | 4 +- spacy/displacy/__init__.py | 8 ++-- spacy/errors.py | 69 +-------------------------------- spacy/gold.pyx | 7 ++-- spacy/kb.pyx | 12 +++--- spacy/language.py | 15 +++---- spacy/lexeme.pyx | 5 ++- spacy/matcher/matcher.pyx | 5 ++- spacy/matcher/phrasematcher.pyx | 12 +++--- spacy/pipeline/pipes.pyx | 5 ++- spacy/tests/doc/test_doc_api.py | 3 +- spacy/tests/doc/test_span.py | 3 +- spacy/tokenizer.pyx | 7 ++-- spacy/tokens/doc.pyx | 14 +++---- spacy/tokens/span.pyx | 10 ++--- spacy/tokens/token.pyx | 7 ++-- spacy/util.py | 4 +- 20 files changed, 76 insertions(+), 131 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 4a0d16a49..6aa7b7c16 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -13,7 +13,7 @@ from . import pipeline from .cli.info import info as cli_info from .glossary import explain from .about import __version__ -from .errors import Errors, Warnings, deprecation_warning +from .errors import Errors, Warnings from . import util from .util import registry from .language import component @@ -26,7 +26,7 @@ if sys.maxunicode == 65535: def load(name, **overrides): depr_path = overrides.get("path") if depr_path not in (True, False, None): - deprecation_warning(Warnings.W001.format(path=depr_path)) + warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning) return util.load_model(name, **overrides) diff --git a/spacy/_ml.py b/spacy/_ml.py index 2a758accc..5cccabac1 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import numpy +import warnings from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu from thinc.t2t import ExtractWindow, ParametricAttention from thinc.t2v import Pooling, sum_pool, mean_pool @@ -22,7 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed import thinc.extra.load_nlp from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE -from .errors import Errors, user_warning, Warnings +from .errors import Errors, Warnings from . import util from . import ml as new_ml from .ml import _legacy_tok2vec @@ -283,7 +284,7 @@ def link_vectors_to_models(vocab): if vectors.name is None: vectors.name = VECTORS_KEY if vectors.data.size != 0: - user_warning(Warnings.W020.format(shape=vectors.data.shape)) + warnings.warn(Warnings.W020.format(shape=vectors.data.shape)) ops = Model.ops for word in vocab: if word.orth in vectors.key2row: @@ -299,7 +300,7 @@ def link_vectors_to_models(vocab): # This is a hack to avoid the problem in #3853. 
old_name = vectors.name new_name = vectors.name + "_%d" % data.shape[0] - user_warning(Warnings.W019.format(old=old_name, new=new_name)) + warnings.warn(Warnings.W019.format(old=old_name, new=new_name)) vectors.name = new_name key = (ops.device, vectors.name) thinc.extra.load_nlp.VECTORS[key] = data diff --git a/spacy/analysis.py b/spacy/analysis.py index 761be3de9..960ce6c0f 100644 --- a/spacy/analysis.py +++ b/spacy/analysis.py @@ -1,11 +1,13 @@ # coding: utf8 from __future__ import unicode_literals +import warnings + from collections import OrderedDict from wasabi import Printer from .tokens import Doc, Token, Span -from .errors import Errors, Warnings, user_warning +from .errors import Errors, Warnings def analyze_pipes(pipeline, name, pipe, index, warn=True): @@ -34,7 +36,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True): if not fulfilled: problems.append(annot) if warn: - user_warning(Warnings.W025.format(name=name, attr=annot)) + warnings.warn(Warnings.W025.format(name=name, attr=annot)) return problems diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 0bdd4000e..32d2d974e 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -15,7 +15,7 @@ import srsly from wasabi import msg from ..vectors import Vectors -from ..errors import Errors, Warnings, user_warning +from ..errors import Errors, Warnings from ..util import ensure_path, get_lang_class, OOV_RANK try: @@ -246,7 +246,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): def read_clusters(clusters_loc): clusters = {} if ftfy is None: - user_warning(Warnings.W004) + warnings.warn(Warnings.W004) with clusters_loc.open() as f: for line in tqdm(f): try: diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 922d80e57..8a6ec2f53 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -10,7 +10,7 @@ from __future__ import unicode_literals from .render import DependencyRenderer, EntityRenderer from ..tokens import Doc, Span from ..compat import b_to_str -from ..errors import Errors, Warnings, user_warning +from ..errors import Errors, Warnings from ..util import is_in_jupyter @@ -89,7 +89,7 @@ def serve( from wsgiref import simple_server if is_in_jupyter(): - user_warning(Warnings.W011) + warnings.warn(Warnings.W011) render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server(host, port, app) @@ -119,7 +119,7 @@ def parse_deps(orig_doc, options={}): """ doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"])) if not doc.is_parsed: - user_warning(Warnings.W005) + warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): with doc.retokenize() as retokenizer: for np in list(doc.noun_chunks): @@ -184,7 +184,7 @@ def parse_ents(doc, options={}): for ent in doc.ents ] if not ents: - user_warning(Warnings.W006) + warnings.warn(Warnings.W006) title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None settings = get_doc_settings(doc) return {"text": doc.text, "ents": ents, "title": title, "settings": settings} diff --git a/spacy/errors.py b/spacy/errors.py index e52241be1..664c0a2fc 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,11 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import os -import warnings -import inspect - - def add_codes(err_cls): """Add error codes to string messages via class attribute names.""" @@ -93,8 +88,7 @@ class Warnings(object): W022 = ("Training a new part-of-speech 
tagger using a model with no " "lemmatization rules or data. This means that the trained model " "may not be able to lemmatize correctly. If this is intentional " - "or the language you're using doesn't have lemmatization data, " - "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. " + "or the language you're using doesn't have lemmatization data. " "If this is surprising, make sure you have the spacy-lookups-data " "package installed.") W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " @@ -593,64 +587,3 @@ class MatchPatternError(ValueError): class AlignmentError(ValueError): pass - - -class ModelsWarning(UserWarning): - pass - - -WARNINGS = { - "user": UserWarning, - "deprecation": DeprecationWarning, - "models": ModelsWarning, -} - - -def _get_warn_types(arg): - if arg == "": # don't show any warnings - return [] - if not arg or arg == "all": # show all available warnings - return WARNINGS.keys() - return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS] - - -def _get_warn_excl(arg): - if not arg: - return [] - return [w_id.strip() for w_id in arg.split(",")] - - -SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER") -SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES")) -SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE")) - - -def user_warning(message): - _warn(message, "user") - - -def deprecation_warning(message): - _warn(message, "deprecation") - - -def models_warning(message): - _warn(message, "models") - - -def _warn(message, warn_type="user"): - """ - message (unicode): The message to display. - category (Warning): The Warning to show. - """ - if message.startswith("["): - w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string - else: - w_id = None - ignore_warning = w_id and w_id in SPACY_WARNING_IGNORE - if warn_type in SPACY_WARNING_TYPES and not ignore_warning: - category = WARNINGS[warn_type] - stack = inspect.stack()[-1] - with warnings.catch_warnings(): - if SPACY_WARNING_FILTER: - warnings.simplefilter(SPACY_WARNING_FILTER, category) - warnings.warn_explicit(message, category, stack[1], stack[2]) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 8b61de683..e8274563f 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -10,10 +10,11 @@ import shutil import itertools from pathlib import Path import srsly +import warnings from .syntax import nonproj from .tokens import Doc, Span -from .errors import Errors, AlignmentError, user_warning, Warnings +from .errors import Errors, AlignmentError, Warnings from .compat import path2str from . 
import util from .util import minibatch, itershuffle @@ -508,7 +509,7 @@ def _json_iterate(loc): py_raw = file_.read() cdef long file_length = len(py_raw) if file_length > 2 ** 30: - user_warning(Warnings.W027.format(size=file_length)) + warnings.warn(Warnings.W027.format(size=file_length)) raw = py_raw cdef int square_depth = 0 @@ -690,7 +691,7 @@ cdef class GoldParse: else: words_offset -= 1 if len(entities) != len(words): - user_warning(Warnings.W029.format(text=doc.text)) + warnings.warn(Warnings.W029.format(text=doc.text)) entities = ["-" for _ in words] # These are filled by the tagger/parser/entity recogniser diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 63eb41b42..36a6dbd93 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,7 +1,9 @@ # cython: infer_types=True # cython: profile=True # coding: utf8 -from spacy.errors import Errors, Warnings, user_warning +import warnings + +from spacy.errors import Errors, Warnings from pathlib import Path from cymem.cymem cimport Pool @@ -115,7 +117,7 @@ cdef class KnowledgeBase: # Return if this entity was added before if entity_hash in self._entry_index: - user_warning(Warnings.W018.format(entity=entity)) + warnings.warn(Warnings.W018.format(entity=entity)) return # Raise an error if the provided entity vector is not of the correct length @@ -147,7 +149,7 @@ cdef class KnowledgeBase: # only process this entity if its unique ID hadn't been added before entity_hash = self.vocab.strings.add(entity_list[i]) if entity_hash in self._entry_index: - user_warning(Warnings.W018.format(entity=entity_list[i])) + warnings.warn(Warnings.W018.format(entity=entity_list[i])) else: entity_vector = vector_list[i] @@ -195,7 +197,7 @@ cdef class KnowledgeBase: # Check whether this alias was added before if alias_hash in self._alias_index: - user_warning(Warnings.W017.format(alias=alias)) + warnings.warn(Warnings.W017.format(alias=alias)) return cdef vector[int64_t] entry_indices @@ -252,7 +254,7 @@ cdef class KnowledgeBase: if is_present: if not ignore_warnings: - user_warning(Warnings.W024.format(entity=entity, alias=alias)) + warnings.warn(Warnings.W024.format(entity=entity, alias=alias)) else: entry_indices.push_back(int(entry_index)) alias_entry.entry_indices = entry_indices diff --git a/spacy/language.py b/spacy/language.py index f5eff2ae9..e89f80f08 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -3,6 +3,7 @@ from __future__ import absolute_import, unicode_literals import random import itertools +import warnings from thinc.extra import load_nlp @@ -34,7 +35,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop -from .errors import Errors, Warnings, deprecation_warning, user_warning +from .errors import Errors, Warnings from . import util from . 
import about @@ -758,10 +759,10 @@ class Language(object): DOCS: https://spacy.io/api/language#pipe """ if is_python2 and n_process != 1: - user_warning(Warnings.W023) + warnings.warn(Warnings.W023) n_process = 1 if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) if n_process == -1: n_process = mp.cpu_count() if as_tuples: @@ -896,7 +897,7 @@ class Language(object): DOCS: https://spacy.io/api/language#to_disk """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable path = util.ensure_path(path) serializers = OrderedDict() @@ -929,7 +930,7 @@ class Language(object): DOCS: https://spacy.io/api/language#from_disk """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable path = util.ensure_path(path) deserializers = OrderedDict() @@ -964,7 +965,7 @@ class Language(object): DOCS: https://spacy.io/api/language#to_bytes """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable serializers = OrderedDict() serializers["vocab"] = lambda: self.vocab.to_bytes() @@ -989,7 +990,7 @@ class Language(object): DOCS: https://spacy.io/api/language#from_bytes """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable deserializers = OrderedDict() deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 21644e37b..a081ffe42 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -9,6 +9,7 @@ cimport numpy as np np.import_array() import numpy +import warnings from thinc.neural.util import get_array_module from libc.stdint cimport UINT64_MAX @@ -19,7 +20,7 @@ from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT from .attrs cimport IS_CURRENCY, IS_OOV, PROB from .attrs import intify_attrs -from .errors import Errors, Warnings, user_warning +from .errors import Errors, Warnings OOV_RANK = UINT64_MAX @@ -130,7 +131,7 @@ cdef class Lexeme: if self.c.orth == other[0].orth: return 1.0 if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj="Lexeme")) + warnings.warn(Warnings.W008.format(obj="Lexeme")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 9e0fe2812..7f3c3488f 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -9,6 +9,7 @@ from murmurhash.mrmr cimport hash64 import re import srsly +import warnings from ..typedefs cimport attr_t from ..structs cimport TokenC @@ -20,7 +21,7 @@ from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA from ._schemas import TOKEN_PATTERN_SCHEMA from ..util import get_json_validator, validate_json -from ..errors import Errors, MatchPatternError, Warnings, deprecation_warning +from ..errors import Errors, MatchPatternError, Warnings from ..strings import get_string_id from ..attrs import IDS @@ -195,7 +196,7 @@ cdef class Matcher: YIELDS (Doc): Documents, in order. 
""" if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) if as_tuples: for doc, context in docs: diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 4de5782f9..b66ec35b8 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -6,13 +6,15 @@ from libc.stdint cimport uintptr_t from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter +import warnings + from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA from ..structs cimport TokenC from ..tokens.token cimport Token from ..typedefs cimport attr_t from ._schemas import TOKEN_PATTERN_SCHEMA -from ..errors import Errors, Warnings, deprecation_warning, user_warning +from ..errors import Errors, Warnings cdef class PhraseMatcher: @@ -39,7 +41,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#init """ if max_length != 0: - deprecation_warning(Warnings.W010) + warnings.warn(Warnings.W010, DeprecationWarning) self.vocab = vocab self._callbacks = {} self._docs = {} @@ -195,7 +197,7 @@ cdef class PhraseMatcher: if self._validate and (doc.is_tagged or doc.is_parsed) \ and self.attr not in (DEP, POS, TAG, LEMMA): string_attr = self.vocab.strings[self.attr] - user_warning(Warnings.W012.format(key=key, attr=string_attr)) + warnings.warn(Warnings.W012.format(key=key, attr=string_attr)) keyword = self._convert_to_array(doc) else: keyword = doc @@ -204,7 +206,7 @@ cdef class PhraseMatcher: current_node = self.c_map for token in keyword: if token == self._terminal_hash: - user_warning(Warnings.W021) + warnings.warn(Warnings.W021) break result = map_get(current_node, token) if not result: @@ -306,7 +308,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#pipe """ if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) if as_tuples: for doc, context in stream: matches = self(doc) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f2a86d56e..982c058b4 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -6,6 +6,7 @@ from __future__ import unicode_literals import numpy import srsly import random +import warnings from collections import OrderedDict from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax @@ -32,7 +33,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier from .._ml import build_bow_text_classifier, build_nel_encoder from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss -from ..errors import Errors, TempErrors, user_warning, Warnings +from ..errors import Errors, TempErrors, Warnings from .. 
import util @@ -514,7 +515,7 @@ class Tagger(Pipe): **kwargs): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): - user_warning(Warnings.W022) + warnings.warn(Warnings.W022) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for raw_text, annots_brackets in get_gold_tuples(): diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 19d908529..6801d7844 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -6,7 +6,6 @@ import pytest import numpy from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from spacy.errors import ModelsWarning from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP from ..util import get_doc @@ -216,7 +215,7 @@ def test_doc_api_similarity_match(): assert doc.similarity(doc[0]) == 1.0 assert doc.similarity(doc.vocab["a"]) == 1.0 doc2 = Doc(doc.vocab, words=["a", "b", "c"]) - with pytest.warns(ModelsWarning): + with pytest.warns(UserWarning): assert doc.similarity(doc2[:1]) == 1.0 assert doc.similarity(doc2) == 0.0 diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 917f22e9c..e76ca4697 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -5,7 +5,6 @@ import pytest from spacy.attrs import ORTH, LENGTH from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from spacy.errors import ModelsWarning from spacy.util import filter_spans from ..util import get_doc @@ -124,7 +123,7 @@ def test_span_similarity_match(): doc = Doc(Vocab(), words=["a", "b", "a", "b"]) span1 = doc[:2] span2 = doc[2:] - with pytest.warns(ModelsWarning): + with pytest.warns(UserWarning): assert span1.similarity(span2) == 1.0 assert span1.similarity(doc) == 0.0 assert span1[:1].similarity(doc.vocab["a"]) == 1.0 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 62b8bbf4a..69d6285e1 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -11,6 +11,7 @@ cimport cython from collections import OrderedDict import re +import warnings from .tokens.doc cimport Doc from .strings cimport hash_string @@ -18,7 +19,7 @@ from .compat import unescape_unicode, basestring_ from .attrs import intify_attrs from .symbols import ORTH -from .errors import Errors, Warnings, deprecation_warning +from .errors import Errors, Warnings from . 
import util @@ -115,7 +116,7 @@ cdef class Tokenizer: return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): - deprecation_warning(Warnings.W002) + warnings.warn(Warnings.W002, DeprecationWarning) return Doc(self.vocab, words=strings) @cython.boundscheck(False) @@ -181,7 +182,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#pipe """ if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) for text in texts: yield self(text) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f27115e6f..867c2bf6b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -16,6 +16,7 @@ import numpy.linalg import struct import srsly from thinc.neural.util import get_array_module, copy_array +import warnings from .span cimport Span from .token cimport Token @@ -29,7 +30,6 @@ from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..attrs import intify_attrs, IDS from ..util import normalize_slice from ..compat import is_config, copy_reg, pickle, basestring_ -from ..errors import deprecation_warning, models_warning, user_warning from ..errors import Errors, Warnings from .. import util from .underscore import Underscore, get_ext_args @@ -396,9 +396,9 @@ cdef class Doc: if similar: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj="Doc")) + warnings.warn(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj="Doc")) + warnings.warn(Warnings.W008.format(obj="Doc")) return 0.0 vector = self.vector xp = get_array_module(vector) @@ -787,7 +787,7 @@ cdef class Doc: attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] if array.dtype != numpy.uint64: - user_warning(Warnings.W028.format(type=array.dtype)) + warnings.warn(Warnings.W028.format(type=array.dtype)) if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) @@ -1040,10 +1040,10 @@ cdef class Doc: indices did not fall at token boundaries. """ cdef unicode tag, lemma, ent_type - deprecation_warning(Warnings.W013.format(obj="Doc")) + warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning) # TODO: ENT_KB_ID ? if len(args) == 3: - deprecation_warning(Warnings.W003) + warnings.warn(Warnings.W003, DeprecationWarning) tag, lemma, ent_type = args attributes[TAG] = tag attributes[LEMMA] = lemma @@ -1183,7 +1183,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: while not heads_within_sents: heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) if loop_count > 10: - user_warning(Warnings.W026) + warnings.warn(Warnings.W026) break loop_count += 1 # Set sentence starts diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 9269700b0..347916a0a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -6,6 +6,7 @@ from libc.math cimport sqrt import numpy import numpy.linalg +import warnings from thinc.neural.util import get_array_module from collections import defaultdict @@ -21,8 +22,7 @@ from ..symbols cimport dep from ..util import normalize_slice from ..compat import is_config, basestring_ -from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning -from ..errors import deprecation_warning +from ..errors import Errors, TempErrors, Warnings from .underscore import Underscore, get_ext_args @@ -292,7 +292,7 @@ cdef class Span: attributes are inherited from the syntactic root token of the span. 
RETURNS (Token): The newly merged token. """ - deprecation_warning(Warnings.W013.format(obj="Span")) + warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning) return self.doc.merge(self.start_char, self.end_char, *args, **attributes) @@ -333,9 +333,9 @@ cdef class Span: if similar: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj="Span")) + warnings.warn(Warnings.W007.format(obj="Span")) if self.vector_norm == 0.0 or other.vector_norm == 0.0: - user_warning(Warnings.W008.format(obj="Span")) + warnings.warn(Warnings.W008.format(obj="Span")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8019e3b4f..efd9aa10b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -10,6 +10,7 @@ cimport numpy as np np.import_array() import numpy +import warnings from thinc.neural.util import get_array_module from ..typedefs cimport hash_t @@ -24,7 +25,7 @@ from ..symbols cimport conj from .. import parts_of_speech from .. import util from ..compat import is_config -from ..errors import Errors, Warnings, user_warning, models_warning +from ..errors import Errors, Warnings from .underscore import Underscore, get_ext_args from .morphanalysis cimport MorphAnalysis @@ -211,9 +212,9 @@ cdef class Token: if self.c.lex.orth == other.orth: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj="Token")) + warnings.warn(Warnings.W007.format(obj="Token")) if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj="Token")) + warnings.warn(Warnings.W008.format(obj="Token")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/util.py b/spacy/util.py index a5e27a210..7f2e0058f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -30,7 +30,7 @@ except ImportError: from .symbols import ORTH from .compat import cupy, CudaStream, path2str, basestring_, unicode_ from .compat import import_file -from .errors import Errors, Warnings, deprecation_warning +from .errors import Errors, Warnings _data_path = Path(__file__).parent / "data" @@ -749,7 +749,7 @@ def get_serialization_exclude(serializers, exclude, kwargs): options = [name.split(".")[0] for name in serializers] for key, value in kwargs.items(): if key in ("vocab",) and value is False: - deprecation_warning(Warnings.W015.format(arg=key)) + warnings.warn(Warnings.W015.format(arg=key), DeprecationWarning) exclude.append(key) elif key.split(".")[0] in options: raise ValueError(Errors.E128.format(arg=key)) From 3a045572ed1608daa90dc92229c2da0524fa7f20 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 28 Apr 2020 13:48:37 +0200 Subject: [PATCH 065/131] Add missing import --- spacy/displacy/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 8a6ec2f53..a0cccbbde 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -7,6 +7,8 @@ USAGE: https://spacy.io/usage/visualizers """ from __future__ import unicode_literals +import warnings + from .render import DependencyRenderer, EntityRenderer from ..tokens import Doc, Span from ..compat import b_to_str From ac40a8f7a53a29865707a4732e35c8675f1b1abb Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 28 Apr 2020 14:00:11 +0200 Subject: [PATCH 066/131] Add missing import --- spacy/cli/init_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 
32d2d974e..2e0aeb239 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -12,6 +12,7 @@ import tarfile import gzip import zipfile import srsly +import warnings from wasabi import msg from ..vectors import Vectors From d5f18f83077487011f794444bbdf873b3bca7271 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 28 Apr 2020 14:01:29 +0200 Subject: [PATCH 067/131] Add missing import --- spacy/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/util.py b/spacy/util.py index 7f2e0058f..609c0b572 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -16,6 +16,7 @@ import numpy import srsly import catalogue import sys +import warnings try: import jsonschema From a27c4014f557814854bd0324e0355603de29b8b3 Mon Sep 17 00:00:00 2001 From: Louis Guitton Date: Wed, 29 Apr 2020 10:18:03 +0200 Subject: [PATCH 068/131] Add mlflow to spaCy universe (#5352) * Add mlflow to universe * Use mlflow black logo --- .github/contributors/louisguitton.md | 106 +++++++++++++++++++++++++++ website/meta/universe.json | 35 +++++++++ 2 files changed, 141 insertions(+) create mode 100644 .github/contributors/louisguitton.md diff --git a/.github/contributors/louisguitton.md b/.github/contributors/louisguitton.md new file mode 100644 index 000000000..8c5f30df6 --- /dev/null +++ b/.github/contributors/louisguitton.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Louis Guitton |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2020-04-25 |
+| GitHub username | louisguitton |
+| Website (optional) | https://guitton.co/ |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 8da96a026..bd3191492 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2117,6 +2117,41 @@
       "github": "thomasthiebaud"
     },
     "category": ["pipeline"]
+  },
+  {
+    "id": "mlflow",
+    "title": "MLflow",
+    "slogan": "An open source platform for the machine learning lifecycle",
+    "description": "MLflow is an open source platform to manage the ML lifecycle, including experimentation, reproducibility, deployment, and a central model registry. MLflow currently offers four components: Tracking, Projects, Models and Registry.",
+    "github": "mlflow/mlflow",
+    "pip": "mlflow",
+    "thumb": "https://www.mlflow.org/docs/latest/_static/MLflow-logo-final-black.png",
+    "image": "",
+    "url": "https://mlflow.org/",
+    "author": "Databricks",
+    "author_links": {
+      "github": "databricks",
+      "twitter": "databricks",
+      "website": "https://databricks.com/"
+    },
+    "category": ["standalone", "apis"],
+    "code_example": [
+      "import mlflow",
+      "import mlflow.spacy",
+      "",
+      "# MLflow Tracking",
+      "nlp = spacy.load('my_best_model_path/output/model-best')",
+      "with mlflow.start_run(run_name='Spacy'):",
+      "    mlflow.set_tag('model_flavor', 'spacy')",
+      "    mlflow.spacy.log_model(spacy_model=nlp, artifact_path='model')",
+      "    mlflow.log_metric('accuracy', 0.72)",
+      "    my_run_id = mlflow.active_run().info.run_id",
+      "",
+      "",
+      "# MLflow Models",
+      "model_uri = f'runs:/{my_run_id}/model'",
+      "nlp2 = mlflow.spacy.load_model(model_uri=model_uri)"
+    ]
   }
 ],

From 90ce34db42bedac8fa6b3d614d8bc568b883d6da Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 29 Apr 2020 12:51:12 +0200
Subject: [PATCH 069/131] Add cuda101 and cuda102 options to setup (#5377)

* Add cuda101 and cuda102 options to setup

* Update cudaNNN options in docs
---
 setup.cfg                   | 4 ++++
 website/docs/usage/index.md | 8 ++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 722adc0e2..3e0acd12f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -72,6 +72,10 @@ cuda92 =
     cupy-cuda92>=5.0.0b4,<9.0.0
 cuda100 =
     cupy-cuda100>=5.0.0b4,<9.0.0
+cuda101 =
+    cupy-cuda101>=5.0.0b4,<9.0.0
+cuda102 =
+    cupy-cuda102>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
     fugashi>=0.1.3
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 17fd8fa7b..d0172104b 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -122,10 +122,10 @@ support, we've been grateful to use the work of Chainer's
 interface for GPU arrays.

 spaCy can be installed on GPU by specifying `spacy[cuda]`, `spacy[cuda90]`,
-`spacy[cuda91]`, `spacy[cuda92]` or `spacy[cuda100]`. If you know your cuda
-version, using the more explicit specifier allows cupy to be installed via
-wheel, saving some compilation time. The specifiers should install
-[`cupy`](https://cupy.chainer.org).
+`spacy[cuda91]`, `spacy[cuda92]`, `spacy[cuda100]`, `spacy[cuda101]` or
+`spacy[cuda102]`. If you know your cuda version, using the more explicit
+specifier allows cupy to be installed via wheel, saving some compilation time.
+The specifiers should install [`cupy`](https://cupy.chainer.org).
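A minimal sketch of enabling GPU use once one of these extras is installed. It relies on spaCy's standard `spacy.prefer_gpu` helper; the model name below is only an assumption for illustration:

```python
import spacy

# Ask spaCy to allocate data on the GPU if one is available; returns a bool
# and falls back to the CPU otherwise.
gpu_active = spacy.prefer_gpu()
print("GPU active:", gpu_active)

nlp = spacy.load("en_core_web_sm")  # assumes this trained model is installed
doc = nlp("spaCy can run this pipeline on the GPU.")
```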
```bash $ pip install -U spacy[cuda92] From 732629b0dd8ab4db2b5446aa246ebe65f30ae2c2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Apr 2020 12:51:37 +0200 Subject: [PATCH 070/131] Update website/meta/universe.json --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 6c9fc0340..139f1e8e8 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1623,7 +1623,7 @@ "id": "pic2phrase_bot", "title": "pic2phrase_bot: Photo Description Generator", "slogan": "A bot that generates descriptions to submitted photos, in a human-like manner.", - "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy." + "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy.", "thumb": "https://i.imgur.com/ggVI02O.jpg", "image": "https://i.imgur.com/z1yhWQR.jpg", "url": "https://telegram.me/pic2phrase_bot", From 1cbb272a6b468f1704f00f00d126104eb4ddec12 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Apr 2020 12:51:44 +0200 Subject: [PATCH 071/131] Update website/meta/universe.json --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 139f1e8e8..8c8274700 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1631,7 +1631,7 @@ "author_links": { "twitter": "VasilievYuli", }, - "category": ["standalone", "research"] + "category": ["standalone", "conversational"] }, { "id": "gracyql", From a6e521cd7919ed16b6bcc089aadbac8b5d160fd1 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 12:53:16 +0200 Subject: [PATCH 072/131] Add is_sent_end token property (#5375) Reconstruction of the original PR #4697 by @MiniLau. Removes unused `SENT_END` symbol and `IS_SENT_END` from `Matcher` schema because the Matcher is only going to be able to support `IS_SENT_START`. --- .github/contributors/MiniLau.md | 106 +++++++++++++++++++++++ spacy/attrs.pxd | 1 + spacy/attrs.pyx | 1 + spacy/errors.py | 2 + spacy/structs.pxd | 2 +- spacy/symbols.pxd | 2 +- spacy/tests/doc/test_token_api.py | 14 +++ spacy/tests/pipeline/test_sentencizer.py | 17 +++- spacy/tokens/token.pyx | 22 +++++ website/docs/api/token.md | 17 +++- 10 files changed, 177 insertions(+), 7 deletions(-) create mode 100644 .github/contributors/MiniLau.md diff --git a/.github/contributors/MiniLau.md b/.github/contributors/MiniLau.md new file mode 100644 index 000000000..14d6fe328 --- /dev/null +++ b/.github/contributors/MiniLau.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Desausoi Laurent | +| Company name (if applicable) | / | +| Title or role (if applicable) | / | +| Date | 22 November 2019 | +| GitHub username | MiniLau | +| Website (optional) | / | diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 4638fcb82..8f583b3a3 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -94,3 +94,4 @@ cdef enum attr_id_t: ENT_ID = symbols.ENT_ID IDX + SENT_END \ No newline at end of file diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index f14cd6ddc..2187f3c65 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -88,6 +88,7 @@ IDS = { "ENT_KB_ID": ENT_KB_ID, "HEAD": HEAD, "SENT_START": SENT_START, + "SENT_END": SENT_END, "SPACY": SPACY, "PROB": PROB, "LANG": LANG, diff --git a/spacy/errors.py b/spacy/errors.py index e52241be1..6191570ee 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -559,6 +559,8 @@ class Errors(object): "({curr_dim}).") E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.") E195 = ("Matcher can be called on {good} only, got {got}.") + E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can " + "only be fixed with token.is_sent_start.") @add_codes diff --git a/spacy/structs.pxd b/spacy/structs.pxd index b3878db3f..b8e63a725 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -84,7 +84,7 @@ cdef struct TokenC: cdef struct MorphAnalysisC: univ_pos_t pos int length - + attr_t abbr attr_t adp_type attr_t adv_type diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index b24891fdd..9229c9970 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -464,4 +464,4 @@ cdef enum symbol_t: ENT_KB_ID ENT_ID - IDX \ No newline at end of file + IDX diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 8c749b26d..1c2253dfa 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -181,6 +181,14 @@ def test_is_sent_start(en_tokenizer): doc.is_parsed = True assert len(list(doc.sents)) == 2 +def test_is_sent_end(en_tokenizer): + doc = en_tokenizer("This is a sentence. 
This is another.") + assert doc[4].is_sent_end is None + doc[5].is_sent_start = True + assert doc[4].is_sent_end is True + doc.is_parsed = True + assert len(list(doc.sents)) == 2 + def test_set_pos(): doc = Doc(Vocab(), words=["hello", "world"]) @@ -205,6 +213,12 @@ def test_token0_has_sent_start_true(): assert doc[1].is_sent_start is None assert not doc.is_sentenced +def test_tokenlast_has_sent_end_true(): + doc = Doc(Vocab(), words=["hello", "world"]) + assert doc[0].is_sent_end is None + assert doc[1].is_sent_end is True + assert not doc.is_sentenced + def test_token_api_conjuncts_chain(en_vocab): words = "The boy and the girl and the man went .".split() diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index d690958cc..7e58b3e98 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -14,7 +14,9 @@ def test_sentencizer(en_vocab): doc = sentencizer(doc) assert doc.is_sentenced sent_starts = [t.is_sent_start for t in doc] + sent_ends = [t.is_sent_end for t in doc] assert sent_starts == [True, False, True, False, False, False, False] + assert sent_ends == [False, True, False, False, False, False, True] assert len(list(doc.sents)) == 2 @@ -46,13 +48,14 @@ def test_sentencizer_empty_docs(): @pytest.mark.parametrize( - "words,sent_starts,n_sents", + "words,sent_starts,sent_ends,n_sents", [ # The expected result here is that the duplicate punctuation gets merged # onto the same sentence and no one-token sentence is created for them. ( ["Hello", "!", ".", "Test", ".", ".", "ok"], [True, False, False, True, False, False, True], + [False, False, True, False, False, True, True], 3, ), # We also want to make sure ¡ and ¿ aren't treated as sentence end @@ -60,32 +63,36 @@ def test_sentencizer_empty_docs(): ( ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"], [True, False, False, False, True, False, False, False, False, False], + [False, False, False, True, False, False, False, False, False, True], 2, ), # The Token.is_punct check ensures that quotes are handled as well ( ['"', "Nice", "!", '"', "I", "am", "happy", "."], [True, False, False, False, True, False, False, False], + [False, False, False, True, False, False, False, True], 2, ), ], ) -def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents): +def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer() doc = sentencizer(doc) assert doc.is_sentenced assert [t.is_sent_start for t in doc] == sent_starts + assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents @pytest.mark.parametrize( - "punct_chars,words,sent_starts,n_sents", + "punct_chars,words,sent_starts,sent_ends,n_sents", [ ( ["~", "?"], ["Hello", "world", "~", "A", ".", "B", "."], [True, False, False, True, False, False, False], + [False, False, True, False, False, False, True], 2, ), # Even thought it's not common, the punct_chars should be able to @@ -94,16 +101,18 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents): [".", "ö"], ["Hello", ".", "Test", "ö", "Ok", "."], [True, False, True, False, True, False], + [False, True, False, True, False, True], 3, ), ], ) -def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents): +def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=punct_chars) doc = 
sentencizer(doc)
     assert doc.is_sentenced
     assert [t.is_sent_start for t in doc] == sent_starts
+    assert [t.is_sent_end for t in doc] == sent_ends
     assert len(list(doc.sents)) == n_sents


diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 8019e3b4f..194f16c5a 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -493,6 +493,28 @@ cdef class Token:
         else:
             raise ValueError(Errors.E044.format(value=value))

+    property is_sent_end:
+        """A boolean value indicating whether the token ends a sentence.
+        `None` if unknown. Defaults to `True` for the last token in the `Doc`.
+
+        RETURNS (bool / None): Whether the token ends a sentence.
+            None if unknown.
+
+        DOCS: https://spacy.io/api/token#is_sent_end
+        """
+        def __get__(self):
+            if self.i + 1 == len(self.doc):
+                return True
+            elif self.doc[self.i+1].is_sent_start is None:
+                return None
+            elif self.doc[self.i+1].is_sent_start is True:
+                return True
+            else:
+                return False
+
+        def __set__(self, value):
+            raise ValueError(Errors.E196)
+
     @property
     def lefts(self):
         """The leftward immediate children of the word, in the syntactic
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index c30c01c20..7280ac796 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -352,7 +352,22 @@ property to `0` for the first word of the document.
 > assert doc[4].is_sent_start == True
 > ```

-
+## Token.is_sent_end {#is_sent_end tag="property" new="2"}
+
+A boolean value indicating whether the token ends a sentence. `None` if
+unknown. Defaults to `True` for the last token in the `Doc`.
+
+> #### Example
+>
+> ```python
+> doc = nlp("Give it back! He pleaded.")
+> assert doc[3].is_sent_end
+> assert not doc[4].is_sent_end
+> ```
+
+| Name        | Type | Description                           |
+| ----------- | ---- | ------------------------------------- |
+| **RETURNS** | bool | Whether the token ends a sentence. 
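
The property is intentionally read-only: writing to it raises `E196`, since
boundaries can only be fixed through `is_sent_start`. A minimal sketch of the
intended semantics, assuming a spaCy build that includes this patch; the
`Doc`/`Vocab` construction mirrors the tests above:

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

# No boundaries are set yet, so only the last token is known to end a sentence.
doc = Doc(Vocab(), words=["Hello", "world", ".", "Bye", "."])
assert doc[0].is_sent_end is None  # the next token's is_sent_start is unknown
assert doc[4].is_sent_end is True  # the last token always ends a sentence

# is_sent_end is derived from the following token's is_sent_start.
doc[3].is_sent_start = True        # mark "Bye" as a sentence start
assert doc[2].is_sent_end is True  # "." now ends the first sentence

# Writing to is_sent_end is refused with error E196.
try:
    doc[2].is_sent_end = False
except ValueError:
    pass  # fix sentence boundaries via is_sent_start instead
```
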
| ## Token.has_vector {#has_vector tag="property" model="vectors"} From f67343295de38be3f88360f009e99de7eb2e199c Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 29 Apr 2020 12:53:53 +0200 Subject: [PATCH 073/131] Update NEL examples and documentation (#5370) * simplify creation of KB by skipping dim reduction * small fixes to train EL example script * add KB creation and NEL training example scripts to example section * update descriptions of example scripts in the documentation * moving wiki_entity_linking folder from bin to projects * remove test for wiki NEL functionality that is being moved --- bin/wiki_entity_linking/README.md | 37 -- bin/wiki_entity_linking/__init__.py | 12 - .../entity_linker_evaluation.py | 204 ------- bin/wiki_entity_linking/kb_creator.py | 161 ----- bin/wiki_entity_linking/train_descriptions.py | 152 ----- bin/wiki_entity_linking/wiki_io.py | 127 ---- bin/wiki_entity_linking/wiki_namespaces.py | 128 ---- .../wikidata_pretrain_kb.py | 179 ------ bin/wiki_entity_linking/wikidata_processor.py | 154 ----- .../wikidata_train_entity_linker.py | 172 ------ .../wikipedia_processor.py | 565 ------------------ .../training/{pretrain_kb.py => create_kb.py} | 43 +- examples/training/train_entity_linker.py | 10 +- spacy/tests/regression/test_issue5314.py | 18 - website/docs/usage/examples.md | 21 + website/docs/usage/linguistic-features.md | 4 +- website/docs/usage/training.md | 22 +- 17 files changed, 50 insertions(+), 1959 deletions(-) delete mode 100644 bin/wiki_entity_linking/README.md delete mode 100644 bin/wiki_entity_linking/__init__.py delete mode 100644 bin/wiki_entity_linking/entity_linker_evaluation.py delete mode 100644 bin/wiki_entity_linking/kb_creator.py delete mode 100644 bin/wiki_entity_linking/train_descriptions.py delete mode 100644 bin/wiki_entity_linking/wiki_io.py delete mode 100644 bin/wiki_entity_linking/wiki_namespaces.py delete mode 100644 bin/wiki_entity_linking/wikidata_pretrain_kb.py delete mode 100644 bin/wiki_entity_linking/wikidata_processor.py delete mode 100644 bin/wiki_entity_linking/wikidata_train_entity_linker.py delete mode 100644 bin/wiki_entity_linking/wikipedia_processor.py rename examples/training/{pretrain_kb.py => create_kb.py} (75%) delete mode 100644 spacy/tests/regression/test_issue5314.py diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md deleted file mode 100644 index 4e4af5c21..000000000 --- a/bin/wiki_entity_linking/README.md +++ /dev/null @@ -1,37 +0,0 @@ -## Entity Linking with Wikipedia and Wikidata - -### Step 1: Create a Knowledge Base (KB) and training data - -Run `wikidata_pretrain_kb.py` -* This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file** - * WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/ - * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language) -* You can set the filtering parameters for KB construction: - * `max_per_alias` (`-a`): (max) number of candidate entities in the KB per alias/synonym - * `min_freq` (`-f`): threshold of number of times an entity should occur in the corpus to be included in the KB - * `min_pair` (`-c`): threshold of number of times an entity+alias combination should occur in the corpus to be included in the KB -* Further parameters to set: - * `descriptions_from_wikipedia` (`-wp`): whether to parse descriptions from Wikipedia (`True`) or 
Wikidata (`False`) - * `entity_vector_length` (`-v`): length of the pre-trained entity description vectors - * `lang` (`-la`): language for which to fetch Wikidata information (as the dump contains all languages) - -Quick testing and rerunning: -* When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything. - * e.g. set `-lt 20000 -lp 2000 -lw 3000 -f 1` -* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed. - - -### Step 2: Train an Entity Linking model - -Run `wikidata_train_entity_linker.py` -* This takes the **KB directory** produced by Step 1, and trains an **Entity Linking model** -* Specify the output directory (`-o`) in which the final, trained model will be saved -* You can set the learning parameters for the EL training: - * `epochs` (`-e`): number of training iterations - * `dropout` (`-p`): dropout rate - * `lr` (`-n`): learning rate - * `l2` (`-r`): L2 regularization -* Specify the number of training and dev testing articles with `train_articles` (`-t`) and `dev_articles` (`-d`) respectively - * If not specified, the full dataset will be processed - this may take a LONG time ! -* Further parameters to set: - * `labels_discard` (`-l`): NER label types to discard during training diff --git a/bin/wiki_entity_linking/__init__.py b/bin/wiki_entity_linking/__init__.py deleted file mode 100644 index de486bbcf..000000000 --- a/bin/wiki_entity_linking/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -TRAINING_DATA_FILE = "gold_entities.jsonl" -KB_FILE = "kb" -KB_MODEL_DIR = "nlp_kb" -OUTPUT_MODEL_DIR = "nlp" - -PRIOR_PROB_PATH = "prior_prob.csv" -ENTITY_DEFS_PATH = "entity_defs.csv" -ENTITY_FREQ_PATH = "entity_freq.csv" -ENTITY_ALIAS_PATH = "entity_alias.csv" -ENTITY_DESCR_PATH = "entity_descriptions.csv" - -LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' diff --git a/bin/wiki_entity_linking/entity_linker_evaluation.py b/bin/wiki_entity_linking/entity_linker_evaluation.py deleted file mode 100644 index 2aeffbfc2..000000000 --- a/bin/wiki_entity_linking/entity_linker_evaluation.py +++ /dev/null @@ -1,204 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import logging -import random -from tqdm import tqdm -from collections import defaultdict - -logger = logging.getLogger(__name__) - - -class Metrics(object): - true_pos = 0 - false_pos = 0 - false_neg = 0 - - def update_results(self, true_entity, candidate): - candidate_is_correct = true_entity == candidate - - # Assume that we have no labeled negatives in the data (i.e. cases where true_entity is "NIL") - # Therefore, if candidate_is_correct then we have a true positive and never a true negative. - self.true_pos += candidate_is_correct - self.false_neg += not candidate_is_correct - if candidate and candidate not in {"", "NIL"}: - # A wrong prediction (e.g. Q42 != Q3) counts both as a FP as well as a FN. 
- self.false_pos += not candidate_is_correct - - def calculate_precision(self): - if self.true_pos == 0: - return 0.0 - else: - return self.true_pos / (self.true_pos + self.false_pos) - - def calculate_recall(self): - if self.true_pos == 0: - return 0.0 - else: - return self.true_pos / (self.true_pos + self.false_neg) - - def calculate_fscore(self): - p = self.calculate_precision() - r = self.calculate_recall() - if p + r == 0: - return 0.0 - else: - return 2 * p * r / (p + r) - - -class EvaluationResults(object): - def __init__(self): - self.metrics = Metrics() - self.metrics_by_label = defaultdict(Metrics) - - def update_metrics(self, ent_label, true_entity, candidate): - self.metrics.update_results(true_entity, candidate) - self.metrics_by_label[ent_label].update_results(true_entity, candidate) - - def report_metrics(self, model_name): - model_str = model_name.title() - recall = self.metrics.calculate_recall() - precision = self.metrics.calculate_precision() - fscore = self.metrics.calculate_fscore() - return ( - "{}: ".format(model_str) - + "F-score = {} | ".format(round(fscore, 3)) - + "Recall = {} | ".format(round(recall, 3)) - + "Precision = {} | ".format(round(precision, 3)) - + "F-score by label = {}".format( - {k: v.calculate_fscore() for k, v in sorted(self.metrics_by_label.items())} - ) - ) - - -class BaselineResults(object): - def __init__(self): - self.random = EvaluationResults() - self.prior = EvaluationResults() - self.oracle = EvaluationResults() - - def report_performance(self, model): - results = getattr(self, model) - return results.report_metrics(model) - - def update_baselines( - self, - true_entity, - ent_label, - random_candidate, - prior_candidate, - oracle_candidate, - ): - self.oracle.update_metrics(ent_label, true_entity, oracle_candidate) - self.prior.update_metrics(ent_label, true_entity, prior_candidate) - self.random.update_metrics(ent_label, true_entity, random_candidate) - - -def measure_performance(dev_data, kb, el_pipe, baseline=True, context=True, dev_limit=None): - counts = dict() - baseline_results = BaselineResults() - context_results = EvaluationResults() - combo_results = EvaluationResults() - - for doc, gold in tqdm(dev_data, total=dev_limit, leave=False, desc='Processing dev data'): - if len(doc) > 0: - correct_ents = dict() - for entity, kb_dict in gold.links.items(): - start, end = entity - for gold_kb, value in kb_dict.items(): - if value: - # only evaluating on positive examples - offset = _offset(start, end) - correct_ents[offset] = gold_kb - - if baseline: - _add_baseline(baseline_results, counts, doc, correct_ents, kb) - - if context: - # using only context - el_pipe.cfg["incl_context"] = True - el_pipe.cfg["incl_prior"] = False - _add_eval_result(context_results, doc, correct_ents, el_pipe) - - # measuring combined accuracy (prior + context) - el_pipe.cfg["incl_context"] = True - el_pipe.cfg["incl_prior"] = True - _add_eval_result(combo_results, doc, correct_ents, el_pipe) - - if baseline: - logger.info("Counts: {}".format({k: v for k, v in sorted(counts.items())})) - logger.info(baseline_results.report_performance("random")) - logger.info(baseline_results.report_performance("prior")) - logger.info(baseline_results.report_performance("oracle")) - - if context: - logger.info(context_results.report_metrics("context only")) - logger.info(combo_results.report_metrics("context and prior")) - - -def _add_eval_result(results, doc, correct_ents, el_pipe): - """ - Evaluate the ent.kb_id_ annotations against the gold standard. 
- Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL. - """ - try: - doc = el_pipe(doc) - for ent in doc.ents: - ent_label = ent.label_ - start = ent.start_char - end = ent.end_char - offset = _offset(start, end) - gold_entity = correct_ents.get(offset, None) - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - pred_entity = ent.kb_id_ - results.update_metrics(ent_label, gold_entity, pred_entity) - - except Exception as e: - logging.error("Error assessing accuracy " + str(e)) - - -def _add_baseline(baseline_results, counts, doc, correct_ents, kb): - """ - Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound. - Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL. - """ - for ent in doc.ents: - ent_label = ent.label_ - start = ent.start_char - end = ent.end_char - offset = _offset(start, end) - gold_entity = correct_ents.get(offset, None) - - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - candidates = kb.get_candidates(ent.text) - oracle_candidate = "" - prior_candidate = "" - random_candidate = "" - if candidates: - scores = [] - - for c in candidates: - scores.append(c.prior_prob) - if c.entity_ == gold_entity: - oracle_candidate = c.entity_ - - best_index = scores.index(max(scores)) - prior_candidate = candidates[best_index].entity_ - random_candidate = random.choice(candidates).entity_ - - current_count = counts.get(ent_label, 0) - counts[ent_label] = current_count+1 - - baseline_results.update_baselines( - gold_entity, - ent_label, - random_candidate, - prior_candidate, - oracle_candidate, - ) - - -def _offset(start, end): - return "{}_{}".format(start, end) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py deleted file mode 100644 index 7778fc701..000000000 --- a/bin/wiki_entity_linking/kb_creator.py +++ /dev/null @@ -1,161 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import logging - -from spacy.kb import KnowledgeBase - -from bin.wiki_entity_linking.train_descriptions import EntityEncoder -from bin.wiki_entity_linking import wiki_io as io - - -logger = logging.getLogger(__name__) - - -def create_kb( - nlp, - max_entities_per_alias, - min_entity_freq, - min_occ, - entity_def_path, - entity_descr_path, - entity_alias_path, - entity_freq_path, - prior_prob_path, - entity_vector_length, -): - # Create the knowledge base from Wikidata entries - kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length) - entity_list, filtered_title_to_id = _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length) - _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path) - return kb - - -def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length): - # read the mappings from file - title_to_id = io.read_title_to_id(entity_def_path) - id_to_descr = io.read_id_to_descr(entity_descr_path) - - # check the length of the nlp vectors - if "vectors" in nlp.meta and nlp.vocab.vectors.size: - input_dim = nlp.vocab.vectors_length - logger.info("Loaded pretrained vectors of size %s" % input_dim) - else: - raise ValueError( - "The `nlp` object should have 
access to pretrained word vectors, " - " cf. https://spacy.io/usage/models#languages." - ) - - logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq)) - entity_frequencies = io.read_entity_to_count(entity_freq_path) - # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise - filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities( - title_to_id, - id_to_descr, - entity_frequencies, - min_entity_freq - ) - logger.info("Kept {} entities from the set of {}".format(len(description_list), len(title_to_id.keys()))) - - logger.info("Training entity encoder") - encoder = EntityEncoder(nlp, input_dim, entity_vector_length) - encoder.train(description_list=description_list, to_print=True) - - logger.info("Getting entity embeddings") - embeddings = encoder.apply_encoder(description_list) - - logger.info("Adding {} entities".format(len(entity_list))) - kb.set_entities( - entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings - ) - return entity_list, filtered_title_to_id - - -def _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path): - logger.info("Adding aliases from Wikipedia and Wikidata") - _add_aliases( - kb, - entity_list=entity_list, - title_to_id=filtered_title_to_id, - max_entities_per_alias=max_entities_per_alias, - min_occ=min_occ, - prior_prob_path=prior_prob_path, - ) - - -def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies, - min_entity_freq: int = 10): - filtered_title_to_id = dict() - entity_list = [] - description_list = [] - frequency_list = [] - for title, entity in title_to_id.items(): - freq = entity_frequencies.get(title, 0) - desc = id_to_descr.get(entity, None) - if desc and freq > min_entity_freq: - entity_list.append(entity) - description_list.append(desc) - frequency_list.append(freq) - filtered_title_to_id[title] = entity - return filtered_title_to_id, entity_list, description_list, frequency_list - - -def _add_aliases(kb, entity_list, title_to_id, max_entities_per_alias, min_occ, prior_prob_path): - wp_titles = title_to_id.keys() - - # adding aliases with prior probabilities - # we can read this file sequentially, it's sorted by alias, and then by count - logger.info("Adding WP aliases") - with prior_prob_path.open("r", encoding="utf8") as prior_file: - # skip header - prior_file.readline() - line = prior_file.readline() - previous_alias = None - total_count = 0 - counts = [] - entities = [] - while line: - splits = line.replace("\n", "").split(sep="|") - new_alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - if new_alias != previous_alias and previous_alias: - # done reading the previous alias --> output - if len(entities) > 0: - selected_entities = [] - prior_probs = [] - for ent_count, ent_string in zip(counts, entities): - if ent_string in wp_titles: - wd_id = title_to_id[ent_string] - p_entity_givenalias = ent_count / total_count - selected_entities.append(wd_id) - prior_probs.append(p_entity_givenalias) - - if selected_entities: - try: - kb.add_alias( - alias=previous_alias, - entities=selected_entities, - probabilities=prior_probs, - ) - except ValueError as e: - logger.error(e) - total_count = 0 - counts = [] - entities = [] - - total_count += count - - if len(entities) < max_entities_per_alias and count >= min_occ: - counts.append(count) - entities.append(entity) - previous_alias = new_alias - - line = prior_file.readline() 
- - -def read_kb(nlp, kb_file): - kb = KnowledgeBase(vocab=nlp.vocab) - kb.load_bulk(kb_file) - return kb diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py deleted file mode 100644 index af08d6b8f..000000000 --- a/bin/wiki_entity_linking/train_descriptions.py +++ /dev/null @@ -1,152 +0,0 @@ -# coding: utf-8 -from random import shuffle - -import logging -import numpy as np - -from spacy._ml import zero_init, create_default_optimizer -from spacy.cli.pretrain import get_cossim_loss - -from thinc.v2v import Model -from thinc.api import chain -from thinc.neural._classes.affine import Affine - -logger = logging.getLogger(__name__) - - -class EntityEncoder: - """ - Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D). - This entity vector will be stored in the KB, for further downstream use in the entity model. - """ - - DROP = 0 - BATCH_SIZE = 1000 - - # Set min. acceptable loss to avoid a 'mean of empty slice' warning by numpy - MIN_LOSS = 0.01 - - # Reasonable default to stop training when things are not improving - MAX_NO_IMPROVEMENT = 20 - - def __init__(self, nlp, input_dim, desc_width, epochs=5): - self.nlp = nlp - self.input_dim = input_dim - self.desc_width = desc_width - self.epochs = epochs - - def apply_encoder(self, description_list): - if self.encoder is None: - raise ValueError("Can not apply encoder before training it") - - batch_size = 100000 - - start = 0 - stop = min(batch_size, len(description_list)) - encodings = [] - - while start < len(description_list): - docs = list(self.nlp.pipe(description_list[start:stop])) - doc_embeddings = [self._get_doc_embedding(doc) for doc in docs] - enc = self.encoder(np.asarray(doc_embeddings)) - encodings.extend(enc.tolist()) - - start = start + batch_size - stop = min(stop + batch_size, len(description_list)) - logger.info("Encoded: {} entities".format(stop)) - - return encodings - - def train(self, description_list, to_print=False): - processed, loss = self._train_model(description_list) - if to_print: - logger.info( - "Trained entity descriptions on {} ".format(processed) + - "(non-unique) descriptions across {} ".format(self.epochs) + - "epochs" - ) - logger.info("Final loss: {}".format(loss)) - - def _train_model(self, description_list): - best_loss = 1.0 - iter_since_best = 0 - self._build_network(self.input_dim, self.desc_width) - - processed = 0 - loss = 1 - # copy this list so that shuffling does not affect other functions - descriptions = description_list.copy() - to_continue = True - - for i in range(self.epochs): - shuffle(descriptions) - - batch_nr = 0 - start = 0 - stop = min(self.BATCH_SIZE, len(descriptions)) - - while to_continue and start < len(descriptions): - batch = [] - for descr in descriptions[start:stop]: - doc = self.nlp(descr) - doc_vector = self._get_doc_embedding(doc) - batch.append(doc_vector) - - loss = self._update(batch) - if batch_nr % 25 == 0: - logger.info("loss: {} ".format(loss)) - processed += len(batch) - - # in general, continue training if we haven't reached our ideal min yet - to_continue = loss > self.MIN_LOSS - - # store the best loss and track how long it's been - if loss < best_loss: - best_loss = loss - iter_since_best = 0 - else: - iter_since_best += 1 - - # stop learning if we haven't seen improvement since the last few iterations - if iter_since_best > self.MAX_NO_IMPROVEMENT: - to_continue = False - - batch_nr += 1 - start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, 
len(descriptions)) - - return processed, loss - - @staticmethod - def _get_doc_embedding(doc): - indices = np.zeros((len(doc),), dtype="i") - for i, word in enumerate(doc): - if word.orth in doc.vocab.vectors.key2row: - indices[i] = doc.vocab.vectors.key2row[word.orth] - else: - indices[i] = 0 - word_vectors = doc.vocab.vectors.data[indices] - doc_vector = np.mean(word_vectors, axis=0) - return doc_vector - - def _build_network(self, orig_width, hidden_with): - with Model.define_operators({">>": chain}): - # very simple encoder-decoder model - self.encoder = Affine(hidden_with, orig_width) - self.model = self.encoder >> zero_init( - Affine(orig_width, hidden_with, drop_factor=0.0) - ) - self.sgd = create_default_optimizer(self.model.ops) - - def _update(self, vectors): - predictions, bp_model = self.model.begin_update( - np.asarray(vectors), drop=self.DROP - ) - loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors)) - bp_model(d_scores, sgd=self.sgd) - return loss / len(vectors) - - @staticmethod - def _get_loss(golds, scores): - loss, gradients = get_cossim_loss(scores, golds) - return loss, gradients diff --git a/bin/wiki_entity_linking/wiki_io.py b/bin/wiki_entity_linking/wiki_io.py deleted file mode 100644 index 43ae87f0f..000000000 --- a/bin/wiki_entity_linking/wiki_io.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import sys -import csv - -# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/ -csv.field_size_limit(min(sys.maxsize, 2147483646)) - -""" This class provides reading/writing methods for temp files """ - - -# Entity definition: WP title -> WD ID # -def write_title_to_id(entity_def_output, title_to_id): - with entity_def_output.open("w", encoding="utf8") as id_file: - id_file.write("WP_title" + "|" + "WD_id" + "\n") - for title, qid in title_to_id.items(): - id_file.write(title + "|" + str(qid) + "\n") - - -def read_title_to_id(entity_def_output): - title_to_id = dict() - with entity_def_output.open("r", encoding="utf8") as id_file: - csvreader = csv.reader(id_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - title_to_id[row[0]] = row[1] - return title_to_id - - -# Entity aliases from WD: WD ID -> WD alias # -def write_id_to_alias(entity_alias_path, id_to_alias): - with entity_alias_path.open("w", encoding="utf8") as alias_file: - alias_file.write("WD_id" + "|" + "alias" + "\n") - for qid, alias_list in id_to_alias.items(): - for alias in alias_list: - alias_file.write(str(qid) + "|" + alias + "\n") - - -def read_id_to_alias(entity_alias_path): - id_to_alias = dict() - with entity_alias_path.open("r", encoding="utf8") as alias_file: - csvreader = csv.reader(alias_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - qid = row[0] - alias = row[1] - alias_list = id_to_alias.get(qid, []) - alias_list.append(alias) - id_to_alias[qid] = alias_list - return id_to_alias - - -def read_alias_to_id_generator(entity_alias_path): - """ Read (aliases, qid) tuples """ - - with entity_alias_path.open("r", encoding="utf8") as alias_file: - csvreader = csv.reader(alias_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - qid = row[0] - alias = row[1] - yield alias, qid - - -# Entity descriptions from WD: WD ID -> WD alias # -def write_id_to_descr(entity_descr_output, id_to_descr): - with entity_descr_output.open("w", encoding="utf8") as descr_file: - descr_file.write("WD_id" + "|" + "description" 
+ "\n") - for qid, descr in id_to_descr.items(): - descr_file.write(str(qid) + "|" + descr + "\n") - - -def read_id_to_descr(entity_desc_path): - id_to_desc = dict() - with entity_desc_path.open("r", encoding="utf8") as descr_file: - csvreader = csv.reader(descr_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - id_to_desc[row[0]] = row[1] - return id_to_desc - - -# Entity counts from WP: WP title -> count # -def write_entity_to_count(prior_prob_input, count_output): - # Write entity counts for quick access later - entity_to_count = dict() - total_count = 0 - - with prior_prob_input.open("r", encoding="utf8") as prior_file: - # skip header - prior_file.readline() - line = prior_file.readline() - - while line: - splits = line.replace("\n", "").split(sep="|") - # alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - current_count = entity_to_count.get(entity, 0) - entity_to_count[entity] = current_count + count - - total_count += count - - line = prior_file.readline() - - with count_output.open("w", encoding="utf8") as entity_file: - entity_file.write("entity" + "|" + "count" + "\n") - for entity, count in entity_to_count.items(): - entity_file.write(entity + "|" + str(count) + "\n") - - -def read_entity_to_count(count_input): - entity_to_count = dict() - with count_input.open("r", encoding="utf8") as csvfile: - csvreader = csv.reader(csvfile, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - entity_to_count[row[0]] = int(row[1]) - - return entity_to_count diff --git a/bin/wiki_entity_linking/wiki_namespaces.py b/bin/wiki_entity_linking/wiki_namespaces.py deleted file mode 100644 index e8f099ccd..000000000 --- a/bin/wiki_entity_linking/wiki_namespaces.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# List of meta pages in Wikidata, should be kept out of the Knowledge base -WD_META_ITEMS = [ - "Q163875", - "Q191780", - "Q224414", - "Q4167836", - "Q4167410", - "Q4663903", - "Q11266439", - "Q13406463", - "Q15407973", - "Q18616576", - "Q19887878", - "Q22808320", - "Q23894233", - "Q33120876", - "Q42104522", - "Q47460393", - "Q64875536", - "Q66480449", -] - - -# TODO: add more cases from non-English WP's - -# List of prefixes that refer to Wikipedia "file" pages -WP_FILE_NAMESPACE = ["Bestand", "File"] - -# List of prefixes that refer to Wikipedia "category" pages -WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"] - -# List of prefixes that refer to Wikipedia "meta" pages -# these will/should be matched ignoring case -WP_META_NAMESPACE = ( - WP_FILE_NAMESPACE - + WP_CATEGORY_NAMESPACE - + [ - "b", - "betawikiversity", - "Book", - "c", - "Commons", - "d", - "dbdump", - "download", - "Draft", - "Education", - "Foundation", - "Gadget", - "Gadget definition", - "Gebruiker", - "gerrit", - "Help", - "Image", - "Incubator", - "m", - "mail", - "mailarchive", - "media", - "MediaWiki", - "MediaWiki talk", - "Mediawikiwiki", - "MediaZilla", - "Meta", - "Metawikipedia", - "Module", - "mw", - "n", - "nost", - "oldwikisource", - "otrs", - "OTRSwiki", - "Overleg gebruiker", - "outreach", - "outreachwiki", - "Portal", - "phab", - "Phabricator", - "Project", - "q", - "quality", - "rev", - "s", - "spcom", - "Special", - "species", - "Strategy", - "sulutil", - "svn", - "Talk", - "Template", - "Template talk", - "Testwiki", - "ticket", - "TimedText", - "Toollabs", - "tools", - "tswiki", - "User", - "User talk", - "v", - "voy", - "w", - "Wikibooks", - "Wikidata", - "wikiHow", - "Wikinvest", - 
"wikilivres", - "Wikimedia", - "Wikinews", - "Wikipedia", - "Wikipedia talk", - "Wikiquote", - "Wikisource", - "Wikispecies", - "Wikitech", - "Wikiversity", - "Wikivoyage", - "wikt", - "wiktionary", - "wmf", - "wmania", - "WP", - ] -) diff --git a/bin/wiki_entity_linking/wikidata_pretrain_kb.py b/bin/wiki_entity_linking/wikidata_pretrain_kb.py deleted file mode 100644 index 003074feb..000000000 --- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py +++ /dev/null @@ -1,179 +0,0 @@ -# coding: utf-8 -"""Script to process Wikipedia and Wikidata dumps and create a knowledge base (KB) -with specific parameters. Intermediate files are written to disk. - -Running the full pipeline on a standard laptop, may take up to 13 hours of processing. -Use the -p, -d and -s options to speed up processing using the intermediate files -from a previous run. - -For the Wikidata dump: get the latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ -For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2 -from https://dumps.wikimedia.org/enwiki/latest/ - -""" -from __future__ import unicode_literals - -import logging -from pathlib import Path -import plac - -from bin.wiki_entity_linking import wikipedia_processor as wp, wikidata_processor as wd -from bin.wiki_entity_linking import wiki_io as io -from bin.wiki_entity_linking import kb_creator -from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_FILE, ENTITY_DESCR_PATH, KB_MODEL_DIR, LOG_FORMAT -from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH, ENTITY_ALIAS_PATH -import spacy -from bin.wiki_entity_linking.kb_creator import read_kb - -logger = logging.getLogger(__name__) - - -@plac.annotations( - wd_json=("Path to the downloaded WikiData JSON dump.", "positional", None, Path), - wp_xml=("Path to the downloaded Wikipedia XML dump.", "positional", None, Path), - output_dir=("Output directory", "positional", None, Path), - model=("Model name or path, should include pretrained vectors.", "positional", None, str), - max_per_alias=("Max. # entities per alias (default 10)", "option", "a", int), - min_freq=("Min. count of an entity in the corpus (default 20)", "option", "f", int), - min_pair=("Min. count of entity-alias pairs (default 5)", "option", "c", int), - entity_vector_length=("Length of entity vectors (default 64)", "option", "v", int), - loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path), - loc_entity_defs=("Location to file with entity definitions", "option", "d", Path), - loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path), - descr_from_wp=("Flag for using descriptions from WP instead of WD (default False)", "flag", "wp"), - limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int), - limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int), - limit_wd=("Threshold to limit lines read from WD", "option", "lw", int), - lang=("Optional language for which to get Wikidata titles. 
Defaults to 'en'", "option", "la", str), -) -def main( - wd_json, - wp_xml, - output_dir, - model, - max_per_alias=10, - min_freq=20, - min_pair=5, - entity_vector_length=64, - loc_prior_prob=None, - loc_entity_defs=None, - loc_entity_alias=None, - loc_entity_desc=None, - descr_from_wp=False, - limit_prior=None, - limit_train=None, - limit_wd=None, - lang="en", -): - entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH - entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH - entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH - entity_freq_path = output_dir / ENTITY_FREQ_PATH - prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH - training_entities_path = output_dir / TRAINING_DATA_FILE - kb_path = output_dir / KB_FILE - - logger.info("Creating KB with Wikipedia and WikiData") - - # STEP 0: set up IO - if not output_dir.exists(): - output_dir.mkdir(parents=True) - - # STEP 1: Load the NLP object - logger.info("STEP 1: Loading NLP model {}".format(model)) - nlp = spacy.load(model) - - # check the length of the nlp vectors - if "vectors" not in nlp.meta or not nlp.vocab.vectors.size: - raise ValueError( - "The `nlp` object should have access to pretrained word vectors, " - " cf. https://spacy.io/usage/models#languages." - ) - - # STEP 2: create prior probabilities from WP - if not prior_prob_path.exists(): - # It takes about 2h to process 1000M lines of Wikipedia XML dump - logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path)) - if limit_prior is not None: - logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior)) - wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior) - else: - logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path)) - - # STEP 3: calculate entity frequencies - if not entity_freq_path.exists(): - logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path)) - io.write_entity_to_count(prior_prob_path, entity_freq_path) - else: - logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path)) - - # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file - if (not entity_defs_path.exists()) or (not descr_from_wp and not entity_descr_path.exists()): - # It takes about 10h to process 55M lines of Wikidata JSON dump - logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path)) - if limit_wd is not None: - logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd)) - title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json( - wd_json, - limit_wd, - to_print=False, - lang=lang, - parse_descr=(not descr_from_wp), - ) - io.write_title_to_id(entity_defs_path, title_to_id) - - logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path)) - io.write_id_to_alias(entity_alias_path, id_to_alias) - - if not descr_from_wp: - logger.info("STEP 4c: Writing Wikidata entity descriptions to {}".format(entity_descr_path)) - io.write_id_to_descr(entity_descr_path, id_to_descr) - else: - logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path)) - logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path)) - if not descr_from_wp: - logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path)) - - # STEP 5: 
Getting gold entities from Wikipedia - if (not training_entities_path.exists()) or (descr_from_wp and not entity_descr_path.exists()): - logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path)) - if limit_train is not None: - logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train)) - wp.create_training_and_desc(wp_xml, entity_defs_path, entity_descr_path, - training_entities_path, descr_from_wp, limit_train) - if descr_from_wp: - logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path)) - else: - logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path)) - if descr_from_wp: - logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path)) - - # STEP 6: creating the actual KB - # It takes ca. 30 minutes to pretrain the entity embeddings - if not kb_path.exists(): - logger.info("STEP 6: Creating the KB at {}".format(kb_path)) - kb = kb_creator.create_kb( - nlp=nlp, - max_entities_per_alias=max_per_alias, - min_entity_freq=min_freq, - min_occ=min_pair, - entity_def_path=entity_defs_path, - entity_descr_path=entity_descr_path, - entity_alias_path=entity_alias_path, - entity_freq_path=entity_freq_path, - prior_prob_path=prior_prob_path, - entity_vector_length=entity_vector_length, - ) - kb.dump(kb_path) - logger.info("kb entities: {}".format(kb.get_size_entities())) - logger.info("kb aliases: {}".format(kb.get_size_aliases())) - nlp.to_disk(output_dir / KB_MODEL_DIR) - else: - logger.info("STEP 6: KB already exists at {}".format(kb_path)) - - logger.info("Done!") - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) - plac.call(main) diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py deleted file mode 100644 index 8a070f567..000000000 --- a/bin/wiki_entity_linking/wikidata_processor.py +++ /dev/null @@ -1,154 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import bz2 -import json -import logging - -from bin.wiki_entity_linking.wiki_namespaces import WD_META_ITEMS - -logger = logging.getLogger(__name__) - - -def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descr=True): - # Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines. 
- # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ - - site_filter = '{}wiki'.format(lang) - - # filter: currently defined as OR: one hit suffices to be removed from further processing - exclude_list = WD_META_ITEMS - - # punctuation - exclude_list.extend(["Q1383557", "Q10617810"]) - - # letters etc - exclude_list.extend(["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"]) - - neg_prop_filter = { - 'P31': exclude_list, # instance of - 'P279': exclude_list # subclass - } - - title_to_id = dict() - id_to_descr = dict() - id_to_alias = dict() - - # parse appropriate fields - depending on what we need in the KB - parse_properties = False - parse_sitelinks = True - parse_labels = False - parse_aliases = True - parse_claims = True - - with bz2.open(wikidata_file, mode='rb') as file: - for cnt, line in enumerate(file): - if limit and cnt >= limit: - break - if cnt % 500000 == 0 and cnt > 0: - logger.info("processed {} lines of WikiData JSON dump".format(cnt)) - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - obj = json.loads(clean_line) - entry_type = obj["type"] - - if entry_type == "item": - keep = True - - claims = obj["claims"] - if parse_claims: - for prop, value_set in neg_prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - for cp in claim_property: - cp_id = ( - cp["mainsnak"] - .get("datavalue", {}) - .get("value", {}) - .get("id") - ) - cp_rank = cp["rank"] - if cp_rank != "deprecated" and cp_id in value_set: - keep = False - - if keep: - unique_id = obj["id"] - - if to_print: - print("ID:", unique_id) - print("type:", entry_type) - - # parsing all properties that refer to other entities - if parse_properties: - for prop, claim_property in claims.items(): - cp_dicts = [ - cp["mainsnak"]["datavalue"].get("value") - for cp in claim_property - if cp["mainsnak"].get("datavalue") - ] - cp_values = [ - cp_dict.get("id") - for cp_dict in cp_dicts - if isinstance(cp_dict, dict) - if cp_dict.get("id") is not None - ] - if cp_values: - if to_print: - print("prop:", prop, cp_values) - - found_link = False - if parse_sitelinks: - site_value = obj["sitelinks"].get(site_filter, None) - if site_value: - site = site_value["title"] - if to_print: - print(site_filter, ":", site) - title_to_id[site] = unique_id - found_link = True - - if parse_labels: - labels = obj["labels"] - if labels: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print( - "label (" + lang + "):", lang_label["value"] - ) - - if found_link and parse_descr: - descriptions = obj["descriptions"] - if descriptions: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print( - "description (" + lang + "):", - lang_descr["value"], - ) - id_to_descr[unique_id] = lang_descr["value"] - - if parse_aliases: - aliases = obj["aliases"] - if aliases: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print( - "alias (" + lang + "):", item["value"] - ) - alias_list = id_to_alias.get(unique_id, []) - alias_list.append(item["value"]) - id_to_alias[unique_id] = alias_list - - if to_print: - print() - - # log final number of lines processed - logger.info("Finished. 
Processed {} lines of WikiData JSON dump".format(cnt)) - return title_to_id, id_to_descr, id_to_alias - - diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py deleted file mode 100644 index 54f00fc6f..000000000 --- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py +++ /dev/null @@ -1,172 +0,0 @@ -# coding: utf-8 -"""Script that takes a previously created Knowledge Base and trains an entity linking -pipeline. The provided KB directory should hold the kb, the original nlp object and -its vocab used to create the KB, and a few auxiliary files such as the entity definitions, -as created by the script `wikidata_create_kb`. - -For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2 -from https://dumps.wikimedia.org/enwiki/latest/ -""" -from __future__ import unicode_literals - -import random -import logging -import spacy -from pathlib import Path -import plac -from tqdm import tqdm - -from bin.wiki_entity_linking import wikipedia_processor -from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_MODEL_DIR, KB_FILE, LOG_FORMAT, OUTPUT_MODEL_DIR -from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance -from bin.wiki_entity_linking.kb_creator import read_kb - -from spacy.util import minibatch, compounding - -logger = logging.getLogger(__name__) - - -@plac.annotations( - dir_kb=("Directory with KB, NLP and related files", "positional", None, Path), - output_dir=("Output directory", "option", "o", Path), - loc_training=("Location to training data", "option", "k", Path), - epochs=("Number of training iterations (default 10)", "option", "e", int), - dropout=("Dropout to prevent overfitting (default 0.5)", "option", "p", float), - lr=("Learning rate (default 0.005)", "option", "n", float), - l2=("L2 regularization", "option", "r", float), - train_articles=("# training articles (default 90% of all)", "option", "t", int), - dev_articles=("# dev test articles (default 10% of all)", "option", "d", int), - labels_discard=("NER labels to discard (default None)", "option", "l", str), -) -def main( - dir_kb, - output_dir=None, - loc_training=None, - epochs=10, - dropout=0.5, - lr=0.005, - l2=1e-6, - train_articles=None, - dev_articles=None, - labels_discard=None -): - if not output_dir: - logger.warning("No output dir specified so no results will be written, are you sure about this ?") - - logger.info("Creating Entity Linker with Wikipedia and WikiData") - - output_dir = Path(output_dir) if output_dir else dir_kb - training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE - nlp_dir = dir_kb / KB_MODEL_DIR - kb_path = dir_kb / KB_FILE - nlp_output_dir = output_dir / OUTPUT_MODEL_DIR - - # STEP 0: set up IO - if not output_dir.exists(): - output_dir.mkdir() - - # STEP 1 : load the NLP object - logger.info("STEP 1a: Loading model from {}".format(nlp_dir)) - nlp = spacy.load(nlp_dir) - logger.info("Original NLP pipeline has following pipeline components: {}".format(nlp.pipe_names)) - - # check that there is a NER component in the pipeline - if "ner" not in nlp.pipe_names: - raise ValueError("The `nlp` object should have a pretrained `ner` component.") - - logger.info("STEP 1b: Loading KB from {}".format(kb_path)) - kb = read_kb(nlp, kb_path) - - # STEP 2: read the training dataset previously created from WP - logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path)) - train_indices, dev_indices = 
wikipedia_processor.read_training_indices(training_path) - logger.info("Training set has {} articles, limit set to roughly {} articles per epoch" - .format(len(train_indices), train_articles if train_articles else "all")) - logger.info("Dev set has {} articles, limit set to rougly {} articles for evaluation" - .format(len(dev_indices), dev_articles if dev_articles else "all")) - if dev_articles: - dev_indices = dev_indices[0:dev_articles] - - # STEP 3: create and train an entity linking pipe - logger.info("STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(epochs)) - if labels_discard: - labels_discard = [x.strip() for x in labels_discard.split(",")] - logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard)) - else: - labels_discard = [] - - el_pipe = nlp.create_pipe( - name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name, - "labels_discard": labels_discard} - ) - el_pipe.set_kb(kb) - nlp.add_pipe(el_pipe, last=True) - - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] - with nlp.disable_pipes(*other_pipes): # only train Entity Linking - optimizer = nlp.begin_training() - optimizer.learn_rate = lr - optimizer.L2 = l2 - - logger.info("Dev Baseline Accuracies:") - dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path, - dev=True, line_ids=dev_indices, - kb=kb, labels_discard=labels_discard) - - measure_performance(dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices)) - - for itn in range(epochs): - random.shuffle(train_indices) - losses = {} - batches = minibatch(train_indices, size=compounding(8.0, 128.0, 1.001)) - batchnr = 0 - articles_processed = 0 - - # we either process the whole training file, or just a part each epoch - bar_total = len(train_indices) - if train_articles: - bar_total = train_articles - - with tqdm(total=bar_total, leave=False, desc='Epoch ' + str(itn)) as pbar: - for batch in batches: - if not train_articles or articles_processed < train_articles: - with nlp.disable_pipes("entity_linker"): - train_batch = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path, - dev=False, line_ids=batch, - kb=kb, labels_discard=labels_discard) - docs, golds = zip(*train_batch) - try: - with nlp.disable_pipes(*other_pipes): - nlp.update( - docs=docs, - golds=golds, - sgd=optimizer, - drop=dropout, - losses=losses, - ) - batchnr += 1 - articles_processed += len(docs) - pbar.update(len(docs)) - except Exception as e: - logger.error("Error updating batch:" + str(e)) - if batchnr > 0: - logging.info("Epoch {} trained on {} articles, train loss {}" - .format(itn, articles_processed, round(losses["entity_linker"] / batchnr, 2))) - # re-read the dev_data (data is returned as a generator) - dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path, - dev=True, line_ids=dev_indices, - kb=kb, labels_discard=labels_discard) - measure_performance(dev_data, kb, el_pipe, baseline=False, context=True, dev_limit=len(dev_indices)) - - if output_dir: - # STEP 4: write the NLP pipeline (now including an EL model) to file - logger.info("Final NLP pipeline has following pipeline components: {}".format(nlp.pipe_names)) - logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir)) - nlp.to_disk(nlp_output_dir) - - logger.info("Done!") - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) - plac.call(main) diff --git 
a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py deleted file mode 100644 index 649d48fe5..000000000 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ /dev/null @@ -1,565 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import bz2 -import logging -import random -import json - -from spacy.gold import GoldParse -from bin.wiki_entity_linking import wiki_io as io -from bin.wiki_entity_linking.wiki_namespaces import ( - WP_META_NAMESPACE, - WP_FILE_NAMESPACE, - WP_CATEGORY_NAMESPACE, -) - -""" -Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions. -Write these results to file for downstream KB and training data generation. - -Process Wikipedia interlinks to generate a training dataset for the EL algorithm. -""" - -ENTITY_FILE = "gold_entities.csv" - -map_alias_to_link = dict() - -logger = logging.getLogger(__name__) - -title_regex = re.compile(r"(?<=).*(?=)") -id_regex = re.compile(r"(?<=)\d*(?=)") -text_tag_regex = re.compile(r"(?<=)") -text_regex = re.compile(r"(?<=).*(?= 0: - logger.info("processed {} lines of Wikipedia XML dump".format(cnt)) - clean_line = line.strip().decode("utf-8") - - # we attempt at reading the article's ID (but not the revision or contributor ID) - if "" in clean_line or "" in clean_line: - read_id = False - if "" in clean_line: - read_id = True - - if read_id: - ids = id_regex.search(clean_line) - if ids: - current_article_id = ids[0] - - # only processing prior probabilities from true training (non-dev) articles - if not is_dev(current_article_id): - aliases, entities, normalizations = get_wp_links(clean_line) - for alias, entity, norm in zip(aliases, entities, normalizations): - _store_alias( - alias, entity, normalize_alias=norm, normalize_entity=True - ) - - line = file.readline() - cnt += 1 - logger.info("processed {} lines of Wikipedia XML dump".format(cnt)) - logger.info("Finished. 
processed {} lines of Wikipedia XML dump".format(cnt)) - - # write all aliases and their entities and count occurrences to file - with prior_prob_output.open("w", encoding="utf8") as outputfile: - outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") - for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): - s_dict = sorted(alias_dict.items(), key=lambda x: x[1], reverse=True) - for entity, count in s_dict: - outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") - - -def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): - alias = alias.strip() - entity = entity.strip() - - # remove everything after # as this is not part of the title but refers to a specific paragraph - if normalize_entity: - # wikipedia titles are always capitalized - entity = _capitalize_first(entity.split("#")[0]) - if normalize_alias: - alias = alias.split("#")[0] - - if alias and entity: - alias_dict = map_alias_to_link.get(alias, dict()) - entity_count = alias_dict.get(entity, 0) - alias_dict[entity] = entity_count + 1 - map_alias_to_link[alias] = alias_dict - - -def get_wp_links(text): - aliases = [] - entities = [] - normalizations = [] - - matches = link_regex.findall(text) - for match in matches: - match = match[2:][:-2].replace("_", " ").strip() - - if ns_regex.match(match): - pass # ignore the entity if it points to a "meta" page - - # this is a simple [[link]], with the alias the same as the mention - elif "|" not in match: - aliases.append(match) - entities.append(match) - normalizations.append(True) - - # in wiki format, the link is written as [[entity|alias]] - else: - splits = match.split("|") - entity = splits[0].strip() - alias = splits[1].strip() - # specific wiki format [[alias (specification)|]] - if len(alias) == 0 and "(" in entity: - alias = entity.split("(")[0] - aliases.append(alias) - entities.append(entity) - normalizations.append(False) - else: - aliases.append(alias) - entities.append(entity) - normalizations.append(False) - - return aliases, entities, normalizations - - -def _capitalize_first(text): - if not text: - return None - result = text[0].capitalize() - if len(result) > 0: - result += text[1:] - return result - - -def create_training_and_desc( - wp_input, def_input, desc_output, training_output, parse_desc, limit=None -): - wp_to_id = io.read_title_to_id(def_input) - _process_wikipedia_texts( - wp_input, wp_to_id, desc_output, training_output, parse_desc, limit - ) - - -def _process_wikipedia_texts( - wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None -): - """ - Read the XML wikipedia data to parse out training data: - raw text data + positive instances - """ - - read_ids = set() - - with output.open("a", encoding="utf8") as descr_file, training_output.open( - "w", encoding="utf8" - ) as entity_file: - if parse_descriptions: - _write_training_description(descr_file, "WD_id", "description") - with bz2.open(wikipedia_input, mode="rb") as file: - article_count = 0 - article_text = "" - article_title = None - article_id = None - reading_text = False - reading_revision = False - - for line in file: - clean_line = line.strip().decode("utf-8") - - if clean_line == "": - reading_revision = True - elif clean_line == "": - reading_revision = False - - # Start reading new page - if clean_line == "": - article_text = "" - article_title = None - article_id = None - # finished reading this page - elif clean_line == "": - if article_id: - clean_text, entities = _process_wp_text( - article_title, 
article_text, wp_to_id - ) - if clean_text is not None and entities is not None: - _write_training_entities( - entity_file, article_id, clean_text, entities - ) - - if article_title in wp_to_id and parse_descriptions: - description = " ".join( - clean_text[:1000].split(" ")[:-1] - ) - _write_training_description( - descr_file, wp_to_id[article_title], description - ) - article_count += 1 - if article_count % 10000 == 0 and article_count > 0: - logger.info( - "Processed {} articles".format(article_count) - ) - if limit and article_count >= limit: - break - article_text = "" - article_title = None - article_id = None - reading_text = False - reading_revision = False - - # start reading text within a page - if "") - clean_text = clean_text.replace(r""", '"') - clean_text = clean_text.replace(r"&nbsp;", " ") - clean_text = clean_text.replace(r"&", "&") - - # remove multiple spaces - while " " in clean_text: - clean_text = clean_text.replace(" ", " ") - - return clean_text.strip() - - -def _remove_links(clean_text, wp_to_id): - # read the text char by char to get the right offsets for the interwiki links - entities = [] - final_text = "" - open_read = 0 - reading_text = True - reading_entity = False - reading_mention = False - reading_special_case = False - entity_buffer = "" - mention_buffer = "" - for index, letter in enumerate(clean_text): - if letter == "[": - open_read += 1 - elif letter == "]": - open_read -= 1 - elif letter == "|": - if reading_text: - final_text += letter - # switch from reading entity to mention in the [[entity|mention]] pattern - elif reading_entity: - reading_text = False - reading_entity = False - reading_mention = True - else: - reading_special_case = True - else: - if reading_entity: - entity_buffer += letter - elif reading_mention: - mention_buffer += letter - elif reading_text: - final_text += letter - else: - raise ValueError("Not sure at point", clean_text[index - 2 : index + 2]) - - if open_read > 2: - reading_special_case = True - - if open_read == 2 and reading_text: - reading_text = False - reading_entity = True - reading_mention = False - - # we just finished reading an entity - if open_read == 0 and not reading_text: - if "#" in entity_buffer or entity_buffer.startswith(":"): - reading_special_case = True - # Ignore cases with nested structures like File: handles etc - if not reading_special_case: - if not mention_buffer: - mention_buffer = entity_buffer - start = len(final_text) - end = start + len(mention_buffer) - qid = wp_to_id.get(entity_buffer, None) - if qid: - entities.append((mention_buffer, qid, start, end)) - final_text += mention_buffer - - entity_buffer = "" - mention_buffer = "" - - reading_text = True - reading_entity = False - reading_mention = False - reading_special_case = False - return final_text, entities - - -def _write_training_description(outputfile, qid, description): - if description is not None: - line = str(qid) + "|" + description + "\n" - outputfile.write(line) - - -def _write_training_entities(outputfile, article_id, clean_text, entities): - entities_data = [ - {"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]} - for ent in entities - ] - line = ( - json.dumps( - { - "article_id": article_id, - "clean_text": clean_text, - "entities": entities_data, - }, - ensure_ascii=False, - ) - + "\n" - ) - outputfile.write(line) - - -def read_training_indices(entity_file_path): - """ This method creates two lists of indices into the training file: one with indices for the - training examples, and one for the dev 
examples.""" - train_indices = [] - dev_indices = [] - - with entity_file_path.open("r", encoding="utf8") as file: - for i, line in enumerate(file): - example = json.loads(line) - article_id = example["article_id"] - clean_text = example["clean_text"] - - if is_valid_article(clean_text): - if is_dev(article_id): - dev_indices.append(i) - else: - train_indices.append(i) - - return train_indices, dev_indices - - -def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=None): - """ This method provides training/dev examples that correspond to the entity annotations found by the nlp object. - For training, it will include both positive and negative examples by using the candidate generator from the kb. - For testing (kb=None), it will include all positive examples only.""" - if not labels_discard: - labels_discard = [] - - max_index = max(line_ids) - - with entity_file_path.open("r", encoding="utf8") as _file: - line = _file.readline() - i = 0 - while line and i < max_index: - if i in line_ids: - example = json.loads(line) - article_id = example["article_id"] - clean_text = example["clean_text"] - entities = example["entities"] - - if dev != is_dev(article_id) or not is_valid_article(clean_text): - continue - - doc = nlp(clean_text) - gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard) - if gold and len(gold.links) > 0: - yield doc, gold - i += 1 - line = _file.readline() - - -def _get_gold_parse(doc, entities, dev, kb, labels_discard): - gold_entities = {} - tagged_ent_positions = { - (ent.start_char, ent.end_char): ent - for ent in doc.ents - if ent.label_ not in labels_discard - } - - for entity in entities: - entity_id = entity["entity"] - alias = entity["alias"] - start = entity["start"] - end = entity["end"] - - candidate_ids = [] - if kb and not dev: - candidates = kb.get_candidates(alias) - candidate_ids = [cand.entity_ for cand in candidates] - - tagged_ent = tagged_ent_positions.get((start, end), None) - if tagged_ent: - # TODO: check that alias == doc.text[start:end] - should_add_ent = (dev or entity_id in candidate_ids) and is_valid_sentence( - tagged_ent.sent.text - ) - - if should_add_ent: - value_by_id = {entity_id: 1.0} - if not dev: - random.shuffle(candidate_ids) - value_by_id.update( - {kb_id: 0.0 for kb_id in candidate_ids if kb_id != entity_id} - ) - gold_entities[(start, end)] = value_by_id - - return GoldParse(doc, links=gold_entities) - - -def is_dev(article_id): - if not article_id: - return False - return article_id.endswith("3") - - -def is_valid_article(doc_text): - # custom length cut-off - return 10 < len(doc_text) < 30000 - - -def is_valid_sentence(sent_text): - if not 10 < len(sent_text) < 3000: - # custom length cut-off - return False - - if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"): - # remove 'enumeration' sentences (occurs often on Wikipedia) - return False - - return True diff --git a/examples/training/pretrain_kb.py b/examples/training/create_kb.py similarity index 75% rename from examples/training/pretrain_kb.py rename to examples/training/create_kb.py index 54c68f653..cbdb5c05b 100644 --- a/examples/training/pretrain_kb.py +++ b/examples/training/create_kb.py @@ -1,15 +1,15 @@ #!/usr/bin/env python # coding: utf8 -"""Example of defining and (pre)training spaCy's knowledge base, +"""Example of defining a knowledge base in spaCy, which is needed to implement entity linking functionality. 
For more details, see the documentation: * Knowledge base: https://spacy.io/api/kb * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking -Compatible with: spaCy v2.2.3 -Last tested with: v2.2.3 +Compatible with: spaCy v2.2.4 +Last tested with: v2.2.4 """ from __future__ import unicode_literals, print_function @@ -20,24 +20,18 @@ from spacy.vocab import Vocab import spacy from spacy.kb import KnowledgeBase -from bin.wiki_entity_linking.train_descriptions import EntityEncoder - # Q2146908 (Russ Cochran): American golfer # Q7381115 (Russ Cochran): publisher ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)} -INPUT_DIM = 300 # dimension of pretrained input vectors -DESC_WIDTH = 64 # dimension of output entity vectors - @plac.annotations( model=("Model name, should have pretrained word embeddings", "positional", None, str), output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), ) -def main(model=None, output_dir=None, n_iter=50): - """Load the model, create the KB and pretrain the entity encodings. +def main(model=None, output_dir=None): + """Load the model and create the KB with pre-defined entity encodings. If an output_dir is provided, the KB will be stored there in a file 'kb'. The updated vocab will also be written to a directory in the output_dir.""" @@ -51,33 +45,23 @@ def main(model=None, output_dir=None, n_iter=50): " cf. https://spacy.io/usage/models#languages." ) - kb = KnowledgeBase(vocab=nlp.vocab) + # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality. + # For simplicity, we'll just use the original vector dimension here instead. + vectors_dim = nlp.vocab.vectors.shape[1] + kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim) # set up the data entity_ids = [] - descriptions = [] + descr_embeddings = [] freqs = [] for key, value in ENTITIES.items(): desc, freq = value entity_ids.append(key) - descriptions.append(desc) + descr_embeddings.append(nlp(desc).vector) freqs.append(freq) - # training entity description encodings - # this part can easily be replaced with a custom entity encoder - encoder = EntityEncoder( - nlp=nlp, - input_dim=INPUT_DIM, - desc_width=DESC_WIDTH, - epochs=n_iter, - ) - encoder.train(description_list=descriptions, to_print=True) - - # get the pretrained entity vectors - embeddings = encoder.apply_encoder(descriptions) - # set the entities, can also be done by calling `kb.add_entity` for each entity - kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings) + kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings) # adding aliases, the entities need to be defined in the KB beforehand kb.add_alias( @@ -113,8 +97,8 @@ def main(model=None, output_dir=None, n_iter=50): vocab2 = Vocab().from_disk(vocab_path) kb2 = KnowledgeBase(vocab=vocab2) kb2.load_bulk(kb_path) - _print_kb(kb2) print() + _print_kb(kb2) def _print_kb(kb): @@ -126,6 +110,5 @@ if __name__ == "__main__": plac.call(main) # Expected output: - # 2 kb entities: ['Q2146908', 'Q7381115'] # 1 kb aliases: ['Russ Cochran'] diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index dd7c3a1b2..c7eba8a30 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -1,15 +1,15 @@ #!/usr/bin/env python # coding: utf8 -"""Example of training spaCy's entity linker, starting off with an 
-existing model and a pre-defined knowledge base. +"""Example of training spaCy's entity linker, starting off with a predefined +knowledge base and corresponding vocab, and a blank English model. For more details, see the documentation: * Training: https://spacy.io/usage/training * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking -Compatible with: spaCy v2.2.3 -Last tested with: v2.2.3 +Compatible with: spaCy v2.2.4 +Last tested with: v2.2.4 """ from __future__ import unicode_literals, print_function @@ -17,13 +17,11 @@ import plac import random from pathlib import Path -from spacy.symbols import PERSON from spacy.vocab import Vocab import spacy from spacy.kb import KnowledgeBase from spacy.pipeline import EntityRuler -from spacy.tokens import Span from spacy.util import minibatch, compounding diff --git a/spacy/tests/regression/test_issue5314.py b/spacy/tests/regression/test_issue5314.py deleted file mode 100644 index 5bb817d5c..000000000 --- a/spacy/tests/regression/test_issue5314.py +++ /dev/null @@ -1,18 +0,0 @@ -import pytest - -from bin.wiki_entity_linking.wikipedia_processor import _process_wp_text - -old_format_text = """[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn.""" -new_format_text = """[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn.""" -potential_future_format = """[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn.""" - - -@pytest.mark.parametrize( - "text", [old_format_text, new_format_text, potential_future_format] -) -def test_issue5314(text): - title = "Arkæologi" - clean_text, _ = _process_wp_text(title, text, {}) - - expected_text = "Arkæologi er studiet af tidligere tiders menneskelige aktivitet, primært gennem studiet af menneskets materielle levn." - assert clean_text.strip() == expected_text diff --git a/website/docs/usage/examples.md b/website/docs/usage/examples.md index 96dc7627d..854b2d42b 100644 --- a/website/docs/usage/examples.md +++ b/website/docs/usage/examples.md @@ -111,6 +111,27 @@ start. https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entity_type.py ``` +### Creating a Knowledge Base for Named Entity Linking {#kb} + +This example shows how to create a knowledge base in spaCy, +which is needed to implement entity linking functionality. +It requires as input a spaCy model with pretrained word vectors, +and it stores the KB to file (if an `output_dir` is provided). + +```python +https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py +``` + +### Training spaCy's Named Entity Linker {#nel} + +This example shows how to train spaCy's entity linker with your own custom +examples, starting off with a predefined knowledge base and its vocab, +and using a blank `English` class. 
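As a rough illustration of what that linked script does internally — using the two example QIDs from `create_kb.py`, made-up training sentences, and assuming the pipeline already contains an `entity_linker` component with its KB set — the core training loop looks approximately like this:

```python
import random
from spacy.util import minibatch, compounding

# Illustrative examples only: each "links" dict maps a character span to
# candidate KB identifiers, scored 1.0 (correct) or 0.0 (incorrect).
TRAIN_DATA = [
    (
        "Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}},
    ),
    (
        "Russ Cochran captured his first major title.",
        {"links": {(0, 12): {"Q2146908": 1.0, "Q7381115": 0.0}}},
    ),
]


def train_entity_linker(nlp, n_iter=50):
    # Assumes nlp already has an "entity_linker" pipe with a loaded KB, plus
    # components (e.g. a sentencizer and an EntityRuler) that tag the
    # "Russ Cochran" mentions as entities.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # train only the entity linker
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=0.2, sgd=optimizer, losses=losses)
            print(itn, "Losses", losses)
```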
+ +```python +https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py +``` + ### Training spaCy's Dependency Parser {#parser} This example shows how to update spaCy's dependency parser, starting off with an diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 59712939a..d17e5a661 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -579,9 +579,7 @@ import DisplacyEntHtml from 'images/displacy-ent2.html' To ground the named entities into the "real world", spaCy provides functionality to perform entity linking, which resolves a textual entity to a unique -identifier from a knowledge base (KB). The -[processing scripts](https://github.com/explosion/spaCy/tree/master/bin/wiki_entity_linking) -we provide use WikiData identifiers, but you can create your own +identifier from a knowledge base (KB). You can create your own [`KnowledgeBase`](/api/kb) and [train a new Entity Linking model](/usage/training#entity-linker) using that custom-made KB. diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 479441edf..ecdc6720b 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -347,9 +347,9 @@ your data** to find a solution that works best for you. ### Updating the Named Entity Recognizer {#example-train-ner} This example shows how to update spaCy's entity recognizer with your own -examples, starting off with an existing, pretrained model, or from scratch -using a blank `Language` class. To do this, you'll need **example texts** and -the **character offsets** and **labels** of each entity contained in the texts. +examples, starting off with an existing, pretrained model, or from scratch using +a blank `Language` class. To do this, you'll need **example texts** and the +**character offsets** and **labels** of each entity contained in the texts. ```python https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py @@ -440,8 +440,8 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py training the parser. 2. **Add the dependency labels** to the parser using the [`add_label`](/api/dependencyparser#add_label) method. If you're starting off - with a pretrained spaCy model, this is usually not necessary – but it - doesn't hurt either, just to be safe. + with a pretrained spaCy model, this is usually not necessary – but it doesn't + hurt either, just to be safe. 3. **Shuffle and loop over** the examples. For each example, **update the model** by calling [`nlp.update`](/api/language#update), which steps through the words of the input. At each word, it makes a **prediction**. It then @@ -605,16 +605,16 @@ To train an entity linking model, you first need to define a knowledge base A KB consists of a list of entities with unique identifiers. Each such entity has an entity vector that will be used to measure similarity with the context in -which an entity is used. These vectors are pretrained and stored in the KB -before the entity linking model will be trained. +which an entity is used. These vectors have a fixed length and are stored in the +KB. The following example shows how to build a knowledge base from scratch, given a -list of entities and potential aliases. The script further demonstrates how to -pretrain and store the entity vectors. To run this example, the script needs -access to a `vocab` instance or an `nlp` model with pretrained word embeddings. 
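Concretely, the encoder-decoder pretraining step described in the deleted paragraph above is dropped in the rewrite that follows; a minimal sketch of the new approach — assuming a model with pretrained word vectors such as `en_core_web_lg`, and reusing the two example entities from the script — is:

```python
import spacy
from spacy.kb import KnowledgeBase

# Example entities from the script: QID -> (description, frequency)
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}

nlp = spacy.load("en_core_web_lg")  # assumption: any model with word vectors works
# reuse the dimensionality of the model's word vectors for the entity vectors
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=nlp.vocab.vectors.shape[1])

for qid, (desc, freq) in ENTITIES.items():
    # the entity vector is simply the averaged word vectors of the description
    kb.add_entity(entity=qid, freq=freq, entity_vector=nlp(desc).vector)

# both entities share one surface form; the prior probabilities are illustrative
kb.add_alias(
    alias="Russ Cochran",
    entities=["Q2146908", "Q7381115"],
    probabilities=[0.24, 0.7],
)
print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings())
print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings())
```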
+list of entities and potential aliases. The script requires an `nlp` model with +pretrained word vectors to obtain an encoding of an entity's description as its +vector. ```python -https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py +https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py ``` #### Step by step guide {#step-by-step-kb} From 63885c1836c219745e2fccc8ecacd2f357aa0341 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Apr 2020 12:54:57 +0200 Subject: [PATCH 074/131] Remove u string and auto-format [ci skip] --- website/meta/universe.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index f9638279c..50977b39c 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -364,7 +364,7 @@ "entity = Entity(keywords_list=['python', 'java platform'])", "nlp.add_pipe(entity, last=True)", "", - "doc = nlp(u\"I am a product manager for a java and python.\")", + "doc = nlp(\"I am a product manager for a java and python.\")", "assert doc._.has_entities == True", "assert doc[2:5]._.has_entities == True", "assert doc[0]._.is_entity == False", @@ -1653,10 +1653,10 @@ "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy.", "thumb": "https://i.imgur.com/ggVI02O.jpg", "image": "https://i.imgur.com/z1yhWQR.jpg", - "url": "https://telegram.me/pic2phrase_bot", + "url": "https://telegram.me/pic2phrase_bot", "author": "Yuli Vasiliev", "author_links": { - "twitter": "VasilievYuli", + "twitter": "VasilievYuli" }, "category": ["standalone", "conversational"] }, From cfdaf99b8029d6762730c5d5bd2b6f6c173c1241 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 29 Apr 2020 12:56:17 +0200 Subject: [PATCH 075/131] Fix passing of component configuration (#5374) * add kwargs to to_disk methods in docs - otherwise crashes on 'exclude' argument * add fix and test for Issue 5137 --- spacy/tests/regression/test_issue5137.py | 33 ++++++++++++++++++++++++ spacy/util.py | 1 + website/docs/usage/saving-loading.md | 4 +-- 3 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/regression/test_issue5137.py diff --git a/spacy/tests/regression/test_issue5137.py b/spacy/tests/regression/test_issue5137.py new file mode 100644 index 000000000..4b4e597d3 --- /dev/null +++ b/spacy/tests/regression/test_issue5137.py @@ -0,0 +1,33 @@ +import spacy +from spacy.language import Language +from spacy.lang.en import English +from spacy.tests.util import make_tempdir + + +def test_issue5137(): + class MyComponent(object): + name = "my_component" + + def __init__(self, nlp, **cfg): + self.nlp = nlp + self.categories = cfg.get("categories", "all_categories") + + def __call__(self, doc): + pass + + def to_disk(self, path, **kwargs): + pass + + def from_disk(self, path, **cfg): + pass + + Language.factories["my_component"] = lambda nlp, **cfg: MyComponent(nlp, **cfg) + + nlp = English() + nlp.add_pipe(nlp.create_pipe("my_component")) + assert nlp.get_pipe("my_component").categories == "all_categories" + + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + nlp2 = spacy.load(tmpdir, categories="my_categories") + assert nlp2.get_pipe("my_component").categories == "my_categories" diff --git a/spacy/util.py b/spacy/util.py index 609c0b572..d4cdca4e0 100644 --- a/spacy/util.py +++ 
b/spacy/util.py @@ -208,6 +208,7 @@ def load_model_from_path(model_path, meta=False, **overrides): for name in pipeline: if name not in disable: config = meta.get("pipeline_args", {}).get(name, {}) + config.update(overrides) factory = factories.get(name, name) component = nlp.create_pipe(factory, config=config) nlp.add_pipe(component, name=name) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 8e2c30d82..76a9773f6 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -216,7 +216,7 @@ class CustomComponent(object): # Add something to the component's data self.data.append(data) - def to_disk(self, path): + def to_disk(self, path, **kwargs): # This will receive the directory path + /my_component data_path = path / "data.json" with data_path.open("w", encoding="utf8") as f: @@ -461,7 +461,7 @@ model. When you save out a model using `nlp.to_disk` and the component exposes a `to_disk` method, it will be called with the disk path. ```python -def to_disk(self, path): +def to_disk(self, path, **kwargs): snek_path = path / "snek.txt" with snek_path.open("w", encoding="utf8") as snek_file: snek_file.write(self.snek) From bdff76deded8380c68b8cc209f60c1dea3034cf3 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 12:56:46 +0200 Subject: [PATCH 076/131] Various updates/additions to CLI scripts (#5362) * `debug-data`: determine coverage of provided vectors * `evaluate`: support `blank:lg` model to make it possible to just evaluate tokenization * `init-model`: add option to truncate vectors to N most frequent vectors from word2vec file * `train`: * if training on GPU, only run evaluation/timing on CPU in the first iteration * if training is aborted, exit with a non-0 exit status --- spacy/cli/debug_data.py | 31 ++++++++++++++++++++++++++----- spacy/cli/evaluate.py | 6 +++++- spacy/cli/init_model.py | 19 +++++++++++++++---- spacy/cli/train.py | 36 ++++++++++++++++++++---------------- website/docs/api/cli.md | 3 ++- 5 files changed, 68 insertions(+), 27 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index c5e1ff6cf..279f34f16 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -108,9 +108,11 @@ def debug_data( msg.good("Corpus is loadable") # Create all gold data here to avoid iterating over the train_docs constantly - gold_train_data = _compile_gold(train_docs, pipeline) - gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline) - gold_dev_data = _compile_gold(dev_docs, pipeline) + gold_train_data = _compile_gold(train_docs, pipeline, nlp) + gold_train_unpreprocessed_data = _compile_gold( + train_docs_unpreprocessed, pipeline, nlp + ) + gold_dev_data = _compile_gold(dev_docs, pipeline, nlp) train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] @@ -182,6 +184,16 @@ def debug_data( nlp.vocab.vectors_length, ) ) + n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values()) + msg.warn( + "{} words in training data without vectors ({:0.2f}%)".format( + n_missing_vectors, + n_missing_vectors / gold_train_data["n_words"], + ), + ) + msg.text( + "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose, + ) else: msg.info("No word vectors present in the model") @@ -562,7 +574,7 @@ def _load_file(file_path, msg): ) -def _compile_gold(train_docs, pipeline): +def _compile_gold(train_docs, pipeline, nlp): data = { 
"ner": Counter(), "cats": Counter(), @@ -574,6 +586,7 @@ def _compile_gold(train_docs, pipeline): "punct_ents": 0, "n_words": 0, "n_misaligned_words": 0, + "words_missing_vectors": Counter(), "n_sents": 0, "n_nonproj": 0, "n_cycles": 0, @@ -586,6 +599,10 @@ def _compile_gold(train_docs, pipeline): data["n_words"] += len(valid_words) data["n_misaligned_words"] += len(gold.words) - len(valid_words) data["texts"].add(doc.text) + if len(nlp.vocab.vectors): + for word in valid_words: + if nlp.vocab.strings[word] not in nlp.vocab.vectors: + data["words_missing_vectors"].update([word]) if "ner" in pipeline: for i, label in enumerate(gold.ner): if label is None: @@ -636,7 +653,11 @@ def _format_labels(labels, counts=False): def _get_examples_without_label(data, label): count = 0 for doc, gold in data: - labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")] + labels = [ + label.split("-")[1] + for label in gold.ner + if label is not None and label not in ("O", "-") + ] if label not in labels: count += 1 return count diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index c24e37038..8a84684e5 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals, division, print_function import plac +import spacy from timeit import default_timer as timer from wasabi import msg @@ -43,7 +44,10 @@ def evaluate( if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = GoldCorpus(data_path, data_path) - nlp = util.load_model(model) + if model.startswith("blank:"): + nlp = spacy.blank(model.replace("blank:", "")) + else: + nlp = util.load_model(model) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) begin = timer() scorer = nlp.evaluate(dev_docs, verbose=False) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 2e0aeb239..31d627e9b 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -35,6 +35,12 @@ DEFAULT_OOV_PROB = -20 jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path), clusters_loc=("Optional location of brown clusters data", "option", "c", str), vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str), + truncate_vectors=( + "Optional number of vectors to truncate to when reading in vectors file", + "option", + "t", + int, + ), prune_vectors=("Optional number of vectors to prune to", "option", "V", int), vectors_name=( "Optional name for the word vectors, e.g. 
en_core_web_lg.vectors", @@ -51,6 +57,7 @@ def init_model( clusters_loc=None, jsonl_loc=None, vectors_loc=None, + truncate_vectors=0, prune_vectors=-1, vectors_name=None, model_name=None, @@ -88,7 +95,7 @@ def init_model( nlp = create_model(lang, lex_attrs, name=model_name) msg.good("Successfully created model") if vectors_loc is not None: - add_vectors(nlp, vectors_loc, prune_vectors, vectors_name) + add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name) vec_added = len(nlp.vocab.vectors) lex_added = len(nlp.vocab) msg.good( @@ -169,7 +176,7 @@ def create_model(lang, lex_attrs, name=None): return nlp -def add_vectors(nlp, vectors_loc, prune_vectors, name=None): +def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): vectors_loc = ensure_path(vectors_loc) if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) @@ -179,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None): else: if vectors_loc: with msg.loading("Reading vectors from {}".format(vectors_loc)): - vectors_data, vector_keys = read_vectors(vectors_loc) + vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors) msg.good("Loaded vectors from {}".format(vectors_loc)) else: vectors_data, vector_keys = (None, None) @@ -199,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None): nlp.vocab.prune_vectors(prune_vectors) -def read_vectors(vectors_loc): +def read_vectors(vectors_loc, truncate_vectors=0): f = open_file(vectors_loc) shape = tuple(int(size) for size in next(f).split()) + if truncate_vectors >= 1: + shape = (truncate_vectors, shape[1]) vectors_data = numpy.zeros(shape=shape, dtype="f") vectors_keys = [] for i, line in enumerate(tqdm(f)): @@ -212,6 +221,8 @@ def read_vectors(vectors_loc): msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1) vectors_data[i] = numpy.asarray(pieces, dtype="f") vectors_keys.append(word) + if i == truncate_vectors - 1: + break return vectors_data, vectors_keys diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8fc475d24..db58b22df 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -454,22 +454,25 @@ def train( cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) - with Model.use_device("cpu"): - nlp_loaded = util.load_model_from_path(epoch_model_path) - for name, component in nlp_loaded.pipeline: - if hasattr(component, "cfg"): - component.cfg["beam_width"] = beam_width - dev_docs = list( - corpus.dev_docs( - nlp_loaded, - gold_preproc=gold_preproc, - ignore_misaligned=True, + # Only evaluate on CPU in the first iteration (for + # timing) if GPU is enabled + if i >= 1: + with Model.use_device("cpu"): + nlp_loaded = util.load_model_from_path(epoch_model_path) + for name, component in nlp_loaded.pipeline: + if hasattr(component, "cfg"): + component.cfg["beam_width"] = beam_width + dev_docs = list( + corpus.dev_docs( + nlp_loaded, + gold_preproc=gold_preproc, + ignore_misaligned=True, + ) ) - ) - start_time = timer() - scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) - end_time = timer() - cpu_wps = nwords / (end_time - start_time) + start_time = timer() + scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) + end_time = timer() + cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) @@ -550,7 +553,8 @@ def train( except Exception as e: msg.warn( "Aborting and saving the 
final best model. " - "Encountered exception: {}".format(e) + "Encountered exception: {}".format(e), + exits=1, ) finally: best_pipes = nlp.pipe_names diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 15691c4f8..505977be9 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -547,7 +547,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] | `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | | `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | | `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | | `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | | **CREATES** | model | A spaCy model containing the vocab and vectors. | From 3f43c73d37a5c175d0eabb35b9627a18aacd782a Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 12:57:30 +0200 Subject: [PATCH 077/131] Normalize TokenC.sent_start values for Matcher (#5346) Normalize TokenC.sent_start values to booleans for the `Matcher`. --- spacy/matcher/matcher.pyx | 10 +++++----- spacy/tokens/doc.pxd | 1 + spacy/tokens/doc.pyx | 10 ++++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 7f3c3488f..4cfab915f 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -14,7 +14,7 @@ import warnings from ..typedefs cimport attr_t from ..structs cimport TokenC from ..vocab cimport Vocab -from ..tokens.doc cimport Doc, get_token_attr +from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA @@ -549,7 +549,7 @@ cdef char get_is_match(PatternStateC state, spec = state.pattern if spec.nr_attr > 0: for attr in spec.attrs[:spec.nr_attr]: - if get_token_attr(token, attr.attr) != attr.value: + if get_token_attr_for_matcher(token, attr.attr) != attr.value: return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: @@ -720,7 +720,7 @@ class _RegexPredicate(object): if self.is_extension: value = token._.get(self.attr) else: - value = token.vocab.strings[get_token_attr(token.c, self.attr)] + value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] return bool(self.value.search(value)) @@ -741,7 +741,7 @@ class _SetMemberPredicate(object): if self.is_extension: value = get_string_id(token._.get(self.attr)) else: - value = get_token_attr(token.c, self.attr) + value = get_token_attr_for_matcher(token.c, self.attr) if self.predicate == "IN": return value in self.value else: @@ -768,7 +768,7 @@ class _ComparisonPredicate(object): if self.is_extension: value = token._.get(self.attr) else: - 
value = get_token_attr(token.c, self.attr) + value = get_token_attr_for_matcher(token.c, self.attr) if self.predicate == "==": return value == self.value if self.predicate == "!=": diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 7f231887f..6536d271d 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -8,6 +8,7 @@ from ..attrs cimport attr_id_t cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil +cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil ctypedef const LexemeC* const_Lexeme_ptr diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 867c2bf6b..4dc438695 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -79,6 +79,16 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return Lexeme.get_struct_attr(token.lex, feat_name) +cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil: + if feat_name == SENT_START: + if token.sent_start == 1: + return True + else: + return False + else: + return get_token_attr(token, feat_name) + + def _get_chunker(lang): try: cls = util.get_lang_class(lang) From 74da669326eaa45d878d303643abe88cf4c84d60 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 13:01:25 +0200 Subject: [PATCH 078/131] Fix problems with lower and whitespace in variants (#5361) * Initialize lower flag explicitly * Handle whitespace words from GoldParse correctly when creating raw text with orth variants * Return the text with original casing if anything goes wrong --- spacy/gold.pyx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index e8274563f..034bba08f 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -331,6 +331,8 @@ class GoldCorpus(object): def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): if random.random() >= orth_variant_level: return raw, paragraph_tuples + raw_orig = str(raw) + lower = False if random.random() >= 0.5: lower = True if raw is not None: @@ -391,8 +393,11 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): ids, words, tags, heads, labels, ner = sent_tuples for word in words: match_found = False + # skip whitespace words + if word.isspace(): + match_found = True # add identical word - if word not in variants and raw[raw_idx:].startswith(word): + elif word not in variants and raw[raw_idx:].startswith(word): variant_raw += word raw_idx += len(word) match_found = True @@ -407,7 +412,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): # something went wrong, abort # (add a warning message?) if not match_found: - return raw, paragraph_tuples + return raw_orig, paragraph_tuples # add following whitespace while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): variant_raw += raw[raw_idx] From 8602daba85bc412918e5cca2101ec15d22b950ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Rodr=C3=ADguez=20Medina?= Date: Wed, 29 Apr 2020 21:25:22 +0200 Subject: [PATCH 079/131] Swedish like_num (#5371) * Sign contributor agreement. * Add like_num functionality to Swedish. 
* Update spacy/tests/lang/sv/test_lex_attrs.py Co-Authored-By: Sofie Van Landeghem * Update contributor agreement Co-authored-by: Sofie Van Landeghem --- .github/contributors/vondersam.md | 106 ++++++++++++++++++++++++++ spacy/lang/sv/__init__.py | 2 + spacy/lang/sv/lex_attrs.py | 62 +++++++++++++++ spacy/tests/lang/sv/test_lex_attrs.py | 33 ++++++++ 4 files changed, 203 insertions(+) create mode 100644 .github/contributors/vondersam.md create mode 100644 spacy/lang/sv/lex_attrs.py create mode 100644 spacy/tests/lang/sv/test_lex_attrs.py diff --git a/.github/contributors/vondersam.md b/.github/contributors/vondersam.md new file mode 100644 index 000000000..8add70330 --- /dev/null +++ b/.github/contributors/vondersam.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------| +| Name | Samuel Rodríguez Medina | +| Company name (if applicable) | | +| Title or role (if applicable) | Computational linguist | +| Date | 28 April 2020 | +| GitHub username | vondersam | +| Website (optional) | | diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 671eefca0..3a749eeee 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS from .morph_rules import MORPH_RULES # Punctuation stolen from Danish @@ -19,6 +20,7 @@ from .syntax_iterators import SYNTAX_ITERATORS class SwedishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "sv" lex_attr_getters[NORM] = add_lookups( Language.Defaults.lex_attr_getters[NORM], BASE_NORMS diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py new file mode 100644 index 000000000..4b5278c7b --- /dev/null +++ b/spacy/lang/sv/lex_attrs.py @@ -0,0 +1,62 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + + +_num_words = [ + "noll", + "en", + "ett", + "två", + "tre", + "fyra", + "fem", + "sex", + "sju", + "åtta", + "nio", + "tio", + "elva", + "tolv", + "tretton", + "fjorton", + "femton", + "sexton", + "sjutton", + "arton", + "nitton", + "tjugo", + "trettio", + "fyrtio", + "femtio", + "sextio", + "sjuttio", + "åttio", + "nittio", + "hundra", + "tusen", + "miljon", + "miljard", + "biljon", + "biljard", + "kvadriljon" +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/tests/lang/sv/test_lex_attrs.py b/spacy/tests/lang/sv/test_lex_attrs.py new file mode 100644 index 000000000..abe6b0f7b --- /dev/null +++ b/spacy/tests/lang/sv/test_lex_attrs.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.sv.lex_attrs import like_num + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10.000", True), + ("10.00", True), + ("999,0", True), + ("en", True), + ("två", True), + ("miljard", True), + ("hund", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(sv_tokenizer, text, match): + tokens = sv_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.parametrize("word", ["elva"]) +def test_sv_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) From ebaed7dcfa31ced738212f726d17285049291d7a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 30 Apr 2020 10:17:06 +0200 Subject: [PATCH 080/131] Few more updates to the EL documentation --- examples/training/train_entity_linker.py | 2 +- website/docs/usage/training.md | 45 ++++++++++++------------ 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index c7eba8a30..3a8deb7a0 100644 
--- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -64,7 +64,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): """Create a blank model with the specified vocab, set up the pipeline and train the entity linker. The `vocab` should be the one used during creation of the KB.""" vocab = Vocab().from_disk(vocab_path) - # create blank Language class with correct vocab + # create blank English model with correct vocab nlp = spacy.blank("en", vocab=vocab) nlp.vocab.vectors.name = "spacy_pretrained_vectors" print("Created blank 'en' model with vocab from '%s'" % vocab_path) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index ecdc6720b..0be14df69 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -619,25 +619,24 @@ https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py #### Step by step guide {#step-by-step-kb} -1. **Load the model** you want to start with, or create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language and - a pre-defined [`vocab`](/api/vocab) object. -2. **Pretrain the entity embeddings** by running the descriptions of the - entities through a simple encoder-decoder network. The current implementation - requires the `nlp` model to have access to pretrained word embeddings, but a - custom implementation of this encoding step can also be used. -3. **Construct the KB** by defining all entities with their pretrained vectors, - and all aliases with their prior probabilities. +1. **Load the model** you want to start with. It should contain pretrained word + vectors. +2. **Obtain the entity embeddings** by running the descriptions of the entities + through the `nlp` model and taking the average of all words with + `nlp(desc).vector`. At this point, a custom encoding step can also be used. +3. **Construct the KB** by defining all entities with their embeddings, and all + aliases with their prior probabilities. 4. **Save** the KB using [`kb.dump`](/api/kb#dump). -5. **Test** the KB to make sure the entities were added correctly. +5. **Print** the contents of the KB to make sure the entities were added + correctly. ### Training an entity linking model {#entity-linker-model} This example shows how to create an entity linker pipe using a previously -created knowledge base. The entity linker pipe is then trained with your own -examples. To do so, you'll need to provide **example texts**, and the -**character offsets** and **knowledge base identifiers** of each entity -contained in the texts. +created knowledge base. The entity linker is then trained with a set of custom +examples. To do so, you need to provide **example texts**, and the **character +offsets** and **knowledge base identifiers** of each entity contained in the +texts. ```python https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py @@ -647,14 +646,16 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_li 1. **Load the KB** you want to start with, and specify the path to the `Vocab` object that was used to create this KB. Then, create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. - Don't forget to add the KB to the entity linker, and to add the entity linker - to the pipeline. 
In practical applications, you will want a more advanced
-   pipeline including also a component for
-   [named entity recognition](/usage/training#ner). If you're using a model with
-   additional components, make sure to disable all other pipeline components
-   during training using [`nlp.disable_pipes`](/api/language#disable_pipes).
-   This way, you'll only be training the entity linker.
+   [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. Add
+   a component for recognizing sentences and one for identifying relevant
+   entities. In practical applications, you will want a more advanced pipeline
+   including also a component for
+   [named entity recognition](/usage/training#ner). Then, create a new entity
+   linker component, add the KB to it, and then add the entity linker to the
+   pipeline. If you're using a model with additional components, make sure to
+   disable all other pipeline components during training using
+   [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be
+   training the entity linker.
 2. **Shuffle and loop over** the examples. For each example, **update the
    model** by calling [`nlp.update`](/api/language#update), which steps through
    the annotated examples of the input. For each combination of a mention in

From 148b036e0cae9eebb6968cea5ecede1ebc7205a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Samuel=20Rodr=C3=ADguez=20Medina?=
Date: Thu, 30 Apr 2020 11:13:23 +0200
Subject: [PATCH 081/131] Spanish like num improvement (#5381)

* Add tests for Spanish like_num.

* Add missing numbers in Spanish lexical attributes for like_num.

* Modify Spanish test function name.

* Add contributor agreement.
---
 spacy/lang/es/lex_attrs.py       |  9 +++++++++
 spacy/tests/lang/es/test_text.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py
index 03ada1f43..632a638fc 100644
--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@@ -26,6 +26,15 @@ _num_words = [
     "dieciocho",
     "diecinueve",
     "veinte",
+    "veintiuno",
+    "veintidós",
+    "veintitrés",
+    "veinticuatro",
+    "veinticinco",
+    "veintiséis",
+    "veintisiete",
+    "veintiocho",
+    "veintinueve",
     "treinta",
     "cuarenta",
     "cincuenta",
diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py
index acd572b48..e237f922d 100644
--- a/spacy/tests/lang/es/test_text.py
+++ b/spacy/tests/lang/es/test_text.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
+from spacy.lang.es.lex_attrs import like_num
 
 
 def test_es_tokenizer_handles_long_text(es_tokenizer):
@@ -33,3 +34,32 @@ en Montevideo y que pregona las bondades de la vida austera."""
 def test_es_tokenizer_handles_cnts(es_tokenizer, text, length):
     tokens = es_tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("10.000", True),
+        ("1000", True),
+        ("999,0", True),
+        ("uno", True),
+        ("dos", True),
+        ("billón", True),
+        ("veintiséis", True),
+        ("perro", False),
+        (",", False),
+        ("1/2", True),
+    ],
+)
+def test_lex_attrs_like_number(es_tokenizer, text, match):
+    tokens = es_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
+
+
+@pytest.mark.parametrize("word", ["once"])
+def test_es_lex_attrs_capitals(word):
+    assert like_num(word)
+    assert like_num(word.upper())
\ No newline at end of file

From c045a9c7f637f85f7beccdae48a4cb765516d558 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 1 May 2020 12:05:33 +0200
Subject: [PATCH 082/131] 
Fix logic in train CLI timing eval on GPU (#5387) Run CPU timing in first iteration only --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index db58b22df..6e6423131 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -456,7 +456,7 @@ def train( gpu_wps = nwords / (end_time - start_time) # Only evaluate on CPU in the first iteration (for # timing) if GPU is enabled - if i >= 1: + if i == 0: with Model.use_device("cpu"): nlp_loaded = util.load_model_from_path(epoch_model_path) for name, component in nlp_loaded.pipeline: From 5e55bfa8214835cf8d407ca6a6a5f8797b4ea005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Rodr=C3=ADguez=20Medina?= Date: Tue, 5 May 2020 14:06:27 +0200 Subject: [PATCH 083/131] Fixed tests for Swedish that were written in Danish. (#5395) --- spacy/tests/lang/sv/test_exceptions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py index c977a4183..7c6fd5464 100644 --- a/spacy/tests/lang/sv/test_exceptions.py +++ b/spacy/tests/lang/sv/test_exceptions.py @@ -47,15 +47,15 @@ def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text): def test_sv_tokenizer_handles_exc_in_text(sv_tokenizer): - text = "Det er bl.a. ikke meningen" + text = "Det är bl.a. inte meningen" tokens = sv_tokenizer(text) assert len(tokens) == 5 assert tokens[2].text == "bl.a." def test_sv_tokenizer_handles_custom_base_exc(sv_tokenizer): - text = "Her er noget du kan kigge i." + text = "Här är något du kan titta på." tokens = sv_tokenizer(text) assert len(tokens) == 8 - assert tokens[6].text == "i" + assert tokens[6].text == "på" assert tokens[7].text == "." From a2345618f111552e141e128e1d48dd1d0a672a6b Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 8 May 2020 10:25:02 +0200 Subject: [PATCH 084/131] Fix Token API docs from #5375 (#5418) --- website/docs/api/token.md | 1 + 1 file changed, 1 insertion(+) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 7280ac796..24a9dce79 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -351,6 +351,7 @@ property to `0` for the first word of the document. - assert doc[4].sent_start == 1 + assert doc[4].is_sent_start == True ``` + ## Token.is_sent_end {#is_sent_end tag="property" new="2"} From 4a15b559bab705c11acc7d5fce62a73daa5135e7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 8 May 2020 10:36:25 +0200 Subject: [PATCH 085/131] Clarify Token.pos as UPOS (#5419) --- website/docs/api/token.md | 4 ++-- website/docs/usage/101/_pos-deps.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 24a9dce79..69dac23d6 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -477,8 +477,8 @@ The L2 norm of the token's vector representation. | `like_email` | bool | Does the token resemble an email address? | | `is_oov` | bool | Is the token out-of-vocabulary? | | `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech. | -| `pos_` | unicode | Coarse-grained part-of-speech. | +| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `tag` | int | Fine-grained part-of-speech. 
| | `tag_` | unicode | Fine-grained part-of-speech. | | `dep` | int | Syntactic dependency relation. | diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md index 9d04d6ffc..1a438e424 100644 --- a/website/docs/usage/101/_pos-deps.md +++ b/website/docs/usage/101/_pos-deps.md @@ -25,7 +25,7 @@ for token in doc: > - **Text:** The original word text. > - **Lemma:** The base form of the word. -> - **POS:** The simple part-of-speech tag. +> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/) part-of-speech tag. > - **Tag:** The detailed part-of-speech tag. > - **Dep:** Syntactic dependency, i.e. the relation between tokens. > - **Shape:** The word shape – capitalization, punctuation, digits. From c963e269bac9c41222d81abf82131b1937912325 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 8 May 2020 11:21:46 +0200 Subject: [PATCH 086/131] Add method to update / reset pkuseg user dict (#5404) --- spacy/lang/zh/__init__.py | 16 ++++++++++++++++ spacy/tests/lang/zh/test_tokenizer.py | 13 +++++++++++++ 2 files changed, 29 insertions(+) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 701e696a4..ed0b3eb74 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -104,6 +104,22 @@ class ChineseTokenizer(DummyTokenizer): (words, spaces) = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) + def pkuseg_update_user_dict(self, words, reset=False): + if self.pkuseg_seg: + if reset: + try: + import pkuseg + self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None) + except ImportError: + if self.use_pkuseg: + msg = ( + "pkuseg not installed: unable to reset pkuseg " + "user dict. Please " + _PKUSEG_INSTALL_MSG + ) + raise ImportError(msg) + for word in words: + self.pkuseg_seg.preprocesser.insert(word.strip(), '') + def _get_config(self): config = OrderedDict( ( diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index bff7b1ed1..035798aa1 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +from spacy.lang.zh import _get_pkuseg_trie_data # fmt: off @@ -39,6 +40,18 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens): assert tokens == expected_tokens +def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg): + user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie) + zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"]) + updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie) + assert len(user_dict) == len(updated_user_dict) - 1 + + # reset user dict + zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True) + reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie) + assert len(reset_user_dict) == 0 + + def test_extra_spaces(zh_tokenizer_char): # note: three spaces after "I" tokens = zh_tokenizer_char("I like cheese.") From d4cc18b7464e6713d5f0d6f368190cfbdd5c1e18 Mon Sep 17 00:00:00 2001 From: Travis Hoppe Date: Fri, 8 May 2020 02:28:54 -0700 Subject: [PATCH 087/131] Added author information for NLPre (#5414) * Add author links for NLPre and update category * Add contributor statement --- .github/contributors/thoppe.md | 106 +++++++++++++++++++++++++++++++++ website/meta/universe.json | 8 ++- 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 
.github/contributors/thoppe.md diff --git a/.github/contributors/thoppe.md b/.github/contributors/thoppe.md new file mode 100644 index 000000000..9271a2601 --- /dev/null +++ b/.github/contributors/thoppe.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Travis Hoppe | +| Company name (if applicable) | | +| Title or role (if applicable) | Data Scientist | +| Date | 07 May 2020 | +| GitHub username | thoppe | +| Website (optional) | http://thoppe.github.io/ | diff --git a/website/meta/universe.json b/website/meta/universe.json index 50977b39c..cf587f5f0 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -114,7 +114,13 @@ " text = f(text)", "print(text)" ], - "category": ["scientific"] + "category": ["scientific", "biomedical"], + "author": "Travis Hoppe", + "author_links": { + "github": "thoppe", + "twitter":"metasemantic", + "website" : "http://thoppe.github.io/" + } }, { "id": "Chatterbot", From 440b81bddc24669ffe89ef7501fb8c75f98b60d2 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 8 May 2020 15:10:57 +0200 Subject: [PATCH 088/131] Improve exceptions for 'd (would/had) in English (#5379) Instead of treating `'d` in contractions like `I'd` as `would` in all cases in the tokenizer exceptions, leave the tagging and lemmatization up to later components. 
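As a minimal sketch of the intended effect (not part of the patch): the `'d` token now leaves the tokenizer without a hard-coded `would` analysis, so a statistical tagger can resolve it in context. The example assumes an English model such as `en_core_web_sm` is installed; the model name is illustrative.

```python
# Sketch only: assumes spaCy v2 with the illustrative model "en_core_web_sm".
import spacy

nlp = spacy.load("en_core_web_sm")
for text in ("I'd like some tea.", "I'd already left."):
    doc = nlp(text)
    # "'d" keeps NORM "'d"; its tag and lemma ("would" vs. "have") are now
    # decided by later pipeline components, not by a tokenizer exception.
    print([(t.text, t.tag_, t.lemma_) for t in doc])
```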
--- spacy/lang/en/tokenizer_exceptions.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index c45197771..62de81912 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -77,12 +77,12 @@ for pron in ["i", "you", "he", "she", "it", "we", "they"]: _exc[orth + "'d"] = [ {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, + {ORTH: "'d", NORM: "'d"}, ] _exc[orth + "d"] = [ {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, + {ORTH: "d", NORM: "'d"}, ] _exc[orth + "'d've"] = [ @@ -195,7 +195,10 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: {ORTH: "'d", NORM: "'d"}, ] - _exc[orth + "d"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "d"}] + _exc[orth + "d"] = [ + {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: "d", NORM: "'d"} + ] _exc[orth + "'d've"] = [ {ORTH: orth, LEMMA: word, NORM: word}, From 24e7108f80dd9e4a882b22fe62beda89b73158b6 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 13 May 2020 10:25:05 +0200 Subject: [PATCH 089/131] Modify array type to accommodate OOV_RANK (#5429) Modify indices array type in `Vocab.prune_vectors` to accommodate OOV_RANK index as max(uint64). --- spacy/vocab.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 0f3223025..e31d26f85 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -315,7 +315,7 @@ cdef class Vocab: priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth) for lex in self if lex.orth in self.vectors.key2row] priority.sort() - indices = xp.asarray([i for (prob, i, key) in priority], dtype="i") + indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64") keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) From 07639dd6ac9db6f874d1f01ccb5e37a910924feb Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 13 May 2020 10:25:54 +0200 Subject: [PATCH 090/131] Remove TAG from da/sv tokenizer exceptions (#5428) Remove `TAG` value from Danish and Swedish tokenizer exceptions because it may not be included in a tag map (and these settings are problematic as tokenizer exceptions anyway). --- spacy/lang/da/tokenizer_exceptions.py | 6 +++--- spacy/lang/sv/tokenizer_exceptions.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index 89b083186..9e4637bfb 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -6,7 +6,7 @@ Source: https://forkortelse.dk/ and various others. from __future__ import unicode_literals -from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT +from ...symbols import ORTH, LEMMA, NORM _exc = {} @@ -52,7 +52,7 @@ for exc_data in [ {ORTH: "Ons.", LEMMA: "onsdag"}, {ORTH: "Fre.", LEMMA: "fredag"}, {ORTH: "Lør.", LEMMA: "lørdag"}, - {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"}, + {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"}, ]: _exc[exc_data[ORTH]] = [exc_data] @@ -577,7 +577,7 @@ for h in range(1, 31 + 1): for period in ["."]: _exc["%d%s" % (h, period)] = [{ORTH: "%d." 
% h}] -_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]} +_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]} _exc.update(_custom_base_exc) TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index dd0976aa6..e95c67f37 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG +from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA _exc = {} @@ -155,6 +155,6 @@ for orth in ABBREVIATIONS: # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."), # should be tokenized as two separate tokens. for orth in ["i", "m"]: - _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: ".", TAG: PUNCT}] + _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}] TOKENIZER_EXCEPTIONS = _exc From 113e7981d0c60f1e200eb0177c97b282927f61ac Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 13 May 2020 22:08:28 +0200 Subject: [PATCH 091/131] Check that row is within bounds when adding vector (#5430) Check that row is within bounds for the vector data array when adding a vector. Don't add vectors with rank OOV_RANK in `init-model` (change is due to shift from OOV as 0 to OOV as OOV_RANK). --- spacy/cli/init_model.py | 2 +- spacy/errors.py | 2 ++ spacy/tests/vocab_vectors/test_vectors.py | 3 +++ spacy/vectors.pyx | 6 +++++- spacy/vocab.pyx | 2 +- 5 files changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 31d627e9b..618266633 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -181,7 +181,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) for lex in nlp.vocab: - if lex.rank: + if lex.rank and lex.rank != OOV_RANK: nlp.vocab.vectors.add(lex.orth, row=lex.rank) else: if vectors_loc: diff --git a/spacy/errors.py b/spacy/errors.py index 779980490..32ccd3df7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals + def add_codes(err_cls): """Add error codes to string messages via class attribute names.""" @@ -555,6 +556,7 @@ class Errors(object): E195 = ("Matcher can be called on {good} only, got {got}.") E196 = ("Refusing to write to token.is_sent_end. 
Sentence boundaries can " "only be fixed with token.is_sent_start.") + E197 = ("Row out of bounds, unable to add row {row} for key {key}.") @add_codes diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 8987b7c89..322ef462a 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -307,6 +307,9 @@ def test_vocab_add_vector(): dog = vocab["dog"] assert list(dog.vector) == [2.0, 2.0, 2.0] + with pytest.raises(ValueError): + vocab.vectors.add(vocab["hamster"].orth, row=1000000) + def test_vocab_prune_vectors(): vocab = Vocab(vectors_name="test_vocab_prune_vectors") diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index f3c20fb7f..2973ddb5b 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -9,6 +9,7 @@ import functools import numpy from collections import OrderedDict import srsly +import warnings from thinc.neural.util import get_array_module from thinc.neural._classes.model import Model @@ -303,7 +304,10 @@ cdef class Vectors: raise ValueError(Errors.E060.format(rows=self.data.shape[0], cols=self.data.shape[1])) row = deref(self._unset.begin()) - self.key2row[key] = row + if row < self.data.shape[0]: + self.key2row[key] = row + else: + raise ValueError(Errors.E197.format(row=row, key=key)) if vector is not None: self.data[row] = vector if self._unset.count(row): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e31d26f85..ef2e86bcc 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -319,7 +319,7 @@ cdef class Vocab: keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) - self.vectors = Vectors(data=keep, keys=keys, name=self.vectors.name) + self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) remap = {} for i, key in enumerate(keys[nr_row:]): From b04738903e3afc16f10bc3182c256742222ee3f6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 13 May 2020 22:08:50 +0200 Subject: [PATCH 092/131] prevent None in gold fields (#5425) * set gold fields to empty list instead of keeping them as None * add unit test --- spacy/gold.pyx | 10 +++++++++- spacy/tests/parser/test_ner.py | 27 ++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 034bba08f..4b8a4f52d 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -658,7 +658,15 @@ cdef class GoldParse: entdoc = None # avoid allocating memory if the doc does not contain any tokens - if self.length > 0: + if self.length == 0: + self.words = [] + self.tags = [] + self.heads = [] + self.labels = [] + self.ner = [] + self.morphology = [] + + else: if words is None: words = [token.text for token in doc] if tags is None: diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 8329391ca..244e9fa25 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -7,7 +7,7 @@ from spacy.lang.en import English from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown -from spacy.gold import GoldParse +from spacy.gold import GoldParse, minibatch from spacy.tokens import Doc @@ -174,6 +174,31 @@ def test_accept_blocked_token(): assert ner2.moves.is_valid(state2, "U-") +def test_train_empty(): + """Test 
that training an empty text does not throw errors.""" + train_data = [ + ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), + ("", {"entities": []}), + ] + + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("PERSON") + nlp.add_pipe(ner, last=True) + + nlp.begin_training() + for itn in range(2): + losses = {} + batches = minibatch(train_data) + for batch in batches: + texts, annotations = zip(*batch) + nlp.update( + texts, # batch of texts + annotations, # batch of annotations + losses=losses, + ) + + def test_overwrite_token(): nlp = English() ner1 = nlp.create_pipe("ner") From 9ce059dd067ecc3f097d04023e3cfa0d70d35bb8 Mon Sep 17 00:00:00 2001 From: Vishnu Priya VR Date: Thu, 14 May 2020 16:28:06 +0530 Subject: [PATCH 093/131] Limiting noun_chunks for specific languages (#5396) * Limiting noun_chunks for specific languages * Limiting noun_chunks for specific languages Contributor Agreement * Addressing review comments * Removed unused fixtures and imports * Add fa_tokenizer in test suite * Use fa_tokenizer in test * Undo extraneous reformatting Co-authored-by: adrianeboyd --- .github/contributors/vishnupriyavr.md | 106 ++++++++++++++++++++++++ spacy/lang/de/syntax_iterators.py | 5 ++ spacy/lang/el/syntax_iterators.py | 5 ++ spacy/lang/en/syntax_iterators.py | 5 ++ spacy/lang/es/syntax_iterators.py | 5 ++ spacy/lang/fa/syntax_iterators.py | 5 ++ spacy/lang/fr/syntax_iterators.py | 5 ++ spacy/lang/id/syntax_iterators.py | 5 ++ spacy/lang/nb/syntax_iterators.py | 5 ++ spacy/lang/sv/syntax_iterators.py | 5 ++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/de/test_noun_chunks.py | 16 ++++ spacy/tests/lang/el/test_noun_chunks.py | 16 ++++ spacy/tests/lang/en/test_noun_chunks.py | 15 ++++ spacy/tests/lang/es/test_noun_chunks.py | 16 ++++ spacy/tests/lang/fa/test_noun_chunks.py | 17 ++++ spacy/tests/lang/fr/test_noun_chunks.py | 16 ++++ spacy/tests/lang/id/test_noun_chunks.py | 16 ++++ spacy/tests/lang/nb/test_noun_chunks.py | 16 ++++ spacy/tests/lang/sv/test_noun_chunks.py | 13 +++ spacy/tokens/doc.pyx | 3 +- 21 files changed, 298 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/vishnupriyavr.md create mode 100644 spacy/tests/lang/de/test_noun_chunks.py create mode 100644 spacy/tests/lang/el/test_noun_chunks.py create mode 100644 spacy/tests/lang/es/test_noun_chunks.py create mode 100644 spacy/tests/lang/fa/test_noun_chunks.py create mode 100644 spacy/tests/lang/fr/test_noun_chunks.py create mode 100644 spacy/tests/lang/id/test_noun_chunks.py create mode 100644 spacy/tests/lang/nb/test_noun_chunks.py diff --git a/.github/contributors/vishnupriyavr.md b/.github/contributors/vishnupriyavr.md new file mode 100644 index 000000000..73657a772 --- /dev/null +++ b/.github/contributors/vishnupriyavr.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below.
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Vishnu Priya VR | +| Company name (if applicable) | Uniphore | +| Title or role (if applicable) | NLP/AI Engineer | +| Date | 2020-05-03 | +| GitHub username | vishnupriyavr | +| Website (optional) | | diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index 89d784a0c..13bb857ca 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors def noun_chunks(obj): @@ -28,6 +29,10 @@ def noun_chunks(obj): "app", ] doc = obj.doc # Ensure works on both Doc and Span. + + if not doc.is_parsed: + raise ValueError(Errors.E029) + np_label = doc.vocab.strings.add("NP") np_deps = set(doc.vocab.strings.add(label) for label in labels) close_app = doc.vocab.strings.add("nk") diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 5dfd44f07..f02619ac9 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors def noun_chunks(obj): @@ -14,6 +15,10 @@ def noun_chunks(obj): # Further improvement of the models will eliminate the need for this tag. labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] doc = obj.doc # Ensure works on both Doc and Span. + + if not doc.is_parsed: + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") nmod = doc.vocab.strings.add("nmod") diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index ed665ef29..5ff848124 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors def noun_chunks(obj): @@ -20,6 +21,10 @@ def noun_chunks(obj): "ROOT", ] doc = obj.doc # Ensure works on both Doc and Span. 
+ + if not doc.is_parsed: + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 6a78d86f7..0badddca1 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -2,10 +2,15 @@ from __future__ import unicode_literals from ...symbols import NOUN, PROPN, PRON, VERB, AUX +from ...errors import Errors def noun_chunks(obj): doc = obj.doc + + if not doc.is_parsed: + raise ValueError(Errors.E029) + if not len(doc): return np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index ed665ef29..5ff848124 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors def noun_chunks(obj): @@ -20,6 +21,10 @@ def noun_chunks(obj): "ROOT", ] doc = obj.doc # Ensure works on both Doc and Span. + + if not doc.is_parsed: + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 4712d34d9..9495dcf1e 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors def noun_chunks(obj): @@ -19,6 +20,10 @@ def noun_chunks(obj): "nmod:poss", ] doc = obj.doc # Ensure works on both Doc and Span. + + if not doc.is_parsed: + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 4712d34d9..9495dcf1e 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors def noun_chunks(obj): @@ -19,6 +20,10 @@ def noun_chunks(obj): "nmod:poss", ] doc = obj.doc # Ensure works on both Doc and Span. + + if not doc.is_parsed: + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 4712d34d9..9495dcf1e 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors def noun_chunks(obj): @@ -19,6 +20,10 @@ def noun_chunks(obj): "nmod:poss", ] doc = obj.doc # Ensure works on both Doc and Span. 
+ + if not doc.is_parsed: + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 7a82e6b59..148884efe 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors def noun_chunks(obj): @@ -20,6 +21,10 @@ def noun_chunks(obj): "nmod:poss", ] doc = obj.doc # Ensure works on both Doc and Span. + + if not doc.is_parsed: + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index e52c5155f..d26f0ce5c 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -88,6 +88,11 @@ def eu_tokenizer(): return get_lang_class("eu").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def fa_tokenizer(): + return get_lang_class("fa").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def fi_tokenizer(): return get_lang_class("fi").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py new file mode 100644 index 000000000..12ece84b5 --- /dev/null +++ b/spacy/tests/lang/de/test_noun_chunks.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_noun_chunks_is_parsed_de(de_tokenizer): + """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. + """ + doc = de_tokenizer("Er lag auf seinem") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py new file mode 100644 index 000000000..be14acc81 --- /dev/null +++ b/spacy/tests/lang/el/test_noun_chunks.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_noun_chunks_is_parsed_el(el_tokenizer): + """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. + """ + doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 7dc47f9cc..1109af150 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -6,9 +6,24 @@ from spacy.attrs import HEAD, DEP from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root from spacy.lang.en.syntax_iterators import SYNTAX_ITERATORS +import pytest + + from ...util import get_doc +def test_noun_chunks_is_parsed(en_tokenizer): + """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. 
+ """ + doc = en_tokenizer("This is a sentence") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) + + def test_en_noun_chunks_not_nested(en_vocab): words = ["Peter", "has", "chronic", "command", "and", "control", "issues"] heads = [1, 0, 4, 3, -1, -2, -5] diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py new file mode 100644 index 000000000..71069d313 --- /dev/null +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_noun_chunks_is_parsed_es(es_tokenizer): + """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. + """ + doc = es_tokenizer("en Oxford este verano") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py new file mode 100644 index 000000000..a98aae061 --- /dev/null +++ b/spacy/tests/lang/fa/test_noun_chunks.py @@ -0,0 +1,17 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_noun_chunks_is_parsed_fa(fa_tokenizer): + """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. + """ + + doc = fa_tokenizer("این یک جمله نمونه می باشد.") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py new file mode 100644 index 000000000..876bc0ea4 --- /dev/null +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_noun_chunks_is_parsed_fr(fr_tokenizer): + """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. + """ + doc = fr_tokenizer("trouver des travaux antérieurs") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py new file mode 100644 index 000000000..7bac808b3 --- /dev/null +++ b/spacy/tests/lang/id/test_noun_chunks.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_noun_chunks_is_parsed_id(id_tokenizer): + """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. 
+ """ + doc = id_tokenizer("sebelas") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py new file mode 100644 index 000000000..17ec6cfda --- /dev/null +++ b/spacy/tests/lang/nb/test_noun_chunks.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_noun_chunks_is_parsed_nb(nb_tokenizer): + """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. + """ + doc = nb_tokenizer("Smørsausen brukes bl.a. til") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index ac7c066ba..38086c255 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -2,9 +2,22 @@ from __future__ import unicode_literals import pytest + from ...util import get_doc +def test_noun_chunks_is_parsed_sv(sv_tokenizer): + """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. + """ + doc = sv_tokenizer("Studenten läste den bästa boken") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) + + SV_NP_TEST_EXAMPLES = [ ( "En student läste en bok", # A student read a book diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4dc438695..25a147208 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -597,8 +597,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#noun_chunks """ - if not self.is_parsed: - raise ValueError(Errors.E029) + # Accumulate the result before beginning to iterate over it. This # prevents the tokenisation from being changed out from under us # during the iteration. 
The tricky thing here is that Span accepts From a987e9e45d4084f30964a4cec9914ae6ed25a73c Mon Sep 17 00:00:00 2001 From: Ilia Ivanov Date: Thu, 14 May 2020 14:14:15 +0200 Subject: [PATCH 094/131] Fix ErrorsWithCodes().__class__ return value --- spacy/errors.py | 7 +++++-- spacy/tests/test_errors.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/test_errors.py diff --git a/spacy/errors.py b/spacy/errors.py index 32ccd3df7..b97ef3a8e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -7,8 +7,11 @@ def add_codes(err_cls): class ErrorsWithCodes(object): def __getattribute__(self, code): - msg = getattr(err_cls, code) - return "[{code}] {msg}".format(code=code, msg=msg) + if not code.startswith('__'): + msg = getattr(err_cls, code) + return "[{code}] {msg}".format(code=code, msg=msg) + else: + return super().__getattribute__(code) return ErrorsWithCodes() diff --git a/spacy/tests/test_errors.py b/spacy/tests/test_errors.py new file mode 100644 index 000000000..ba24f4456 --- /dev/null +++ b/spacy/tests/test_errors.py @@ -0,0 +1,13 @@ +from inspect import isclass + +from spacy.errors import add_codes + + +@add_codes +class Errors(object): + E001 = "error description" + + +def test_add_codes(): + assert Errors.E001 == "[E001] error description" + assert isclass(Errors.__class__) From 712d9d4820e902abe17b9b7a8ec5ac373b0b8e2d Mon Sep 17 00:00:00 2001 From: Ilia Ivanov Date: Thu, 14 May 2020 15:45:58 +0200 Subject: [PATCH 095/131] fixup! Fix ErrorsWithCodes().__class__ return value --- spacy/errors.py | 10 +++++----- spacy/tests/test_errors.py | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b97ef3a8e..d99c96922 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -5,13 +5,13 @@ from __future__ import unicode_literals def add_codes(err_cls): """Add error codes to string messages via class attribute names.""" - class ErrorsWithCodes(object): + class ErrorsWithCodes(err_cls): def __getattribute__(self, code): - if not code.startswith('__'): - msg = getattr(err_cls, code) - return "[{code}] {msg}".format(code=code, msg=msg) + msg = super().__getattribute__(code) + if code.startswith('__'): # python system attributes like __class__ + return msg else: - return super().__getattribute__(code) + return "[{code}] {msg}".format(code=code, msg=msg) return ErrorsWithCodes() diff --git a/spacy/tests/test_errors.py b/spacy/tests/test_errors.py index ba24f4456..1bd4eec7f 100644 --- a/spacy/tests/test_errors.py +++ b/spacy/tests/test_errors.py @@ -1,5 +1,7 @@ from inspect import isclass +import pytest + from spacy.errors import add_codes @@ -10,4 +12,6 @@ class Errors(object): def test_add_codes(): assert Errors.E001 == "[E001] error description" + with pytest.raises(AttributeError): + Errors.E002 assert isclass(Errors.__class__) From ee8fe37474ac9a0c092acc99ad1f13e8c4b97e2e Mon Sep 17 00:00:00 2001 From: Ilia Ivanov Date: Thu, 14 May 2020 15:59:06 +0200 Subject: [PATCH 096/131] Add ilivans' contributor agreement --- .github/contributors/ilivans.md | 106 ++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/ilivans.md diff --git a/.github/contributors/ilivans.md b/.github/contributors/ilivans.md new file mode 100644 index 000000000..d471fde48 --- /dev/null +++ b/.github/contributors/ilivans.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor 
Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Ilia Ivanov | +| Company name (if applicable) | Chattermill | +| Title or role (if applicable) | DL Engineer | +| Date | 2020-05-14 | +| GitHub username | ilivans | +| Website (optional) | | From 780b86934548661817813612debd50964b2e37d3 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 14 May 2020 16:51:03 +0200 Subject: [PATCH 097/131] Fix syntax iterators for Persian (#5437) --- spacy/lang/fa/__init__.py | 2 ++ spacy/tests/lang/fa/__init__.py | 0 2 files changed, 2 insertions(+) create mode 100644 spacy/tests/lang/fa/__init__.py diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 9d85f814a..c93bca671 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -10,6 +10,7 @@ from .lex_attrs import LEX_ATTRS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .punctuation import TOKENIZER_SUFFIXES +from .syntax_iterators import SYNTAX_ITERATORS class PersianDefaults(Language.Defaults): @@ -24,6 +25,7 @@ class PersianDefaults(Language.Defaults): tag_map = TAG_MAP suffixes = TOKENIZER_SUFFIXES writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} + syntax_iterators = SYNTAX_ITERATORS class Persian(Language): diff --git a/spacy/tests/lang/fa/__init__.py b/spacy/tests/lang/fa/__init__.py new file mode 100644 index 000000000..e69de29bb From e63880e0812b4bf45a8f4a96bc26c3f4a10d9fb7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 14 May 2020 18:22:51 +0200 Subject: [PATCH 098/131] Use Token.sent_start for Span.sent (#5439) Use `Token.sent_start` for sentence boundaries in `Span.sent` so that `Doc.sents` and `Span.sent` return the same sentence boundaries. 
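A small sketch of the behavior this change aligns (not part of the patch), using the rule-based sentencizer to set `token.is_sent_start`:

```python
# Sketch only, assuming spaCy v2: Span.sent is now derived from
# token.sent_start, so it matches the sentence that Doc.sents yields.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))
doc = nlp("This is one sentence. This is another one.")
span = doc[6:8]  # "is another", inside the second sentence
assert span.sent.text == list(doc.sents)[1].text
```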
--- spacy/tokens/span.pyx | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 347916a0a..2f1418a5b 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -389,19 +389,9 @@ cdef class Span: return self.doc.user_span_hooks["sent"](self) # This should raise if not parsed / no custom sentence boundaries self.doc.sents - # If doc is parsed we can use the deps to find the sentence - # otherwise we use the `sent_start` token attribute + # Use `sent_start` token attribute to find sentence boundaries cdef int n = 0 - cdef int i - if self.doc.is_parsed: - root = &self.doc.c[self.start] - while root.head != 0: - root += root.head - n += 1 - if n >= self.doc.length: - raise RuntimeError(Errors.E038) - return self.doc[root.l_edge:root.r_edge + 1] - elif self.doc.is_sentenced: + if self.doc.is_sentenced: # Find start of the sentence start = self.start while self.doc.c[start].sent_start != 1 and start > 0: From f49e2810e6ea5c8b848df5b0f393c27ee31bb7f4 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 14 May 2020 18:23:19 +0200 Subject: [PATCH 099/131] Add Polish lemmatizer (#5413) * Add Polish lemmatizer Contributed by @ryszardtuora * Add missing import --- setup.cfg | 2 +- spacy/lang/pl/__init__.py | 8 +++ spacy/lang/pl/lemmatizer.py | 107 ++++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 spacy/lang/pl/lemmatizer.py diff --git a/setup.cfg b/setup.cfg index 3e0acd12f..af3579f88 100644 --- a/setup.cfg +++ b/setup.cfg @@ -59,7 +59,7 @@ install_requires = [options.extras_require] lookups = - spacy_lookups_data>=0.0.5,<0.2.0 + spacy_lookups_data>=0.3.1,<0.4.0 cuda = cupy>=5.0.0b4,<9.0.0 cuda80 = diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 702a19063..0540bf535 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -6,12 +6,14 @@ from .punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .lemmatizer import PolishLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language from ...attrs import LANG, NORM from ...util import update_exc, add_lookups +from ...lookups import Lookups class PolishDefaults(Language.Defaults): @@ -26,6 +28,12 @@ class PolishDefaults(Language.Defaults): tag_map = TAG_MAP infixes = TOKENIZER_INFIXES + @classmethod + def create_lemmatizer(cls, nlp=None, lookups=None): + if lookups is None: + lookups = Lookups() + return PolishLemmatizer(lookups) + class Polish(Language): lang = "pl" diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py new file mode 100644 index 000000000..2be4b0fb7 --- /dev/null +++ b/spacy/lang/pl/lemmatizer.py @@ -0,0 +1,107 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...lemmatizer import Lemmatizer +from ...parts_of_speech import NAMES +from ...errors import Errors + + +class PolishLemmatizer(Lemmatizer): + # This lemmatizer implements lookup lemmatization based on + # the Morfeusz dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS + # It utilizes some prefix based improvements for + # verb and adjectives lemmatization, as well as case-sensitive + # lemmatization for nouns + def __init__(self, lookups, *args, **kwargs): + # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules + super().__init__(lookups) + 
self.lemma_lookups = {} + for tag in [ + "ADJ", + "ADP", + "ADV", + "AUX", + "NOUN", + "NUM", + "PART", + "PRON", + "VERB", + "X", + ]: + self.lemma_lookups[tag] = self.lookups.get_table( + "lemma_lookup_" + tag.lower(), {} + ) + self.lemma_lookups["DET"] = self.lemma_lookups["X"] + self.lemma_lookups["PROPN"] = self.lemma_lookups["NOUN"] + + def __call__(self, string, univ_pos, morphology=None): + if isinstance(univ_pos, int): + univ_pos = NAMES.get(univ_pos, "X") + univ_pos = univ_pos.upper() + + if univ_pos == "NOUN": + return self.lemmatize_noun(string, morphology) + + if univ_pos != "PROPN": + string = string.lower() + + if univ_pos == "ADJ": + return self.lemmatize_adj(string, morphology) + elif univ_pos == "VERB": + return self.lemmatize_verb(string, morphology) + + lemma_dict = self.lemma_lookups.get(univ_pos, {}) + return [lemma_dict.get(string, string.lower())] + + def lemmatize_adj(self, string, morphology): + # this method utilizes different procedures for adjectives + # with 'nie' and 'naj' prefixes + lemma_dict = self.lemma_lookups["ADJ"] + + if string[:3] == "nie": + search_string = string[3:] + if search_string[:3] == "naj": + naj_search_string = search_string[3:] + if naj_search_string in lemma_dict: + return [lemma_dict[naj_search_string]] + if search_string in lemma_dict: + return [lemma_dict[search_string]] + + if string[:3] == "naj": + naj_search_string = string[3:] + if naj_search_string in lemma_dict: + return [lemma_dict[naj_search_string]] + + return [lemma_dict.get(string, string)] + + def lemmatize_verb(self, string, morphology): + # this method utilizes a different procedure for verbs + # with 'nie' prefix + lemma_dict = self.lemma_lookups["VERB"] + + if string[:3] == "nie": + search_string = string[3:] + if search_string in lemma_dict: + return [lemma_dict[search_string]] + + return [lemma_dict.get(string, string)] + + def lemmatize_noun(self, string, morphology): + # this method is case-sensitive, in order to work + # for incorrectly tagged proper names + lemma_dict = self.lemma_lookups["NOUN"] + + if string != string.lower(): + if string.lower() in lemma_dict: + return [lemma_dict[string.lower()]] + elif string in lemma_dict: + return [lemma_dict[string]] + return [string.lower()] + + return [lemma_dict.get(string, string)] + + def lookup(self, string, orth=None): + return string.lower() + + def lemmatize(self, string, index, exceptions, rules): + raise NotImplementedError From 908dea39399bbc0c966c131796f339af5de54140 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 14 May 2020 18:26:12 +0200 Subject: [PATCH 100/131] Skip duplicate lexeme rank setting (#5401) Skip duplicate lexeme rank setting within `_fix_pretrained_vectors_name()`. 
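A rough sketch of what the flag avoids (not part of the patch): when a serialized model is loaded, each lexeme's `rank` is already stored, so re-deriving it from `vectors.key2row` is redundant. Illustrative only; this relies on spaCy v2 internals touched by this patch series.

```python
# Sketch only: uses the internal spacy._ml module as of this patch series.
import numpy
from spacy.vocab import Vocab
from spacy._ml import link_vectors_to_models

vocab = Vocab()
vocab.set_vector("cat", numpy.ones((4,), dtype="f"))  # rank set to its row
link_vectors_to_models(vocab, skip_rank=True)  # existing ranks left untouched
print(vocab["cat"].rank)  # still the row assigned above
```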
--- spacy/_ml.py | 13 +++++++------ spacy/language.py | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 5cccabac1..60a0bbee0 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -279,18 +279,19 @@ class PrecomputableAffine(Model): break -def link_vectors_to_models(vocab): +def link_vectors_to_models(vocab, skip_rank=False): vectors = vocab.vectors if vectors.name is None: vectors.name = VECTORS_KEY if vectors.data.size != 0: warnings.warn(Warnings.W020.format(shape=vectors.data.shape)) ops = Model.ops - for word in vocab: - if word.orth in vectors.key2row: - word.rank = vectors.key2row[word.orth] - else: - word.rank = util.OOV_RANK + if not skip_rank: + for word in vocab: + if word.orth in vectors.key2row: + word.rank = vectors.key2row[word.orth] + else: + word.rank = util.OOV_RANK data = ops.asarray(vectors.data) # Set an entry here, so that vectors are accessed by StaticVectors # (unideal, I know) diff --git a/spacy/language.py b/spacy/language.py index e89f80f08..f23776def 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1072,7 +1072,7 @@ def _fix_pretrained_vectors_name(nlp): else: raise ValueError(Errors.E092) if nlp.vocab.vectors.size != 0: - link_vectors_to_models(nlp.vocab) + link_vectors_to_models(nlp.vocab, skip_rank=True) for name, proc in nlp.pipeline: if not hasattr(proc, "cfg"): continue From 72a25c9cef5c69316517650850b2ad7c04b63e01 Mon Sep 17 00:00:00 2001 From: Ilkyu Ju Date: Sun, 17 May 2020 20:43:34 +0900 Subject: [PATCH 101/131] Very minor issues in Korean example sentences (#5446) * Add contributor agreement * Improve ko translation of example sentences I fixed unnatural translations and word spacing errors. * Update osori.md --- .github/contributors/osori.md | 106 ++++++++++++++++++++++++++++++++++ spacy/lang/ko/examples.py | 6 +- 2 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 .github/contributors/osori.md diff --git a/.github/contributors/osori.md b/.github/contributors/osori.md new file mode 100644 index 000000000..93b5c7dd4 --- /dev/null +++ b/.github/contributors/osori.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ilkyu Ju | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-05-17 | +| GitHub username | osori | +| Website (optional) | | diff --git a/spacy/lang/ko/examples.py b/spacy/lang/ko/examples.py index 7885ad801..0306e5db8 100644 --- a/spacy/lang/ko/examples.py +++ b/spacy/lang/ko/examples.py @@ -9,8 +9,8 @@ Example sentences to test spaCy and its language models. """ sentences = [ - "애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.", - "자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.", - "자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.", + "애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.", + "자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다", + "샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.", "런던은 영국의 수도이자 가장 큰 도시입니다.", ] From a5cd2032843b26fbff9d6e0b53637e9477af3f7f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 19 May 2020 15:59:14 +0200 Subject: [PATCH 102/131] Reduce stored lexemes data, move feats to lookups (#5238) * Reduce stored lexemes data, move feats to lookups * Move non-derivable lexemes features (`norm / cluster / prob`) to `spacy-lookups-data` as lookups * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in lookups only * Remove serialization of lexemes data as `vocab/lexemes.bin` * Remove `SerializedLexemeC` * Remove `Lexeme.to_bytes/from_bytes` * Modify normalization exception loading: * Always create `Vocab.lookups` table `lexeme_norm` for normalization exceptions * Load base exceptions from `lang.norm_exceptions`, but load language-specific exceptions from lookups * Set `lex_attr_getter[NORM]` including new lookups table in `BaseDefaults.create_vocab()` and when deserializing `Vocab` * Remove all cached lexemes when deserializing vocab to override existing normalizations with the new normalizations (as a replacement for the previous step that replaced all lexemes data with the deserialized data) * Skip English normalization test Skip English normalization test because the data is now in `spacy-lookups-data`. * Remove norm exceptions Moved to spacy-lookups-data. * Move norm exceptions test to spacy-lookups-data * Load extra lookups from spacy-lookups-data lazily Load extra lookups (currently for cluster and prob) lazily from the entry point `lg_extra` as `Vocab.lookups_extra`. * Skip creating lexeme cache on load To improve model loading times, do not create the full lexeme cache when loading. The lexemes will be created on demand when processing. * Identify numeric values in Lexeme.set_attrs() With the removal of a special case for `PROB`, also identify `float` to avoid trying to convert it with the `StringStore`. * Skip lexeme cache init in from_bytes * Unskip and update lookups tests for python3.6+ * Update vocab pickle to include lookups_extra * Update vocab serialization tests Check strings rather than lexemes since lexemes aren't initialized automatically, account for addition of "_SP". * Re-skip lookups test because of python3.5 * Skip PROB/float values in Lexeme.set_attrs * Convert is_oov from lexeme flag to lex in vectors Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether the lexeme has a vector. 
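A short sketch of the user-facing effect (not part of the patch; the blank English pipeline and the strings "gonna" and "banana" are arbitrary illustrations):

    import numpy
    import spacy

    nlp = spacy.blank("en")

    # Norms now live in the vocab's lookups (table "lexeme_norm") as well
    # as in the LexemeC struct, so they serialize with the lookups instead
    # of a separate vocab/lexemes.bin.
    lex = nlp.vocab["gonna"]
    lex.norm_ = "going to"
    assert nlp.vocab["gonna"].norm_ == "going to"

    # is_oov is no longer a stored flag: it reports whether the lexeme has
    # a vector, so with no vectors loaded every word is out-of-vocabulary.
    assert nlp.vocab["banana"].is_oov
    nlp.vocab.set_vector("banana", numpy.zeros((4,), dtype="f"))
    assert not nlp.vocab["banana"].is_oov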
Co-authored-by: Matthew Honnibal --- spacy/attrs.pxd | 2 +- spacy/attrs.pyx | 2 +- spacy/cli/init_model.py | 7 +- spacy/cli/train.py | 10 - spacy/lang/da/__init__.py | 8 +- spacy/lang/da/norm_exceptions.py | 527 ---- spacy/lang/de/__init__.py | 9 +- spacy/lang/de/norm_exceptions.py | 16 - spacy/lang/el/__init__.py | 9 +- spacy/lang/el/norm_exceptions.py | 2642 ----------------- spacy/lang/en/__init__.py | 9 +- spacy/lang/en/norm_exceptions.py | 1768 ----------- spacy/lang/id/__init__.py | 9 +- spacy/lang/id/norm_exceptions.py | 532 ---- spacy/lang/lb/__init__.py | 9 +- spacy/lang/lb/norm_exceptions.py | 16 - spacy/lang/lex_attrs.py | 15 - spacy/lang/pt/__init__.py | 9 +- spacy/lang/pt/norm_exceptions.py | 23 - spacy/lang/ru/__init__.py | 9 +- spacy/lang/ru/norm_exceptions.py | 36 - spacy/lang/sr/__init__.py | 9 +- spacy/lang/sr/norm_exceptions.py | 26 - spacy/lang/ta/norm_exceptions.py | 139 - spacy/lang/th/__init__.py | 9 +- spacy/lang/th/norm_exceptions.py | 113 - spacy/language.py | 6 +- spacy/lexeme.pxd | 24 +- spacy/lexeme.pyx | 84 +- spacy/lookups.py | 8 +- spacy/structs.pxd | 23 - spacy/symbols.pxd | 2 +- spacy/symbols.pyx | 2 +- spacy/tests/lang/da/test_exceptions.py | 8 - spacy/tests/lang/de/test_exceptions.py | 14 - spacy/tests/lang/en/test_exceptions.py | 1 + spacy/tests/lang/lb/test_exceptions.py | 6 - .../serialize/test_serialize_vocab_strings.py | 24 +- spacy/tests/test_lemmatizer.py | 2 +- spacy/tests/vocab_vectors/test_lexeme.py | 13 - spacy/tests/vocab_vectors/test_lookups.py | 6 +- spacy/tests/vocab_vectors/test_vectors.py | 12 + spacy/tokens/token.pyx | 10 +- spacy/vocab.pxd | 1 + spacy/vocab.pyx | 134 +- 45 files changed, 161 insertions(+), 6182 deletions(-) delete mode 100644 spacy/lang/da/norm_exceptions.py delete mode 100644 spacy/lang/de/norm_exceptions.py delete mode 100644 spacy/lang/el/norm_exceptions.py delete mode 100644 spacy/lang/en/norm_exceptions.py delete mode 100644 spacy/lang/id/norm_exceptions.py delete mode 100644 spacy/lang/lb/norm_exceptions.py delete mode 100644 spacy/lang/pt/norm_exceptions.py delete mode 100644 spacy/lang/ru/norm_exceptions.py delete mode 100644 spacy/lang/sr/norm_exceptions.py delete mode 100644 spacy/lang/ta/norm_exceptions.py delete mode 100644 spacy/lang/th/norm_exceptions.py diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 8f583b3a3..805dc2950 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -15,7 +15,7 @@ cdef enum attr_id_t: LIKE_NUM LIKE_EMAIL IS_STOP - IS_OOV + IS_OOV_DEPRECATED IS_BRACKET IS_QUOTE IS_LEFT_PUNCT diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 2187f3c65..fe9895d06 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -16,7 +16,7 @@ IDS = { "LIKE_NUM": LIKE_NUM, "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, - "IS_OOV": IS_OOV, + "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED, "IS_BRACKET": IS_BRACKET, "IS_QUOTE": IS_QUOTE, "IS_LEFT_PUNCT": IS_LEFT_PUNCT, diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 618266633..3311a5120 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -157,15 +157,11 @@ def create_model(lang, lex_attrs, name=None): nlp = lang_class() for lexeme in nlp.vocab: lexeme.rank = OOV_RANK - lex_added = 0 for attrs in lex_attrs: if "settings" in attrs: continue lexeme = nlp.vocab[attrs["orth"]] lexeme.set_attrs(**attrs) - lexeme.is_oov = False - lex_added += 1 - lex_added += 1 if len(nlp.vocab): oov_prob = min(lex.prob for lex in nlp.vocab) - 1 else: @@ -193,8 +189,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): if 
vector_keys is not None: for word in vector_keys: if word not in nlp.vocab: - lexeme = nlp.vocab[word] - lexeme.is_oov = False + nlp.vocab[word] if vectors_data is not None: nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) if name is None: diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6e6423131..7cb2d9745 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -15,7 +15,6 @@ import random from .._ml import create_default_optimizer from ..util import use_gpu as set_gpu -from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import GoldCorpus from ..compat import path2str from .. import util @@ -630,15 +629,6 @@ def _create_progress_bar(total): def _load_vectors(nlp, vectors): util.load_model(vectors, vocab=nlp.vocab) - for lex in nlp.vocab: - values = {} - for attr, func in nlp.vocab.lex_attr_getters.items(): - # These attrs are expected to be set by data. Others should - # be set by calling the language functions. - if attr not in (CLUSTER, PROB, IS_OOV, LANG): - values[lex.vocab.strings[attr]] = func(lex.orth_) - lex.set_attrs(**values) - lex.is_oov = False def _load_pretrained_tok2vec(nlp, loc): diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index ac8c04954..92eec44b2 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -12,17 +11,14 @@ from ..tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class DanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "da" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) morph_rules = MORPH_RULES infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/da/norm_exceptions.py b/spacy/lang/da/norm_exceptions.py deleted file mode 100644 index dbffdb88b..000000000 --- a/spacy/lang/da/norm_exceptions.py +++ /dev/null @@ -1,527 +0,0 @@ -# coding: utf8 -""" -Special-case rules for normalizing tokens to improve the model's predictions. -For example 'mysterium' vs 'mysterie' and similar. 
-""" -from __future__ import unicode_literals - - -# Sources: -# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/ -# 2: http://www.tjerry-korrektur.dk/ord-med-flere-stavemaader/ - -_exc = { - # Alternative spelling - "a-kraft-værk": "a-kraftværk", # 1 - "ålborg": "aalborg", # 2 - "århus": "aarhus", - "accessoirer": "accessoires", # 1 - "affektert": "affekteret", # 1 - "afrikander": "afrikaaner", # 1 - "aftabuere": "aftabuisere", # 1 - "aftabuering": "aftabuisering", # 1 - "akvarium": "akvarie", # 1 - "alenefader": "alenefar", # 1 - "alenemoder": "alenemor", # 1 - "alkoholambulatorium": "alkoholambulatorie", # 1 - "ambulatorium": "ambulatorie", # 1 - "ananassene": "ananasserne", # 2 - "anførelsestegn": "anførselstegn", # 1 - "anseelig": "anselig", # 2 - "antioxydant": "antioxidant", # 1 - "artrig": "artsrig", # 1 - "auditorium": "auditorie", # 1 - "avocado": "avokado", # 2 - "bagerst": "bagest", # 2 - "bagstræv": "bagstræb", # 1 - "bagstræver": "bagstræber", # 1 - "bagstræverisk": "bagstræberisk", # 1 - "balde": "balle", # 2 - "barselorlov": "barselsorlov", # 1 - "barselvikar": "barselsvikar", # 1 - "baskien": "baskerlandet", # 1 - "bayrisk": "bayersk", # 1 - "bedstefader": "bedstefar", # 1 - "bedstemoder": "bedstemor", # 1 - "behefte": "behæfte", # 1 - "beheftelse": "behæftelse", # 1 - "bidragydende": "bidragsydende", # 1 - "bidragyder": "bidragsyder", # 1 - "billiondel": "billiontedel", # 1 - "blaseret": "blasert", # 1 - "bleskifte": "bleskift", # 1 - "blodbroder": "blodsbroder", # 2 - "blyantspidser": "blyantsspidser", # 2 - "boligministerium": "boligministerie", # 1 - "borhul": "borehul", # 1 - "broder": "bror", # 2 - "buldog": "bulldog", # 2 - "bådhus": "bådehus", # 1 - "børnepleje": "barnepleje", # 1 - "børneseng": "barneseng", # 1 - "børnestol": "barnestol", # 1 - "cairo": "kairo", # 1 - "cambodia": "cambodja", # 1 - "cambodianer": "cambodjaner", # 1 - "cambodiansk": "cambodjansk", # 1 - "camouflage": "kamuflage", # 2 - "campylobacter": "kampylobakter", # 1 - "centeret": "centret", # 2 - "chefskahyt": "chefkahyt", # 1 - "chefspost": "chefpost", # 1 - "chefssekretær": "chefsekretær", # 1 - "chefsstol": "chefstol", # 1 - "cirkulærskrivelse": "cirkulæreskrivelse", # 1 - "cognacsglas": "cognacglas", # 1 - "columnist": "kolumnist", # 1 - "cricket": "kricket", # 2 - "dagplejemoder": "dagplejemor", # 1 - "damaskesdug": "damaskdug", # 1 - "damp-barn": "dampbarn", # 1 - "delfinarium": "delfinarie", # 1 - "dentallaboratorium": "dentallaboratorie", # 1 - "diaramme": "diasramme", # 1 - "diaré": "diarré", # 1 - "dioxyd": "dioxid", # 1 - "dommedagsprædiken": "dommedagspræken", # 1 - "donut": "doughnut", # 2 - "driftmæssig": "driftsmæssig", # 1 - "driftsikker": "driftssikker", # 1 - "driftsikring": "driftssikring", # 1 - "drikkejogurt": "drikkeyoghurt", # 1 - "drivein": "drive-in", # 1 - "driveinbiograf": "drive-in-biograf", # 1 - "drøvel": "drøbel", # 1 - "dødskriterium": "dødskriterie", # 1 - "e-mail-adresse": "e-mailadresse", # 1 - "e-post-adresse": "e-postadresse", # 1 - "egypten": "ægypten", # 2 - "ekskommunicere": "ekskommunikere", # 1 - "eksperimentarium": "eksperimentarie", # 1 - "elsass": "Alsace", # 1 - "elsasser": "alsacer", # 1 - "elsassisk": "alsacisk", # 1 - "elvetal": "ellevetal", # 1 - "elvetiden": "ellevetiden", # 1 - "elveårig": "elleveårig", # 1 - "elveårs": "elleveårs", # 1 - "elveårsbarn": "elleveårsbarn", # 1 - "elvte": "ellevte", # 1 - "elvtedel": "ellevtedel", # 1 - "energiministerium": "energiministerie", 
# 1 - "erhvervsministerium": "erhvervsministerie", # 1 - "espaliere": "spaliere", # 2 - "evangelium": "evangelie", # 1 - "fagministerium": "fagministerie", # 1 - "fakse": "faxe", # 1 - "fangstkvota": "fangstkvote", # 1 - "fader": "far", # 2 - "farbroder": "farbror", # 1 - "farfader": "farfar", # 1 - "farmoder": "farmor", # 1 - "federal": "føderal", # 1 - "federalisering": "føderalisering", # 1 - "federalisme": "føderalisme", # 1 - "federalist": "føderalist", # 1 - "federalistisk": "føderalistisk", # 1 - "federation": "føderation", # 1 - "federativ": "føderativ", # 1 - "fejlbeheftet": "fejlbehæftet", # 1 - "femetagers": "femetages", # 2 - "femhundredekroneseddel": "femhundredkroneseddel", # 2 - "filmpremiere": "filmpræmiere", # 2 - "finansimperium": "finansimperie", # 1 - "finansministerium": "finansministerie", # 1 - "firehjulstræk": "firhjulstræk", # 2 - "fjernstudium": "fjernstudie", # 1 - "formalier": "formalia", # 1 - "formandsskift": "formandsskifte", # 1 - "fornemst": "fornemmest", # 2 - "fornuftparti": "fornuftsparti", # 1 - "fornuftstridig": "fornuftsstridig", # 1 - "fornuftvæsen": "fornuftsvæsen", # 1 - "fornuftægteskab": "fornuftsægteskab", # 1 - "forretningsministerium": "forretningsministerie", # 1 - "forskningsministerium": "forskningsministerie", # 1 - "forstudium": "forstudie", # 1 - "forsvarsministerium": "forsvarsministerie", # 1 - "frilægge": "fritlægge", # 1 - "frilæggelse": "fritlæggelse", # 1 - "frilægning": "fritlægning", # 1 - "fristille": "fritstille", # 1 - "fristilling": "fritstilling", # 1 - "fuldttegnet": "fuldtegnet", # 1 - "fødestedskriterium": "fødestedskriterie", # 1 - "fødevareministerium": "fødevareministerie", # 1 - "følesløs": "følelsesløs", # 1 - "følgeligt": "følgelig", # 1 - "førne": "førn", # 1 - "gearskift": "gearskifte", # 2 - "gladeligt": "gladelig", # 1 - "glosehefte": "glosehæfte", # 1 - "glædeløs": "glædesløs", # 1 - "gonoré": "gonorré", # 1 - "grangiveligt": "grangivelig", # 1 - "grundliggende": "grundlæggende", # 2 - "grønsag": "grøntsag", # 2 - "gudbenådet": "gudsbenådet", # 1 - "gudfader": "gudfar", # 1 - "gudmoder": "gudmor", # 1 - "gulvmop": "gulvmoppe", # 1 - "gymnasium": "gymnasie", # 1 - "hackning": "hacking", # 1 - "halvbroder": "halvbror", # 1 - "halvelvetiden": "halvellevetiden", # 1 - "handelsgymnasium": "handelsgymnasie", # 1 - "hefte": "hæfte", # 1 - "hefteklamme": "hæfteklamme", # 1 - "heftelse": "hæftelse", # 1 - "heftemaskine": "hæftemaskine", # 1 - "heftepistol": "hæftepistol", # 1 - "hefteplaster": "hæfteplaster", # 1 - "heftestraf": "hæftestraf", # 1 - "heftning": "hæftning", # 1 - "helbroder": "helbror", # 1 - "hjemmeklasse": "hjemklasse", # 1 - "hjulspin": "hjulspind", # 1 - "huggevåben": "hugvåben", # 1 - "hulmurisolering": "hulmursisolering", # 1 - "hurtiggående": "hurtigtgående", # 2 - "hurtigttørrende": "hurtigtørrende", # 2 - "husmoder": "husmor", # 1 - "hydroxyd": "hydroxid", # 1 - "håndmikser": "håndmixer", # 1 - "højtaler": "højttaler", # 2 - "hønemoder": "hønemor", # 1 - "ide": "idé", # 2 - "imperium": "imperie", # 1 - "imponerthed": "imponerethed", # 1 - "inbox": "indboks", # 2 - "indenrigsministerium": "indenrigsministerie", # 1 - "indhefte": "indhæfte", # 1 - "indheftning": "indhæftning", # 1 - "indicium": "indicie", # 1 - "indkassere": "inkassere", # 2 - "iota": "jota", # 1 - "jobskift": "jobskifte", # 1 - "jogurt": "yoghurt", # 1 - "jukeboks": "jukebox", # 1 - "justitsministerium": "justitsministerie", # 1 - "kalorifere": "kalorifer", # 1 - "kandidatstipendium": "kandidatstipendie", # 1 - "kannevas": 
"kanvas", # 1 - "kaperssauce": "kaperssovs", # 1 - "kigge": "kikke", # 2 - "kirkeministerium": "kirkeministerie", # 1 - "klapmydse": "klapmyds", # 1 - "klimakterium": "klimakterie", # 1 - "klogeligt": "klogelig", # 1 - "knivblad": "knivsblad", # 1 - "kollegaer": "kolleger", # 2 - "kollegium": "kollegie", # 1 - "kollegiehefte": "kollegiehæfte", # 1 - "kollokviumx": "kollokvium", # 1 - "kommissorium": "kommissorie", # 1 - "kompendium": "kompendie", # 1 - "komplicerthed": "komplicerethed", # 1 - "konfederation": "konføderation", # 1 - "konfedereret": "konfødereret", # 1 - "konferensstudium": "konferensstudie", # 1 - "konservatorium": "konservatorie", # 1 - "konsulere": "konsultere", # 1 - "kradsbørstig": "krasbørstig", # 2 - "kravsspecifikation": "kravspecifikation", # 1 - "krematorium": "krematorie", # 1 - "krep": "crepe", # 1 - "krepnylon": "crepenylon", # 1 - "kreppapir": "crepepapir", # 1 - "kricket": "cricket", # 2 - "kriterium": "kriterie", # 1 - "kroat": "kroater", # 2 - "kroki": "croquis", # 1 - "kronprinsepar": "kronprinspar", # 2 - "kropdoven": "kropsdoven", # 1 - "kroplus": "kropslus", # 1 - "krøllefedt": "krølfedt", # 1 - "kulturministerium": "kulturministerie", # 1 - "kuponhefte": "kuponhæfte", # 1 - "kvota": "kvote", # 1 - "kvotaordning": "kvoteordning", # 1 - "laboratorium": "laboratorie", # 1 - "laksfarve": "laksefarve", # 1 - "laksfarvet": "laksefarvet", # 1 - "laksrød": "lakserød", # 1 - "laksyngel": "lakseyngel", # 1 - "laksørred": "lakseørred", # 1 - "landbrugsministerium": "landbrugsministerie", # 1 - "landskampstemning": "landskampsstemning", # 1 - "langust": "languster", # 1 - "lappegrejer": "lappegrej", # 1 - "lavløn": "lavtløn", # 1 - "lillebroder": "lillebror", # 1 - "linear": "lineær", # 1 - "loftlampe": "loftslampe", # 2 - "log-in": "login", # 1 - "login": "log-in", # 2 - "lovmedholdig": "lovmedholdelig", # 1 - "ludder": "luder", # 2 - "lysholder": "lyseholder", # 1 - "lægeskifte": "lægeskift", # 1 - "lærvillig": "lærevillig", # 1 - "løgsauce": "løgsovs", # 1 - "madmoder": "madmor", # 1 - "majonæse": "mayonnaise", # 1 - "mareridtagtig": "mareridtsagtig", # 1 - "margen": "margin", # 2 - "martyrium": "martyrie", # 1 - "mellemstatlig": "mellemstatslig", # 1 - "menneskene": "menneskerne", # 2 - "metropolis": "metropol", # 1 - "miks": "mix", # 1 - "mikse": "mixe", # 1 - "miksepult": "mixerpult", # 1 - "mikser": "mixer", # 1 - "mikserpult": "mixerpult", # 1 - "mikslån": "mixlån", # 1 - "miksning": "mixning", # 1 - "miljøministerium": "miljøministerie", # 1 - "milliarddel": "milliardtedel", # 1 - "milliondel": "milliontedel", # 1 - "ministerium": "ministerie", # 1 - "mop": "moppe", # 1 - "moder": "mor", # 2 - "moratorium": "moratorie", # 1 - "morbroder": "morbror", # 1 - "morfader": "morfar", # 1 - "mormoder": "mormor", # 1 - "musikkonservatorium": "musikkonservatorie", # 1 - "muslingskal": "muslingeskal", # 1 - "mysterium": "mysterie", # 1 - "naturalieydelse": "naturalydelse", # 1 - "naturalieøkonomi": "naturaløkonomi", # 1 - "navnebroder": "navnebror", # 1 - "nerium": "nerie", # 1 - "nådeløs": "nådesløs", # 1 - "nærforestående": "nærtforestående", # 1 - "nærstående": "nærtstående", # 1 - "observatorium": "observatorie", # 1 - "oldefader": "oldefar", # 1 - "oldemoder": "oldemor", # 1 - "opgraduere": "opgradere", # 1 - "opgraduering": "opgradering", # 1 - "oratorium": "oratorie", # 1 - "overbookning": "overbooking", # 1 - "overpræsidium": "overpræsidie", # 1 - "overstatlig": "overstatslig", # 1 - "oxyd": "oxid", # 1 - "oxydere": "oxidere", # 1 - "oxydering": "oxidering", 
# 1 - "pakkenellike": "pakkenelliker", # 1 - "papirtynd": "papirstynd", # 1 - "pastoralseminarium": "pastoralseminarie", # 1 - "peanutsene": "peanuttene", # 2 - "penalhus": "pennalhus", # 2 - "pensakrav": "pensumkrav", # 1 - "pepperoni": "peperoni", # 1 - "peruaner": "peruvianer", # 1 - "petrole": "petrol", # 1 - "piltast": "piletast", # 1 - "piltaste": "piletast", # 1 - "planetarium": "planetarie", # 1 - "plasteret": "plastret", # 2 - "plastic": "plastik", # 2 - "play-off-kamp": "playoffkamp", # 1 - "plejefader": "plejefar", # 1 - "plejemoder": "plejemor", # 1 - "podium": "podie", # 2 - "praha": "prag", # 2 - "preciøs": "pretiøs", # 2 - "privilegium": "privilegie", # 1 - "progredere": "progrediere", # 1 - "præsidium": "præsidie", # 1 - "psykodelisk": "psykedelisk", # 1 - "pudsegrejer": "pudsegrej", # 1 - "referensgruppe": "referencegruppe", # 1 - "referensramme": "referenceramme", # 1 - "refugium": "refugie", # 1 - "registeret": "registret", # 2 - "remedium": "remedie", # 1 - "remiks": "remix", # 1 - "reservert": "reserveret", # 1 - "ressortministerium": "ressortministerie", # 1 - "ressource": "resurse", # 2 - "resætte": "resette", # 1 - "rettelig": "retteligt", # 1 - "rettetaste": "rettetast", # 1 - "returtaste": "returtast", # 1 - "risici": "risikoer", # 2 - "roll-on": "rollon", # 1 - "rollehefte": "rollehæfte", # 1 - "rostbøf": "roastbeef", # 1 - "rygsæksturist": "rygsækturist", # 1 - "rødstjært": "rødstjert", # 1 - "saddel": "sadel", # 2 - "samaritan": "samaritaner", # 2 - "sanatorium": "sanatorie", # 1 - "sauce": "sovs", # 1 - "scanning": "skanning", # 2 - "sceneskifte": "sceneskift", # 1 - "scilla": "skilla", # 1 - "sejflydende": "sejtflydende", # 1 - "selvstudium": "selvstudie", # 1 - "seminarium": "seminarie", # 1 - "sennepssauce": "sennepssovs ", # 1 - "servitutbeheftet": "servitutbehæftet", # 1 - "sit-in": "sitin", # 1 - "skatteministerium": "skatteministerie", # 1 - "skifer": "skiffer", # 2 - "skyldsfølelse": "skyldfølelse", # 1 - "skysauce": "skysovs", # 1 - "sladdertaske": "sladretaske", # 2 - "sladdervorn": "sladrevorn", # 2 - "slagsbroder": "slagsbror", # 1 - "slettetaste": "slettetast", # 1 - "smørsauce": "smørsovs", # 1 - "snitsel": "schnitzel", # 1 - "snobbeeffekt": "snobeffekt", # 2 - "socialministerium": "socialministerie", # 1 - "solarium": "solarie", # 1 - "soldebroder": "soldebror", # 1 - "spagetti": "spaghetti", # 1 - "spagettistrop": "spaghettistrop", # 1 - "spagettiwestern": "spaghettiwestern", # 1 - "spin-off": "spinoff", # 1 - "spinnefiskeri": "spindefiskeri", # 1 - "spolorm": "spoleorm", # 1 - "sproglaboratorium": "sproglaboratorie", # 1 - "spækbræt": "spækkebræt", # 2 - "stand-in": "standin", # 1 - "stand-up-comedy": "standupcomedy", # 1 - "stand-up-komiker": "standupkomiker", # 1 - "statsministerium": "statsministerie", # 1 - "stedbroder": "stedbror", # 1 - "stedfader": "stedfar", # 1 - "stedmoder": "stedmor", # 1 - "stilehefte": "stilehæfte", # 1 - "stipendium": "stipendie", # 1 - "stjært": "stjert", # 1 - "stjærthage": "stjerthage", # 1 - "storebroder": "storebror", # 1 - "stortå": "storetå", # 1 - "strabads": "strabadser", # 1 - "strømlinjet": "strømlinet", # 1 - "studium": "studie", # 1 - "stænkelap": "stænklap", # 1 - "sundhedsministerium": "sundhedsministerie", # 1 - "suppositorium": "suppositorie", # 1 - "svejts": "schweiz", # 1 - "svejtser": "schweizer", # 1 - "svejtserfranc": "schweizerfranc", # 1 - "svejtserost": "schweizerost", # 1 - "svejtsisk": "schweizisk", # 1 - "svigerfader": "svigerfar", # 1 - "svigermoder": "svigermor", # 1 - "svirebroder": 
"svirebror", # 1 - "symposium": "symposie", # 1 - "sælarium": "sælarie", # 1 - "søreme": "sørme", # 2 - "søterritorium": "søterritorie", # 1 - "t-bone-steak": "t-bonesteak", # 1 - "tabgivende": "tabsgivende", # 1 - "tabuere": "tabuisere", # 1 - "tabuering": "tabuisering", # 1 - "tackle": "takle", # 2 - "tackling": "takling", # 2 - "taifun": "tyfon", # 1 - "take-off": "takeoff", # 1 - "taknemlig": "taknemmelig", # 2 - "talehørelærer": "tale-høre-lærer", # 1 - "talehøreundervisning": "tale-høre-undervisning", # 1 - "tandstik": "tandstikker", # 1 - "tao": "dao", # 1 - "taoisme": "daoisme", # 1 - "taoist": "daoist", # 1 - "taoistisk": "daoistisk", # 1 - "taverne": "taverna", # 1 - "teateret": "teatret", # 2 - "tekno": "techno", # 1 - "temposkifte": "temposkift", # 1 - "terrarium": "terrarie", # 1 - "territorium": "territorie", # 1 - "tesis": "tese", # 1 - "tidsstudium": "tidsstudie", # 1 - "tipoldefader": "tipoldefar", # 1 - "tipoldemoder": "tipoldemor", # 1 - "tomatsauce": "tomatsovs", # 1 - "tonart": "toneart", # 1 - "trafikministerium": "trafikministerie", # 1 - "tredve": "tredive", # 1 - "tredver": "trediver", # 1 - "tredveårig": "trediveårig", # 1 - "tredveårs": "trediveårs", # 1 - "tredveårsfødselsdag": "trediveårsfødselsdag", # 1 - "tredvte": "tredivte", # 1 - "tredvtedel": "tredivtedel", # 1 - "troldunge": "troldeunge", # 1 - "trommestikke": "trommestik", # 1 - "trubadur": "troubadour", # 2 - "trøstepræmie": "trøstpræmie", # 2 - "tummerum": "trummerum", # 1 - "tumultuarisk": "tumultarisk", # 1 - "tunghørighed": "tunghørhed", # 1 - "tus": "tusch", # 2 - "tusind": "tusinde", # 2 - "tvillingbroder": "tvillingebror", # 1 - "tvillingbror": "tvillingebror", # 1 - "tvillingebroder": "tvillingebror", # 1 - "ubeheftet": "ubehæftet", # 1 - "udenrigsministerium": "udenrigsministerie", # 1 - "udhulning": "udhuling", # 1 - "udslaggivende": "udslagsgivende", # 1 - "udspekulert": "udspekuleret", # 1 - "udviklingsministerium": "udviklingsministerie", # 1 - "uforpligtigende": "uforpligtende", # 1 - "uheldvarslende": "uheldsvarslende", # 1 - "uimponerthed": "uimponerethed", # 1 - "undervisningsministerium": "undervisningsministerie", # 1 - "unægtelig": "unægteligt", # 1 - "urinale": "urinal", # 1 - "uvederheftig": "uvederhæftig", # 1 - "vabel": "vable", # 2 - "vadi": "wadi", # 1 - "vaklevorn": "vakkelvorn", # 1 - "vanadin": "vanadium", # 1 - "vaselin": "vaseline", # 1 - "vederheftig": "vederhæftig", # 1 - "vedhefte": "vedhæfte", # 1 - "velar": "velær", # 1 - "videndeling": "vidensdeling", # 2 - "vinkelanførelsestegn": "vinkelanførselstegn", # 1 - "vipstjært": "vipstjert", # 1 - "vismut": "bismut", # 1 - "visvas": "vissevasse", # 1 - "voksværk": "vokseværk", # 1 - "værtdyr": "værtsdyr", # 1 - "værtplante": "værtsplante", # 1 - "wienersnitsel": "wienerschnitzel", # 1 - "yderliggående": "yderligtgående", # 2 - "zombi": "zombie", # 1 - "ægbakke": "æggebakke", # 1 - "ægformet": "æggeformet", # 1 - "ægleder": "æggeleder", # 1 - "ækvilibrist": "ekvilibrist", # 2 - "æselsøre": "æseløre", # 1 - "øjehule": "øjenhule", # 1 - "øjelåg": "øjenlåg", # 1 - "øjeåbner": "øjenåbner", # 1 - "økonomiministerium": "økonomiministerie", # 1 - "ørenring": "ørering", # 2 - "øvehefte": "øvehæfte", # 1 -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index dee1841c8..ca01428ba 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -2,7 +2,6 @@ from 
__future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP @@ -10,18 +9,14 @@ from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class GermanDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: "de" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/de/norm_exceptions.py b/spacy/lang/de/norm_exceptions.py deleted file mode 100644 index 3dbd4c7e3..000000000 --- a/spacy/lang/de/norm_exceptions.py +++ /dev/null @@ -1,16 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# Here we only want to include the absolute most common words. Otherwise, -# this list would get impossibly long for German – especially considering the -# old vs. new spelling rules, and all possible cases. - - -_exc = {"daß": "dass"} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 6d551cc4e..d03a42da9 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -10,21 +10,16 @@ from .lemmatizer import GreekLemmatizer from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lookups import Lookups -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class GreekDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "el" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP diff --git a/spacy/lang/el/norm_exceptions.py b/spacy/lang/el/norm_exceptions.py deleted file mode 100644 index d4384ff3c..000000000 --- a/spacy/lang/el/norm_exceptions.py +++ /dev/null @@ -1,2642 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -# These exceptions are used to add NORM values based on a token's ORTH value. -# Norms are only set if no alternative is provided in the tokenizer exceptions. 
- -_exc = { - "αγιορίτης": "αγιορείτης", - "αγόρι": "αγώρι", - "έωλος": "αίολος", - "αλλοίθωρος": "αλλήθωρος", - "αλλοιώς": "αλλιώς", - "αλλοιώτικος": "αλλκότικος", - "αναµιγνύω": "αναµειγνύω", - "ανάµιξη": "ανάµειξη", - "ανανδρεία": "ανανδρία", - "αναφιλυτό": "αναφιλητό", - "ανελλειπώς": "ανελλιπώς", - "ανεξιθρησκεία": "ανεξιθρησκία", - "αντικρυνός": "αντικρινός", - "απάγκιο": "απάγκεω", - "αρµατωλός": "αρµατολός", - "αρρώστεια": "αρρώστια", - "ατόφιος": "ατόφυος", - "αφίνω": "αφήνω", - "χιβάδα": "χηβάδα", - "αχρηστεία": "αχρηστία", - "βαρυγκωµώ": "βαρυγγωµώ", - "βεβαρυµένος": "βεβαρηµένος", - "βερύκοκκο": "βερίκοκο", - "βλήτο": "βλίτο", - "βογκώ": "βογγώ", - "βραδυά": "βραδιά", - "βραδυάζει": "βραδίάζει", - "Βρεταννία": "Βρετανία", - "Βρεττανία": "Βρετανία", - "βολοδέρνω": "βωλοδέρνω", - "γέλοιο": "γέλιο", - "γκάµα": "γκάµµα", - "γλύφω": "γλείφω", - "γλήνα": "γλίνα", - "διαφήµηση": "διαφήµιση", - "δικλείδα": "δικλίδα", - "διοξείδιο": "διοξίδιο", - "διορία": "διωρία", - "δυόροφος": "διώροφος", - "δυόµισυ": "δυόµισι", - "διόσµος": "δυόσμος", - "δυσφήμιση": "δυσφήµηση", - "δοσίλογος": "δωσίλογος", - "εγχείριση": "εγχείρηση", - "ειδωλολατρεία": "ειδωλολατρία", - "εληά": "ελιά", - "ελιξίριο": "ελιξήριο", - "έλκυθρο": "έλκηθρο", - "ελλειπής": "ελλίπής", - "ενάµισυς": "ενάµισης", - "ενάµισυ": "ενάµισι", - "ενανθρώπιση": "ενανθρώπηση", - "έξη": "έξι", - "επί τούτο": "επί τούτω", - "εταιρία": "εταιρεία", - "εφορεία": "εφορία", - "ζηλειάρης": "ζηλιάρης", - "Θεοφάνεια": "Θεοφάνια", - "καυγάς": "καβγάς", - "καθίκι": "καθοίκι", - "καινούριος": "καινούργιος", - "κακάβι": "κακκάβι", - "κακαβιά": "κακκαβιά", - "καµµία": "καµία", - "κανέλα": "Καννέλα", - "κανονιοφόρος": "κανονιοφόρος", - "καντίλι": "καντήλι", - "κατεβοδώνω": "κατευοδώνω", - "κοίτοµαι": "κείτοµαι", - "κελαϊδώ": "κελαηδώ", - "κυάλια": "κιάλια", - "κλύδωνας": "κλήδονας", - "κλωτσώ": "κλοτσώ", - "κολλιτσίδα": "κολλητσίδα", - "κουκί": "κουκκί", - "κουλός": "κουλλός", - "κρεββάτι": "κρεβάτι", - "κροκόδειλος": "κροκόδιλος", - "κοβιός": "κωβιός", - "λάκισα": "λάκησα", - "λιµέρι": "ληµέρι", - "λώξυγγας": "λόξυγγας", - "µαγγούρα": "µαγκούρα", - "µαζή": "μαζί", - "µακρυά": "µακριά", - "µαµή": "µαµµή", - "µαµόθρεφτος": "µαµµόθρεφτος", - "µίγµα": "µείγµα", - "µίξη": "µείξη", - "µετώπη": "µετόπη", - "µυρολόι": "µοιρολόι", - "µοτοσικλέτα": "µοτοσυκλέτα", - "µπαλωµατής": "µπαλλωµατής", - "µιζίθρα": "µυζήθρα", - "νεοτερίζω": "νεωτερίζω", - "νεοτερισµός": "νεωτερισμός", - "νεοτεριστής": "νεωτεριστής", - "νινί": "νηνί", - "νοιώθω": "νιώθω", - "νονός": "νοννός", - "ξενιτιά": "ξενιτειά", - "ξαίρω": "ξέρω", - "ξίγκι": "ξίγγι", - "ξείδι": "ξίδι", - "ξώβεργα": "ξόβεργα", - "ξιπάζω": "ξυπάζω", - "ξιπασµένος": "ξυπασµένος", - "ξυπόλητος": "ξυπόλυτος", - "ξωκλήσι": "ξωκκλήσι", - "οξυά": "οξιά", - "ορθοπεδικός": "ορθοπαιδικός", - "ωχ": "οχ", - "παπάς": "παππάς", - "παραγιός": "παραγυιός", - "περηφάνεια": "περηφάνια", - "πιλάλα": "πηλάλα", - "πίννα": "πίνα", - "πηρούνι": "πιρούνι", - "πιτσιλώ": "πιτσυλώ", - "πιτσιλίζω": "πιτσυλίζω", - "πλατυάζω": "πλατειάζω", - "πληµµυρίδα": "πληµυρίδα", - "πληγούρι": "πλιγούρι", - "πωπώ": "ποπό", - "πουγγί": "πουγκί", - "πρίγκηπας": "πρίγκιπας", - "προάστειο": "προάστιο", - "προεδρεία": "προεδρία", - "πρίµα": "πράµα", - "πρωτήτερα": "πρωτύτερα", - "προτύτερα": "πρωτύτερα", - "πόρωση": "πώρωση", - "ρεβύθι": "ρεβίθι", - "ρέγγα": "ρέΥκα", - "ρηγώνω": "ριγώνω", - "ρωµανικός": "ροµανικός", - "ρίζι": "ρύζι", - "Ρώσσος": "Ρώσος", - "σακκούλα": "σακούλα", - "συνάφι": "σινάφι", - "σειρίτι": "σιρίτι", - "σιφόνι": "σιφώνι", - "συχαίνοµαι": 
"σιχαίνοµαι", - "σκιρόδεµα": "σκυρόδεµα", - "σπάγγος": "σπάγκος", - "στυλιάρι": "στειλιάρι", - "στοιβάδα": "στιβάδα", - "στίβα": "στοίβα", - "στριµώνω": "στρυµώνω", - "στριμώχνω": "στρυμώχνω", - "συγχύζω": "συγχίζω", - "σηκώτι": "συκώτι", - "σιναγρίδα": "συναγρίδα", - "συνοδεία": "συνοδία", - "σίφιλη": "σύφιλη", - "τανιέµαι": "τανυέµαι", - "τανίζω": "τανύζω", - "τέσσερις": "τέσσερεις", - "τζιτζιφιά": "τζιτζυφιά", - "τόνος": "τόννος", - "τοπείο": "τοπίο", - "τρέλλα": "τρέλα", - "τσαγγάρης": "τσαγκάρης", - "τσανάκα": "τσαννάκα", - "τσανακογλείφτης": "τσαννακογλείφτης", - "τσιτώνω": "τσητώνω", - "τσιγκλώ": "τσυγκλώ", - "τσίµα": "τσύµα", - "υννί": "υνί", - "υπερηφάνια": "υπερηφάνεια", - "υπόχρεως": "υπόχρεος", - "φάκελλος": "φάκελος", - "φείδι": "φίδι", - "φιλονεικώ": "φιλονικώ", - "φιλονεικία": "φιλονικία", - "φυρί-φυρί": "φιρί-φιρί", - "φτιάνω": "φτειάχνω", - "φτιάχνω": "φτειάχνω", - "φτώχεια": "φτώχια", - "φυσαλίδα": "φυσαλλίδα", - "χάνος": "χάννος", - "χυνόπωρο": "χινόπωρο", - "χεινόπωρο": "χινόπωρο", - "χιµίζω": "χυµίζω", - "χιμίζω": "χυμιζώ", - "γκωλ": "γκολ", - "αιρκοντίσιον": "ερκοντίσιον", - "καρµπυρατέρ": "καρµπφατέρ", - "κυλόττα": "κιλότα", - "κλή ρινγκ": "κλίρινγκ", - "κωλγκέρλ": "κολγκέρλ", - "κοµπιναιζόν": "κοµπινεζόν", - "κοπυράιτ": "κοπιράιτ", - "µυλαίδη": "µιλέδη", - "µποϋκοτάζ": "µποϊκοτάζ", - "πέναλτυ": "πέναλτι", - "πορτραίτο": "πορτρέτο", - "ρεστωράν": "ρεστοράν", - "ροσµπήφ": "ροσµπίφ", - "σαντιγύ": "σαντιγί", - "στριπτήζ": "στριπτίζ", - "ταµπλώ": "ταµπλό", - "τζόκεϋ": "τζόκεϊ", - "φουτµπώλ": "φουτµπόλ", - "τρόλλεϋ": "τρόλεϊ", - "χίππυ": "χίπι", - "φέρρυ-µπωτ": "φεριµπότ", - "χειρούργος": "χειρουργός", - "αβαείο": "αββαείο", - "αβάς": "αββάς", - "αβάσκαµα": "βάσκαµα", - "αβασκανία": "βασκανία", - "αβάφτιστος": "αβάπτιστος", - "αβάφτιστη": "αβάπτιστη", - "αβάφτιστο": "αβάπτιστο", - "αβγίλα": "αβγουλίλα", - "αυτί": "αφτί", - "αβδέλλα": "βδέλλα", - "Αβράµ": "'Αβραάµ", - "αγγινάρα": "αγκινάρα", - "αγγόνα": "εγγονή", - "αγγόνι": "εγγόνι", - "αγγονός": "εγγονός", - "άγειρτος": "άγερτος", - "άγειρτη": "άγερτη", - "άγειρτο": "άγερτο", - "αγέρας": "αέρας", - "αγκλέουρας": "αγλέορας", - "αγκλίτοα": "γκλίτσα", - "Αγκόλα": "Ανγκόλα", - "αγκορά": "ανγκορά", - "αγκοστοίιρα": "ανγκοστούρα", - "άγνεστος": "άγνεθος", - "άγνεστη": "άγνεθη", - "άγνεστο": "άγνεθο", - "αγώρι": "αγόρι", - "αγωρίστικος": "αγορίστικος", - "αγωρίστικη": "αγορίστικη", - "αγωρίστικο": "αγορίστικο", - "αγωροκόριτσο": "αγοροκόριστο", - "αγουρόλαδο": "αγουρέλαιο", - "αγροικώ": "γροικώ", - "αδάµαντας": "αδάµας", - "αδερφή": "αδελφή", - "αδέρφι": "αδέλφι", - "αδερφικός": "αδελφικός", - "αδερφική": "αδελφική", - "αδερφικό": "αδελφικό", - "αδερφοποιτός": "αδελφοποιτός", - "αδερφός": "αδελφός", - "αδερφοσύνη": "αδελφοσύνη", - "αέρι": "αγέρι", - "αερόµπικ": "αεροβική", - "αεροστρόβιλος": "αεριοστρόβιλος", - "αητός": "αετός", - "αιµατοποσία": "αιµοποσία", - "άιντε": "άντε", - "αισθηµατισµός": "συναισθηµατισµός", - "αιτιακός": "αιτιώδης", - "αιτιακή": "αιτιώδης", - "αιτιακό": "αιτιώδες", - "ακατανόµαστος": "ακατονόµαστος", - "ακατανόμαστη": "ακατονόμαστη", - "ακατονόμαστο": "ακατανόμαστο", - "ακέραιος": "ακέριος", - "ακέραια": "ακέρια", - "ακέραιο": "ακέριο", - "άκρον": "άκρο", - "ακτύπητος": "αχτύπητος", - "ακτύπητη": "αχτύπητη", - "ακτύπητο": "αχτύπητο", - "ακυριολεκτώ": "ακυρολεκτώ", - "ακυριολεξία": "ακυρολεξία", - "αλάτι": "άλας", - "αλατένιος": "αλάτινος", - "αλατένια": "αλάτινη", - "αλατένιο": "αλάτινο", - "αλαφραίνω": "ελαφρώνω", - "αλαφριός": "ελαφρύς", - "αλαφριό": "ελαφρύ", - "αλαφρόµυαλος": "ελαφρόµυαλος", - 
"αλαφρόμυαλη": "ελαφρόμυαλη", - "αλαφρόμυαλο": "ελαφρόμυαλο", - "αλείβω": "αλείφω", - "άλευρο": "αλεύρι", - "αλησµονησιά": "λησµονιά", - "αλκολίκι": "αλκοολίκι", - "αλλέως": "αλλιώς", - "αλληλοεπίδραση": "αλληλεπίδραση", - "αλλήθωρος": "αλλοίθωρος", - "αλλήθωρη": "αλλοίθωρη", - "αλλήθωρο": "αλλοίθωρο", - "αλλοίµονο": "αλίµονο", - "αµνηστεία": "αµνηστία", - "αµπαρόριζα": "αρµπαρόριζα", - "αµπέχωνο": "αµπέχονο", - "αµυγδαλάτος": "αµυγδαλωτός", - "αμυγδαλάτη": "αμυγδαλωτή", - "αμυγδαλάτο": "αμυγδαλωτό", - "αµυγδαλόλαδο": "αµυγδαλέλαιο", - "αµφίλογος": "αµφιλεγόµενος", - "αμφίλογη": "αμφιλεγόμενη", - "αμφίλογο": "αμφιλεγόμενο", - "αναβατός": "ανεβατός", - "αναβατή": "ανεβατή", - "αναβατό": "ανεβατό", - "αναδεχτός": "αναδεκτός", - "αναθρέφω": "ανατρέφω", - "ανακατώνω": "ανακατεύω", - "ανακάτωση": "ανακάτεµα", - "αναλίσκω": "αναλώνω", - "αναμειγνύω": "αναμιγνύω", - "αναμείκτης": "αναμίκτης", - "ανάµεικτος": "ανάµικτος", - "ανάμεικτη": "ανάμικτη", - "ανάμεικτο": "ανάμικτο", - "αναπαµός": "ανάπαυση", - "αναπαρασταίνω": "αναπαριστάνω", - "ανάπρωρος": "ανάπλωρος", - "ανάπρωρη": "ανάπλωρη", - "ανάπρωρο": "ανάπλωρο", - "αναπτυγµένος": "ανεπτυγμένος", - "αναπτυγµένη": "ανεπτυγμένη", - "αναπτυγµένο": "ανεπτυγμένο", - "άναστρος": "ανάστερος", - "αναστυλώνω": "αναστηλώνω", - "αναστύλωση": "αναστήλωση", - "ανεγνωρισµένος": "αναγνωρισµένος", - "αναγνωρισμένη": "αναγνωρισµένη", - "αναγνωρισμένο": "αναγνωρισµένο", - "ανέµυαλος": "άμυαλος", - "ανέμυαλη": "άμυαλη", - "ανέμυαλο": "άμυαλο", - "ανεπάντεχος": "αναπάντεχος", - "ανεπάντεχη": "αναπάντεχη", - "ανεπάντεχο": "αναπάντεχο", - "ανεψιά": "ανιψιά", - "ανεψιός": "ανιψιός", - "ανήρ": "άνδρας", - "ανηφόρι": "ανήφορος", - "ανηψιά": "ανιψιά", - "ανηψιός": "ανιψιός", - "άνθιση": "άνθηση", - "ανταλλάζω": "ανταλλάσσω", - "ανταπεξέρχοµαι": "αντεπεξέρχοµαι", - "αντζούγια": "αντσούγια", - "αντιεισαγγελέας": "αντεισαγγελέας", - "αντικατασταίνω": "αντικαθιστώ", - "αντικρύζω": "αντικρίζω", - "αντιµολία": "αντιµωλία", - "αντιπροσωπεία": "αντιπροσωπία", - "αντισταµινικό": "αντιισταµινικός", - "αντίχτυπος": "αντίκτυπος", - "άντρας": "άνδρας", - "αντρόγυνο": "ανδρόγυνο", - "αντρώνω": "ανδρώνω", - "άξια": "άξιος", - "απακούµπι": "αποκούµπι", - "απαλάµη": "παλάµη", - "Απαλάχια": "Αππαλάχια", - "απάνω": "επάνω", - "απέδρασα": "αποδιδράσκω", - "απλούς": "απλός", - "απλούν": "απλό", - "απόγαιο": "απόγειο", - "αποδείχνω": "αποδεικνύω", - "αποθαµός": "πεθαµός", - "αποθανατίζω": "απαθανατίζω", - "αποκεντροποίηση": "αποκέντρωση", - "απολαυή": "απολαβή", - "αποξεραίνω": "αποξηραίνω", - "απόξυοη": "απόξεση", - "απόξω": "απέξω", - "απόσχω": "απέχω", - "αποτίω": "αποτίνω", - "αποτυχαίνω": "αποτυγχάνω", - "αποχαιρετίζω": "αποχαιρετώ", - "απόχτηµα": "απόκτηµα", - "απόχτηση": "απόκτηση", - "αποχτώ": "αποκτώ", - "Απρίλης": "Απρίλιος", - "αρκαντάσης": "καρντάσης", - "αρµάρι": "ερµάριο", - "άρµη": "άλµη", - "αρµοστεία": "αρµοστία", - "άρµπουρο": "άλµπουρο", - "αρµύρα": "αλµύρα", - "αρµυρίκι": "αλµυρίκι", - "άρρην": "άρρεν", - "αρσανάς": "ταρσανάς", - "αρτύνω": "αρταίνω", - "αρχινίζω": "αρχίζω", - "αρχινώ": "αρχίζω", - "αρχίτερα": "αρχύτερα", - "ασκηµάδα": "ασχήµια", - "ασκηµαίνω": "ασχηµαίνω", - "ασκήµια": "ασχήµια", - "ασκηµίζω": "ασχηµίζω", - "άσσος": "άσος", - "αστράπτω": "αστράφτω", - "αστράπτω": "αστράφτω", - "αταχτώ": "ατακτώ", - "ατσάλινος": "ατσαλένιος", - "ατσάλινη": "ατσαλένια", - "ατσάλινο": "ατσαλένιο", - "Ατσιγγάνος": "Τσιγγάνος", - "Ατσίγγανος": "Τσιγγάνος", - "αυγαταίνω": "αβγατίζω", - "αυγατίζω": "αβγατίζω", - "αυγό": "αβγό", - "αυγοειδής": "αυγοειδής", - "αυγοειδές": 
"αβγοειδές", - "αυγοθήκη": "αβγοθήκη", - "αυγοκόβω": "αβγοκόβω", - "αυγοτάραχο": "αβγοτάραχο", - "αύλακας": "αυλάκι", - "αυτί": "αφτί", - "αυτιάζοµαι": "αφτιάζοµαι", - "αφορεσµός": "αφορισµός", - "άφρονας": "άφρων", - "αχείλι": "χείλι", - "άχερο": "άχυρο", - "αχερώνας": "αχυρώνας", - "αχιβάδα": "αχηβάδα", - "αχτίδα": "ακτίνα", - "βαβουίνος": "µπαµπουίνος", - "Βαγγέλης": "Ευάγγελος", - "βαγγέλιο": "ευαγγέλιο", - "Βάγια": "Βάί'α", - "βαζιβουζούκος": "βασιβουζούκος", - "βαθύνω": "βαθαίνω", - "βάιο": "βάγιο", - "βακαλάος": "µπακαλιάρος", - "βαλάντιο": "βαλλάντιο", - "βαλαντώνω": "βαλλαντώνω", - "βάνω": "βάζω", - "βαρειά": "βαριά", - "βαριεστίζω": "βαργεστώ", - "βαριεστώ": "βαργεστώ", - "βαρώ": "βαράω", - "βαρώνος": "βαρόνος", - "βασιλέας": "βασιλιάς", - "βασµούλος": "γασµούλος", - "Βαυαρία": "Βαβαρία", - "Βαυαροκρατία": "Βαβαροκρατία", - "βαφτίζω": "βαπτίζω", - "βάφτιση": "βάπτιση", - "βάφτισµα": "βάπτισµα", - "βαφτιστής": "βαπτιστής", - "βαφτιστικός": "βαπτιστικός", - "βαφτιστική": "βαπτιστική", - "βαφτιστικιά": "βαπτιστική", - "βαφτιστικό": "βαπτιστικό", - "βδοµάδα": "εβδοµάδα", - "βεγόνια": "µπιγκόνια", - "βελανίδι": "βαλανίδι", - "βελανιδιά": "βαλανιδιά", - "βενζίνα": "βενζίνη", - "βεράτιο": "µπεράτι", - "βερόκοκο": "βερίκοκο", - "βιγόνια": "µπιγκόνια", - "βλάφτω": "βλάπτω", - "βλογιά": "ευλογιά", - "βλογάω": "ευλογώ", - "βογγίζω": "βογγώ", - "βόγγος": "βογγητό", - "βογκητό": "βογγητό", - "βοδάµαξα": "βοϊδάµαξα", - "βόλλεϋ": "βόλεϊ", - "βολοκοπώ": "βωλοκοπώ", - "βόλος": "βώλος", - "βουβάλι": "βούβαλος", - "βουή": "βοή", - "βούλα": "βούλλα", - "βούλωµα": "βούλλωµα", - "βουλώνω": "βουλλώνω", - "βουρβόλακας": "βρικόλακας", - "βουρκόλακας": "βρικόλακας", - "βους": "βόδι", - "βραδι": "βράδυ", - "βρυκόλακας": "βρικόλακας", - "βρώµα": "βρόµα", - "βρώµη": "βρόµη", - "βρωµιά": "βροµιά", - "βρωµίζω": "βροµίζω", - "βρώµιο": "βρόµιο", - "βρωµώ": "βροµώ", - "βωξίτης": "βοξίτης", - "γάβρος": "γαύρος", - "γαϊδάρα": "γαϊδούρα", - "γαίµα": "αίµα", - "γαλακτόπιτα": "γαλατόπιτα", - "γάµα": "γάµµα", - "γαµβρός": "γαµπρός", - "γαρίφαλο": "γαρύφαλλο", - "γαρούφαλλο": "γαρύφαλλο", - "γαυγίζω": "γαβγίζω", - "γελάδα": "αγελάδα", - "γελέκο": "γιλέκο", - "γένοµαι": "γίνοµαι", - "γενότυπος": "γονότυπος", - "Γένουα": "Γένοβα", - "γεράζω": "γερνώ", - "γέρακας": "γεράκι", - "γερατειά": "γηρατειά", - "γεροκοµείο": "γηροκοµείο", - "γεροκοµώ": "γηροκοµώ", - "Γεσθηµανή": "Γεθσηµανή", - "γεώδης": "γαιώδης", - "γαιώδες": "γαιώδες", - "γηρασµός": "γήρανση", - "Γιάννενα": "Ιωάννινα", - "Γιάννινα": "Ιωάννινα", - "γιάνω": "γιαίνω", - "γιαουρτλού": "γιογουρτλού", - "Γιαπωνέζος": "Ιαπωνέζος", - "γιγαντεύω": "γιγαντώνω", - "γιεγιές": "γεγές", - "Γιεν": "γεν", - "γιέσµαν": "γέσµαν", - "γιόκας": "γυιόκας", - "γιορτασµός": "εορτασµός", - "γιος": "γυιος", - "Γιούλης": "Ιούλιος", - "Γιούνης": "Ιούνιος", - "γιοφύρι": "γεφύρι", - "Γιώργος": "Γεώργιος", - "γιωτ": "γιοτ", - "γιωτακισµός": "ιωτακισµός", - "γκάγκστερ": "γκάνγκστερ", - "γκαγκστερισµός": "γκανγκστερισµός", - "γκαµήλα": "καµήλα", - "γκεµπελίσκος": "γκαιµπελίσκος", - "γκιουβέτσι": "γιουβέτσι", - "γκιώνης": "γκιόνης", - "γκλοµπ": "κλοµπ", - "γκογκ": "γκονγκ", - "Γκιόνα": "Γκιώνα", - "γκόρφι": "γκόλφι", - "γκρα": "γκρας", - "Γκράβαρα": "Κράβαρα", - "γκυ": "γκι", - "γλαϋξ": "γλαύκα", - "γλιτώνω": "γλυτώνω", - "γλύκισµα": "γλύκυσµα", - "γλυστρώ": "γλιστρώ", - "γλωσσίδα": "γλωττίδα", - "γνέφαλλο": "γνάφαλλο", - "γνοιάζοµαι": "νοιάζοµαι", - "γόµα": "γόµµα", - "γόνα": "γόνατο", - "γονιός": "γονέας", - "γόπα": "γώπα", - "γούµενος": "ηγούµενος", - "γουµένισσα": "ηγουµένη", - "γουώκµαν": 
"γουόκµαν", - "γραία": "γριά", - "Γράµος": "Γράµµος", - "γρασίδι": "γρασσίδι", - "γρεγολεβάντες": "γραιγολεβάντες", - "γρέγος": "γραίγος", - "γρικώ": "γροικώ", - "Γροιλανδία": "Γροιλανδία", - "γρίνια": "γκρίνια", - "γροθοκοπώ": "γρονθοκοπώ", - "γρούµπος": "γρόµπος", - "γυαλοπωλείο": "υαλοπωλείο", - "γυρνώ": "γυρίζω", - "γόρωθε": "γύροθε", - "γωβιός": "κωβιός", - "δάγκάµα": "δάγκωµα", - "δαγκαµατιά": "δαγκωµατιά", - "δαγκανιά": "δαγκωνιά", - "δαιµονοπληξία": "δαιµονιόπληκτος", - "δαίµων": "δαίµονας", - "δακτυλήθρα": "δαχτυλήθρα", - "δακτυλίδι": "δαχτυλίδι", - "∆αυίδ": "∆αβίδ", - "δαχτυλογραφία": "δακτυλογραφία", - "δαχτυλογράφος": "δακτυλογράφος", - "δεικνύω": "δείχνω", - "δείλι": "δειλινό", - "δείχτης": "δείκτης", - "δελής": "ντελής", - "δενδρογαλή": "δεντρογαλιά", - "δεντρολίβανο": "δενδρολίβανο", - "δεντροστοιχία": "δενδροστοιχία", - "δεντροφυτεία": "δενδροφυτεία", - "δεντροφυτεύω": "δενδροφυτεύω", - "δεντρόφυτος": "δενδρόφυτος", - "δεξής": "δεξιό", - "δερµατώδης": "δερµατοειδής", - "δερματώδες": "δερµατοειδές", - "δέσποτας": "δεσπότης", - "δεφτέρι": "τεφτέρι", - "διαβατάρης": "διαβάτης", - "διάβηκα": "διαβαίνω", - "διαβιβρώσκω": "διαβρώνω", - "διαθρέψω": "διατρέφω", - "διακόνεµα": "διακονιά", - "διάολος": "διάβολος", - "∆ιαµαντής": "Αδαµάντιος", - "διαολιά": "διαβολιά", - "διαολογυναίκα": "διαβολογυναίκα", - "διαολοθήλυκο": "διαβολοθήλυκο", - "διαολόκαιρος": "διαβολόκαιρος", - "διαολοκόριτσο": "διαβολοκόριτσο", - "διαολόπαιδο": "διαβολόπαιδο", - "διάολος": "διάβολος", - "διασκελιά": "δρασκελιά", - "διαχύνω": "διαχέω", - "δίδω": "δίνω", - "δίκηο": "δίκιο", - "δοβλέτι": "ντοβλέτι", - "δοσίλογος": "δωσίλογος", - "δράχνω": "αδράχνω", - "δρέπανο": "δρεπάνι", - "δρόσος": "δροσιά", - "δώνω": "δίνω", - "εγγίζω": "αγγίζω", - "εδώθε": "δώθε", - "εδωνά": "εδωδά", - "εικοσάρι": "εικοσάρικο", - "εικών": "εικόνα", - "εισαγάγω": "εισάγω", - "εισήγαγα": "εισάγω", - "εισήχθην": "εισάγω", - "έκαμα": "έκανα", - "εκατόν": "εκατό", - "εκατοστάρης": "κατοστάρης", - "εκατοστάρι": "κατοστάρι", - "εκατοστάρικο": "κατοστάρικο", - "εκλαίρ": "εκλέρ", - "Ελδοράδο": "Ελντοράντο", - "ελευθεροτεκτονισµός": "τεκτονισµός", - "ελευτεριά": "ελευθερία", - "Ελεφαντοστού Ακτή": "Ακτή Ελεφαντοστού", - "ελληνικάδικο": "ελληνάδικο", - "Ελπίδα": "Ελπίς", - "εµορφιά": "οµορφιά", - "εµορφάδα": "οµορφιά", - "έµπορας": "έµπορος", - "εµώ": "εξεµώ", - "ένδεκα": "έντεκα", - "ενενήκοντα": "ενενήντα", - "ενωρίς": "νωρίς", - "εξανέστην": "εξανίσταµαι", - "εξήκοντα": "εξήντα", - "έξις": "έξη", - "εξωκκλήσι": "ξωκκλήσι", - "εξωµερίτης": "ξωµερίτης", - "επανωφόρι": "πανωφόρι", - "επιµειξία": "επιµιξία", - "επίστοµα": "απίστοµα", - "επτάζυµο": "εφτάζυµο", - "επταήµερος": "εφταηµερος", - "επταθέσιος": "εφταθέσιος", - "επταµελής": "εφταµελης", - "επταµηνία": "εφταµηνία", - "επταµηνίτικος": "εφταµηνίτικος", - "επταπλασιάζω": "εφταπλασιάζω", - "επταπλάσιος": "εφταπλάσιος", - "επτασύλλαβος": "εφτασύλλαβος", - "επτατάξιος": "εφτατάξιος", - "επτάτοµος": "εφτάτοµος", - "επτάφυλλος": "εφτάφυλλος", - "επτάχρονα": "εφτάχρονα", - "επτάχρονος": "εφτάχρονος", - "επταψήφιος": "εφταψήφιος", - "επτάωρος": "εφτάωρος", - "επταώροφος": "εφταώροφος", - "έργον": "έργο", - "ευκή": "ευχή", - "ευρό": "ευρώ", - "ευσπλαχνίζοµαι": "σπλαχνίζοµαι", - "εφεντης": "αφέντης", - "εφηµεριακός": "εφηµέριος", - "εφημεριακή": "εφηµέρια", - "εφημεριακό": "εφηµέριο", - "εφτά": "επτά", - "εφταετία": "επταετία", - "εφτακόσια": "επτακόσια", - "εφτακόσιοι": "επτακόσιοι", - "εφτακοσιοστός": "επτακοσιοστός", - "εχθές": "χθες", - "ζάπι": "ζάφτι", - "ζαχαριάζω": "ζαχαρώνω", - 
"ζαχαροµύκητας": "σακχαροµύκητας", - "ζεµανφού": "ζαµανφού", - "ζεµανφουτισµός": "ζαµανφουτισµός", - "ζέστα": "ζέστη", - "ζεύλα": "ζεύγλα", - "Ζηλανδία": "Νέα Ζηλανδία", - "ζήλεια": "ζήλια", - "ζιµπούλι": "ζουµπούλι", - "ζο": "ζώο", - "ζουρλαµάρα": "ζούρλα", - "ζωοφόρος": "ζωφόρος", - "ηλεκτροκόλληση": "ηλεκτροσυγκόλληση", - "ηλεκτροοπτική": "ηλεκτροπτική", - "ήλιο": "ήλιον", - "ηµιόροφος": "ηµιώροφος", - "θαλάµι": "θαλάµη", - "θάµα": "θαύµα", - "θαµπώνω": "θαµβώνω", - "θάµπος": "θάµβος", - "θάφτω": "θάβω", - "θεοψία": "θεοπτία", - "θέσει": "θέση", - "θηλειά": "θηλιά", - "Θόδωρος": "Θεόδωρος", - "θρύβω": "θρύπτω", - "θυµούµαι": "θυµάµαι", - "Ιαµάϊκή": "Τζαµάικα", - "ιατρεύω": "γιατρεύω", - "ιατρός": "γιατρός", - "ιατροσόφιο": "γιατροσόφι", - "I.Q.": "αϊ-κιού", - "ινατι": "γινάτι", - "ιονίζω": "ιοντίζω", - "ιονιστής": "ιοντιστής", - "ιονόσφαιρα": "ιοντόσφαιρα", - "Ιούλης": "Ιούλιος", - "ίσασµα": "ίσιωµα", - "ισιάζω": "ισιώνω", - "ίσκιος": "ήσκιος", - "ισκιώνω": "ησκιώνω", - "ίσωµα": "ίσιωµα", - "ισώνω": "ισιώνω", - "ιχθύαση": "ιχθύωση", - "ιώτα": "γιώτα", - "καββαλισµός": "καβαλισµός", - "κάβουρος": "κάβουρας", - "καδής": "κατής", - "καδρίλια": "καντρίλια", - "Καζακστάν": "Καζαχστάν", - "καθέκλα": "καρέκλα", - "κάθησα": "κάθισα", - "[1766]. καθίκι": "καθοίκι", - "καΐλα": "καήλα", - "καϊξής": "καϊκτσής", - "καλδέρα": "καλντέρα", - "καλεντάρι": "καλαντάρι", - "καλήν εσπέρα": "καλησπέρα", - "καλιά": "καλειά", - "καλιακούδα": "καλοιακούδα", - "κάλλια": "κάλλιο", - "καλλιά": "κάλλιο", - "καλόγηρος": "καλόγερος", - "καλόρχεται": "καλοέρχεται", - "καλσόν": "καλτσόν", - "καλυµµαύκι": "καµιλαύκι", - "καλύµπρα": "καλίµπρα", - "καλωσύνη": "καλοσύνη", - "καµαρωτός": "καµαρότος", - "καµηλαύκι": "καµιλαύκι", - "καµτσίκι": "καµουτσίκι", - "καναβάτσο": "κανναβάτσο", - "κανακίζω": "κανακεύω", - "κανάτα": "καννάτα", - "κανατάς": "καννατάς", - "κανάτι": "καννάτι", - "κανελής": "καννελής", - "κανελιά": "καννελή", - "κανελί": "καννελή", - "κανελονι": "καννελόνι", - "κανελλόνι": "καννελόνι", - "κανένας": "κανείς", - "κάνη": "κάννη", - "κανί": "καννί", - "κάνναβης": "κάνναβις", - "καννιβαλισµός": "κανιβαλισµός", - "καννίβαλος": "κανίβαλος", - "κανοκιάλι": "καννοκιάλι", - "κανόνι": "καννόνι", - "κανονιά": "καννονιά", - "κανονίδι": "καννονίδι", - "κανονιέρης": "καννονιέρης", - "κανονιοβολητής": "καννονιοβολητής", - "κανονιοβολισµός": "καννονιοβολισµός", - "κανονιοβολώ": "καννονιοβολώ", - "κανονιοστάσιο": "καννονιοστάσιο", - "κανονιοστοιχία": "καννονιοστοιχία", - "κανονοθυρίδα": "καννονοθυρίδα", - "κάνουλα": "κάννουλα", - "κανών": "κανόνας", - "κάπα": "κάππα", - "κάπαρη": "κάππαρη", - "καπαρντίνα": "καµπαρντίνα", - "καραβόσκοινο": "καραβόσχοινο", - "καρένα": "καρίνα", - "κάρκάδο": "κάκαδο", - "καροτίνη": "καρωτίνη", - "καρότο": "καρώτο", - "καροτόζουµο": "καρωτόζουµο", - "καροτοσαλάτα": "καρωτοσαλάτα", - "καρπούµαι": "καρπώνοµαι", - "καρρώ": "καρό", - "κάρυ": "κάρι", - "καρυοφύλλι": "καριοφίλι", - "καταΐφι": "κανταΐφι", - "κατακάθηµαι": "κατακάθοµαι", - "κατάντια": "κατάντηµα", - "κατασκοπεία": "κατασκοπία", - "καταφτάνω": "καταφθάνω", - "καταχράσθηκα": "καταχράστηκα", - "κατάχτηση": "κατάκτηση", - "καταχτητής": "κατακτητής", - "καταχτώ": "κατακτώ", - "καταχωρώ": "καταχωρίζω", - "κατέβαλα": "καταβάλλω", - "Κατερίνα": "Αικατερίνη", - "κατοστίζω": "εκατοστίζω", - "κάτου": "κάτω", - "κατρουλιό": "κατουρλιό", - "καυναδίζω": "καβγαδίζω", - "καϋµός": "καηµός", - "'κεί": "εκεί", - "κείθε": "εκείθε", - "καψόνι": "καψώνι", - "καψύλλιο": "καψούλι", - "κελάρης": "κελλάρης", - "κελί": "κελλί", - "κεντήτρια": "κεντήστρα", - 
"κεσέµι": "γκεσέµι", - "κέσιο": "καίσιο", - "κηπάριο": "κήπος", - "κινάρα": "αγκινάρα", - "κιοφτές": "κεφτές", - "κλαίγω": "κλαίω", - "κλαπάτσα": "χλαπάτσα", - "κλασσικίζω": "κλασικίζω", - "κλασσικιστής": "κλασικιστής", - "κλέπτης": "κλέφτης", - "κληθρα": "σκλήθρα", - "κλήρινγκ": "κλίρινγκ", - "κλιπ": "βιντεοκλίπ", - "κλωσά": "κλώσσα", - "κλωτσιά": "κλοτσιά", - "κογκλάβιο": "κονκλάβιο", - "κογκρέσο": "κονγκρέσο", - "κοιµίσης": "κοίµησης", - "κοιµούµαι": "κοιµάµαι", - "κοιτώ": "κοιτάζω", - "κοιτάω": "κοιτάζω", - "κόκαλο": "κόκκαλο", - "κοκίτης": "κοκκύτης", - "κοκκίαση": "κοκκίωση", - "κοκκοφοίνικας": "κοκοφοίνικας", - "κολάζ": "κολλάζ", - "κολαντρίζω": "κουλαντρίζω", - "κολαρίζω": "κολλαρίζω", - "κολεχτίβα": "κολεκτίβα", - "κολεχτιβισµός": "κολεκτιβισµός", - "κολιγιά": "κολληγιά", - "κολίγος": "κολλήγας", - "κολίγας": "κολλήγας", - "κολικόπονος": "κωλικόπονος", - "κολιός": "κολοιός", - "κολιτσίνα": "κολτσίνα", - "κολυµπήθρα": "κολυµβήθρα", - "κολώνα": "κολόνα", - "κολώνια": "κολόνια", - "κοµβόι": "κονβόι", - "κόµις": "κόµης", - "κόµισσα": "κόµης", - "κόµιτας": "κόµης", - "κοµιτεία": "κοµητεία", - "κόµµατα": "κοµµάτι", - "κοµµούνα": "κοµούνα", - "κοµµουναλισµός": "κοµουναλισµός", - "κοµµούνι": "κοµούνι", - "κοµµουνίζω": "κοµουνίζω", - "κοµµουνισµός": "κοµουνισµός", - "κοµµουνιστής": "κοµουνιστής", - "κονδυλοειδής": "κονδυλώδης", - "κονδυλοειδές": "κονδυλώδες", - "κονσέρτο": "κοντσέρτο", - "κόντραµπαντιέρης": "κοντραµπατζής", - "κοντσίνα": "κολτσίνα", - "κονφορµισµός": "κοµφορµισµός", - "κονφορµιστής": "κομφορμιστής", - "κοπελιά": "κοπέλα", - "κοπλιµέντο": "κοµπλιµέντο", - "κόπτω": "κόβω", - "κόπυραιτ": "κοπιράιτ", - "Κοριτσα": "Κορυτσά", - "κοριτσόπουλο": "κορίτσι", - "κορνέτο": "κορνέτα", - "κορνιζώνω": "κορνιζάρω", - "κορόιδεµα": "κοροϊδία", - "κορόνα": "κορώνα", - "κορφή": "κορυφή", - "κοσάρι": "εικοσάρικο", - "κοσάρικο": "εικοσάρικο", - "κοσµετολογία": "κοσµητολογία", - "κοτάω": "κοτώ", - "κουβαρνταλίκι": "χουβαρνταλίκι", - "κουβαρντάς": "χουβαρντάς", - "κουβερνάντα": "γκουβερνάντα", - "κούκος": "κούκκος", - "κουλλουρτζής": "κουλλουράς", - "κουλούρας": "κουλλουράς", - "κουλούρι": "κουλλούρι", - "κουλουριάζω": "κουλλουριάζω", - "κουλουρτζής": "κουλλουράς", - "κουρδιστής": "χορδιστής", - "κουρντιστής": "χορδιστής", - "κουρντίζω": "κουρδίζω", - "κουρντιστήρι": "κουρδιστήρι", - "κουστούµι": "κοστούµι", - "κουτεπιέ": "κουντεπιέ", - "κόφτης": "κόπτης", - "κόχη": "κόγχη", - "κοψοχείλης": "κοψαχείλης", - "κρεµάζω": "κρεµώ", - "κροντήρι": "κρωντήρι", - "κροµµύδι": "κρεµµύδι", - "κροµµυδίλα": "κρεµµυδίλα", - "κρουσταλλιάζω": "κρυσταλλιάζω", - "κτένα": "χτένα", - "κτενάκι": "χτενάκι", - "κτένι": "χτένι", - "κτενίζω": "χτενίζω", - "κτένισµα": "χτένισµα", - "κτίριο": "κτήριο", - "κυλίω": "κυλώ", - "κυττάζω": "κοιτάζω", - "κωλ-γκέρλ": "κολ-γκέρλ", - "κωλοµπαράς": "κολοµπαράς", - "κωσταντινάτο": "κωνσταντινάτο", - "Κώστας": "Κωνσταντίνος", - "κώχη": "κόγχη", - "λάβδα": "λάµβδα", - "λαγούτο": "λαούτο", - "λαγύνι": "λαγήνι", - "λαίδη": "λέδη", - "λαϊκάντζα": "λαϊκούρα", - "λαιµά": "λαιµός", - "λαΐνι": "λαγήνι", - "λαµπράδα": "λαµπρότητα", - "λάρος": "γλάρος", - "λατόµι": "λατοµείο", - "λαύδανο": "λάβδανο", - "λαυράκι": "λαβράκι", - "λαφίνα": "ελαφίνα", - "λαφόπουλο": "ελαφόπουλο", - "λειβάδι": "λιβάδι", - "Λειβαδιά": "Λιβάδια", - "λεϊµόνι": "λεµόνι", - "λεϊµονιά": "λεµονιά", - "Λειψία": "Λιψία", - "λέοντας": "λέων", - "λεπτά": "λεφτά", - "λεπτύνω": "λεπταίνω", - "λευκαστής": "λευκαντής", - "Λευτέρης": "Ελευθέριος", - "λευτερώνω": "ελευθερώνω", - "λέω": "λέγω", - "λιανεµπόριο": "λειανεµπόριο", - 
"λιανίζω": "λειανίζω", - "λιανοτούφεκο": "λειανοτούφεκο", - "λιανοντούφεκο": "λειανοντούφεκο", - "λιανοπούληµα": "λειανοπούληµα", - "λιανοπωλητής": "λειανοπωλητής", - "λιανοτράγουδο": "λειανοτράγουδο", - "λιγοψυχία": "ολιγοψυχία", - "λιθρίνι": "λυθρίνι", - "λιµένας": "λιµάνι", - "λίµπρα": "λίβρα", - "λιοβολιά": "ηλιοβολία", - "λιόδεντρο": "ελαιόδεντρο", - "λιόλαδο": "ελαιόλαδο", - "λιόσπορος": "ηλιόσπορος", - "λιοτρίβειο": "ελαιοτριβείο", - "λιοτρόπι": "ηλιοτρόπιο", - "λιόφως": "ηλιόφως", - "λιχουδιά": "λειχουδιά", - "λιώνω": "λειώνω", - "λογιωτατίζω": "λογιοτατίζω", - "λογιώτατος": "λογιότατος", - "λόγκος": "λόγγος", - "λόξιγκας": "λόξυγγας", - "λοτόµος": "υλοτόµος", - "Λουµπλιάνα": "Λιουµπλιάνα", - "λούω": "λούζω", - "λύγξ": "λύγκας", - "λυµφατισµός": "λεµφατισµός", - "λυντσάρω": "λιντσάρω", - "λυσσιακό": "λυσσακό", - "λυώνω": "λειώνω", - "Λωξάντρα": "Λοξάντρα", - "λωρένσιο": "λορένσιο", - "λωρίδα": "λουρίδα", - "µαγγάνιο": "µαγκάνιο", - "µαγγιώρος": "µαγκιόρος", - "µαγειριά": "µαγεριά", - "µάγειρος": "µάγειρας", - "µόγερας": "µάγειρας", - "µαγιώ": "µαγιό", - "µαγκανοπήγαδο": "µαγγανοπήγαδο", - "µαγκώνω": "µαγγώνω", - "µαγνόλια": "µανόλια", - "Μαγυάρος": "Μαγιάρος", - "µαζύ": "µαζί", - "µαζώνω": "µαζεύω", - "µαιζονέτα": "µεζονέτα", - "µαιτρ": "µετρ", - "µαιτρέσα": "µετρέσα", - "µακριός": "µακρύς", - "μακριά": "µακρυά", - "μακριό": "µακρύ", - "µαλάσσω": "µαλάζω", - "µαµά": "µαµµά", - "µαµouδι": "µαµούνι", - "µάνα": "µάννα", - "µανδαρινέα": "µανταρινιά", - "µανδήλι": "µαντήλι", - "µάνδρα": "µάντρα", - "µανές": "αµανές", - "Μανόλης": "Εµµανουήλ", - "µαντζούνι": "µατζούνι", - "µαντζουράνα": "µατζουράνα", - "µαντίλα": "µαντήλα", - "µαντίλι": "µαντήλι", - "µαντµαζέλ": "µαµαζέλ", - "µαντρίζω": "µαντρώνω", - "µαντώ": "µαντό", - "Μανώλης": "Εµµανουήλ", - "µάρτυς": "µάρτυρας", - "µασκάλη": "µασχάλη", - "µατοκυλίζω": "αιµατοκυλίζω", - "µατοκύλισµα": "αιµατοκυλίζω", - "µατσέτα": "µασέτα", - "µαυράδα": "µαυρίλα", - "μεγαλόπολη": "µεγαλούπολη", - "µεγαλοσπληνία": "σπληνοµεγαλία", - "µέγγενη": "µέγκενη", - "μείκτης": "µίκτης", - "µελίγγι": "µηλίγγι", - "µεντελισµός": "µενδελισµός", - "µενχίρ": "µενίρ", - "µέρα": "ηµέρα", - "µεράδι": "µοιράδι", - "µερεύω": "ηµερεύω", - "µέρµηγκας": "µυρµήγκι", - "µερµήγκι": "µυρµήγκι", - "µερσίνα": "µυρσίνη", - "µερσίνη": "µυρσίνη", - "µέρωµα": "ηµερώνω", - "µερώνω": "ηµερώνω", - "µέσον": "µέσο", - "µεσοούρανα": "µεσούρανα", - "µεταλίκι": "µεταλλίκι", - "µεταπούληση": "µεταπώληση", - "µεταπουλω": "µεταπωλώ", - "µετοχιάριος": "µετοχάρης", - "µητάτο": "µιτάτο", - "µητριά": "µητρυιά", - "µητριός": "µητρυιός", - "Μιανµάρ": "Μυανµάρ", - "Μίκι Μάους": "Μίκυ Μάους", - "µικρύνω": "µικραίνω", - "µινουέτο": "µενουέτο", - "µιξοπαρθένα": "µειξοπαρθένα", - "µισοφόρι": "µεσοφόρι", - "µίτζα": "µίζα", - "µολογώ": "οµολογώ", - "μολογάω": "οµολογώ", - "µοµία": "µούµια", - "µοµιοποίηση": "µουµιοποίηση", - "µονάρχιδος": "µόνορχις", - "µονιάζω": "µονοιάζω", - "µορφιά": "οµορφιά", - "µορφονιός": "οµορφονιός", - "µοσκάρι": "µοσχάρι", - "µοσκοβολιά": "µοσκοβολιά", - "µοσκοβολώ": "µοσχοβολώ", - "µοσκοκαρυδιά": "µοσχοκαρυδιά", - "µοσκοκάρυδο": "µοσχοκάρυδο", - "µοσκοκάρφι": "µοσχοκάρφι", - "µοσκολίβανο": "µοσχολίβανο", - "µοσκοµπίζελο": "µοσχοµπίζελο", - "µοσκοµυρίζω": "µοσχοµυρίζω", - "µοσκοπουλώ": "µοσχοπουλώ", - "µόσκος": "µόσχος", - "µοσκοσάπουνο": "µοσχοσάπουνο", - "µοσκοστάφυλο": "µοσχοστάφυλο", - "µόσχειος": "µοσχαρήσιος", - "μόσχειο": "µοσχαρήσιο", - "µουλώνω": "µουλαρώνω", - "µουρταδέλα": "µορταδέλα", - "µουσικάντης": "µουζικάντης", - "µουσσώνας": "µουσώνας", - "µουστάκα": "µουστάκι", - 
"µουστακοφόρος": "µυστακοφόρος", - "µπαγάζια": "µπαγκάζια", - "πάγκα": "µπάνκα", - "µπαγκαδορος": "µπανκαδόρος", - "µπογκέρης": "µπανκέρης", - "µπάγκος": "πάγκος", - "µπαιν-µαρί": "µπεν-µαρί", - "µπαλάντα": "µπαλλάντα", - "µπαλαντέζα": "µπαλλαντέζα", - "µπαλαντέρ": "µπαλλαντέρ", - "µπαλάντζα": "παλάντζα", - "µπαλένα": "µπαλαίνα", - "µπαλέτο": "µπαλλέτο", - "µπάλος": "µπάλλος", - "µπάλσαµο": "βάλσαµο", - "µπαλσάµωµα": "βαλσάµωµα", - "µπαλσαµώνω": "βαλσαµώνω", - "µπάλωµα": "µπάλλωµα", - "µπαλώνω": "µπαλλώνω", - "µπαµπάκι": "βαµβάκι", - "µπαµπακόσπορος": "βαµβακόσπορος", - "Μπάµπης": "Χαραλάµπης", - "µπάµπω": "βάβω", - "µπανέλα": "µπαναίλα", - "µπαρµπρίζ": "παρµπρίζ", - "µπατίστα": "βατίστα", - "µπαχτσές": "µπαξές", - "µπαχτσίσι": "µπαξίσι", - "µπεζεβέγκης": "πεζεβέγκης", - "µπελτές": "πελτές", - "µπεντόνι": "µπιντόνι", - "µπερδουκλώνω": "µπουρδουκλώνω", - "µπερκέτι": "µπερεκέτι", - "µπετόνι": "µπιτόνι", - "µπεχαβιορισµός": "µπιχεβιορισµός", - "µπεχλιβάνης": "πεχλιβάνης", - "µπιγκουτί": "µπικουτί", - "µπιµπίλα": "µπιρµπίλα", - "µπιµπλό": "µπιµπελό", - "µπιρσίµι": "µπρισίµι", - "µπις": "µπιζ", - "µπιστόλα": "πιστόλα", - "µπιστόλι": "πιστόλι", - "µπιστολιά": "πιστολιά", - "µπιτόνι": "µπιντόνι", - "µπογιάρος": "βογιάρος", - "µπονάτσα": "µπουνάτσα", - "µπονατσάρει": "µπουνατσάρει", - "µπουά": "µποά", - "µπουκαµβίλια": "βουκαµβίλια", - "µποϋκοταζ": "µποϊκοτάζ", - "µποϋκοτάρω": "µποϊκοτάρω", - "µπουλβάρ": "βουλεβάρτο", - "µπουρδέλο": "µπορντέλο", - "µπουρµπουάρ": "πουρµπουάρ", - "µπρίζα": "πρίζα", - "µπριτζόλα": "µπριζόλα", - "µπρος": "εµπρός", - "µπύρα": "µπίρα", - "µπυραρία": "µπιραρία", - "µπυροποσία": "µπιροποσία", - "µυγδαλιά": "αµυγδαλιά", - "µύγδαλο": "αµύγδαλο", - "µυλόρδος": "µιλόρδος", - "μυρουδιά": "µυρωδιά", - "µυτζήθρα": "µυζήθρα", - "µύωψ": "µύωπας", - "µώλος": "µόλος", - "νέθω": "γνέθω", - "νι": "νυ", - "νίκελ": "νικέλιο", - "νοµεύς": "νοµέας", - "νοστιµίζω": "νοστιµεύω", - "νουννός": "νοννός", - "νταβάνι": "ταβάνι", - "ντάβανος": "τάβανος", - "νταβανόσκουπα": "ταβανόσκουπα", - "νταβούλι": "νταούλι", - "νταλαβέρι": "νταραβέρι", - "νταµπλάς": "ταµπλάς", - "ντελαπάρω": "ντεραπάρω", - "ντενεκές": "τενεκές", - "ντερβεναγος": "δερβέναγας", - "ντερβένι": "δερβένι", - "ντερβίσης": "δερβίσης", - "ντερβισόπαιδο": "δερβισόπαιδο", - "ντοκυµανταίρ": "ντοκιµαντέρ", - "ντουνρού": "ντογρού", - "ντουζ": "ντους", - "ντουζιέρα": "ντουσιέρα", - "Ντούµα": "∆ούµα", - "ντούπλεξ": "ντούµπλεξ", - "ντουφέκι": "τουφέκι", - "ντουφεκίδι": "τουφεκίδι", - "ντουφεκίζω": "τουφεκίζω", - "ντουφεξής": "τουφεξής", - "νύκτα": "νύχτα", - "νυκτωδία": "νυχτωδία", - "νωµατάρχης": "ενωµοτάρχης", - "ξανεµίζω": "εξανεµίζω", - "ξεγνοιάζω": "ξενοιάζω", - "ξεγνοιασιά": "ξενοιασιά", - "ξελαφρώνω": "ξαλαφρώνω", - "ξεπίτηδες": "επίτηδες", - "ξεπιτούτου": "εξεπιτούτου", - "ξεσκάζω": "ξεσκάω", - "ξεσπάζω": "ξεσπώ", - "ξεσχίζω": "ξεσκίζω", - "ξέσχισµα": "ξεσκίζω", - "ξευτελίζω": "εξευτελίζω", - "ξεφτίζω": "ξεφτύζω", - "ξεφτίλα": "ξευτίλα", - "ξεφτίλας": "ξευτίλας", - "ξεφτιλίζω": "ξευτιλίζω", - "ξεχάνω": "ξεχνώ", - "ξηγώ": "εξηγώ", - "ξηροφαγία": "ξεροφαγία", - "ξηροφαγιά": "ξεροφαγία", - "ξι": "ξει", - "ξιπασιά": "ξυπασιά", - "ξίπασµα": "ξύπασµα", - "ξιπολησιά": "ξυπολυσιά", - "ξιπολιέµαι": "ξυπολιέµαι", - "εξοµολόγηση": "ξομολόγηση", - "ξοµολογητής": "εξοµολογητής", - "ξοµολόγος": "εξοµολόγος", - "ξοµολογώ": "εξοµολογώ", - "ξουράφι": "ξυράφι", - "ξουράφια": "ξυραφιά", - "ξόφληση": "εξόφληση", - "ξύγγι": "ξίγγι", - "ξύγκι": "ξίγγι", - "ξύδι": "ξίδι", - "ξυλοσκίστης": "ξυλοσχίστης", - "ξυλώνω": "ξηλώνω", - "ξυνωρίδα": "συνωρίδα", - 
"ξώθυρα": "εξώθυρα", - "ξώπορτα": "εξώπορτα", - "ξώφυλλο": "εξώφυλλο", - "οδοντογιατρός": "οδοντίατρος", - "οδοντόπονος": "πονόδοντος", - "οικογενειακά": "οικογενειακώς", - "οικοκυρά": "νοικοκυρά", - "οκτάς": "οκτάδα", - "οκταετής": "οχταετής", - "οκταετές": "οχταετές", - "οκταετία": "οχταετία", - "οµοιάζω": "µοιάζω", - "οµοιώνω": "εξοµοιώνω", - "οµόµετρο": "ωµόµετρο", - "οµορφάδα": "οµορφιά", - "οµπρός": "εµπρός", - "ονείρεµα": "όνειρο", - "οξείδιο": "οξίδιο", - "οξειδοαναγωγή": "οξιδοαναγωγή", - "οξειδώνω": "οξιδώνω", - "οξείδωση": "οξίδωση", - "οξειδωτής": "οξιδωτής", - "οξιζενέ": "οξυζενέ", - "οπίσω": "πίσω", - "οργιά": "οργυιά", - "όρνεο": "όρνιο", - "όρνις": "όρνιθα", - "ορρός": "ορός", - "όσµωση": "ώσµωση", - "οστεΐτιδα": "οστίτιδα", - "οστεογονία": "οστεογένεση", - "οφίτσιο": "οφίκιο", - "οφφίκιο": "οφίκιο", - "οχτάβα": "οκτάβα", - "οχτάδα": "οκτάδα", - "οχταετία": "οκταετία", - "οχτακόσια": "οκτακόσια", - "οχτακόσιοι": "οκτακόσιοι", - "οχτακόσιες": "οκτακόσιες", - "οχτακόσια": "οκτακόσια", - "όχτρητα": "έχθρητα", - "οχτώ": "οκτώ", - "Οχτώβρης": "Οκτώβριος", - "οψιανός": "οψιδιανός", - "παγαίνω": "πηγαίνω", - "παγόνι": "παγώνι", - "παιγνίδι": "παιχνίδι", - "παίδαρος": "παίδαρος", - "παίχτης": "παίκτης", - "παλικαράς": "παλληκαράς", - "παλικάρι": "παλληκάρι", - "παλικαριά": "παλληκαριά", - "παλικαροσύνη": "παλληκαροσύνη", - "παλληκαρίστίκος": "παλληκαρήσιος", - "παλληκαρίστικη": "παλληκαρήσια", - "παλληκαρίστικο": "παλληκαρήσιο", - "παλληκαροσύνη": "παλληκαριά", - "πανταλόνι": "παντελόνι", - "παντατίφ": "πανταντίφ", - "πανταχούσα": "απανταχούσα", - "Πάντοβα": "Πάδοβα", - "παντούφλα": "παντόφλα", - "παντοχή": "απαντοχή", - "πανψυχισµός": "παµψυχισµός", - "πάνω": "επάνω", - "παπαδάκι": "παππαδάκι", - "παπαδαρειό": "παππαδαρειό", - "παπαδιά": "παππαδιά", - "παπαδοκόρη": "παππαδοκόρη", - "παπαδοκρατούµαι": "παππαδοκρατούµαι", - "παπαδολόι": "παππαδολόι", - "παπαδοπαίδι": "παππαδοπαίδι", - "παπαδοπούλα": "παππαδοπούλα", - "Παπαδόπουλο": "παππαδόπουλο", - "παπατζής": "παππατζής", - "παπατρέχας": "παππατρέχας", - "παραγιάς": "παραγυιός", - "παρανυχίδα": "παρωνυχίδα", - "παρεισφρύω": "παρεισφρέω", - "παρεννοώ": "παρανοώ", - "παρ' ολίγο": "παραλίγο", - "πασαβιόλα": "µπασαβιόλα", - "πασάλειµµα": "πασσάλειµµα", - "πασαλείφω": "πασσαλείφω", - "πασκίζω": "πασχίζω", - "παστρουµάς": "παστουρµάς", - "πατερµά": "πατερηµά", - "πατήρ": "πατέρας", - "πατούνα": "πατούσα", - "πατριός": "πατρυιός", - "πάτρονας": "πάτρωνας", - "πάψη": "παύση", - "πεθυµώ": "επιθυµώ", - "πείρος": "πίρος", - "πελέκι": "πέλεκυς", - "πελεκίζω": "πελεκώ", - "πελλόγρα": "πελάγρα", - "πεντήκοντα": "πενήντα", - "πεντόβολα": "πεντόβωλα", - "πεντόδραχµο": "πεντάδραχµο", - "περβολάρης": "περιβολάρης", - "περβόλι": "περιβόλι", - "περδικλώνω": "πεδικλώνω", - "περηφανεύοµαι": "υπερηφανεύοµαι", - "περηφάνια": "υπερηφάνεια", - "περικόβω": "περικόπτω", - "περιπατώ": "περπατώ", - "περιστεριώνας": "περιστερώνας", - "περιτάµω": "περιτέµνω", - "περιφάνεια": "περηφάνια", - "περιφράζω": "περιφράσσω", - "περιχαράζω": "περιχαράσσω", - "περιχέω": "περιχύνω", - "περντάχι": "µπερντάχι", - "πέρπυρο": "υπέρπυρο", - "πέρσι": "πέρυσι", - "πετούγια": "µπετούγια", - "πευκιάς": "πευκώνας", - "πηγεµός": "πηγαιµός", - "πηγούνι": "πιγούνι", - "πήτα": "πίτα", - "πήχυς": "πήχης", - "πι": "πει", - "πιζάµα": "πιτζάµα", - "πιθαµή": "σπιθαµή", - "πιθώνω": "απιθώνω", - "πίκρισµα": "πικρίζω", - "πιλαλώ": "πηλαλώ", - "Πιλάτος": "Πόντιος Πιλάτος", - "πιοτό": "ποτό", - "πιπίζω": "πιππίζω", - "πιρέξ": "πυρέξ", - "πίστοµα": "απίστοµα", - "πιτσιλάδα": "πιτσυλάδα", - 
"πιτσιλιά": "πιτσυλιά", - "πίττα": "πίτα", - "πίτυρον": "πίτουρο", - "πλάγι": "πλάι", - "πλανάρω": "πλανίζω", - "πλάσσω": "πλάθω", - "πλειονοψηφία": "πλειοψηφία", - "πλείονοψηφώ": "πλειοψηφώ", - "πλεξίδα": "πλεξούδα", - "πλερωµή": "πληρωµή", - "πλερώνω": "πληρώνω", - "πλέυ µπόυ": "πλεϊµπόι", - "πλέχτης": "πλέκτης", - "πληµµύρα": "πληµύρα", - "πνιγµός": "πνίξιµο", - "πνευµονόκοκκος": "πνευµονιόκοκκος", - "ποιµήν": "ποιµένας", - "πόλις": "πόλη", - "πόλιτσµαν": "πόλισµαν", - "πολιτσµάνος": "πόλισµαν", - "πολύµπριζο": "πολύπριζο", - "πολυπάω": "πολυπηγαίνω", - "πολύπους": "πολύποδας", - "Πόρτο Ρίκο": "Πουέρτο Ρίκο", - "ποταπαγόρευση": "ποτοαπαγόρευση", - "πούντρα": "πούδρα", - "πράµα": "πράγµα", - "πρεβάζι": "περβάζι", - "πρέπον": "πρέπων", - "προαγάγω": "προάγω", - "προδίνω": "προδίδω", - "προιξ": "προίκα", - "προποτζής": "προπατζής", - "προσαγάγω": "προσάγω", - "πρόσµιξη": "πρόσµειξη", - "προσφύγω": "προσφεύγω", - "προφθάνω": "προφταίνω", - "προφυλάω": "προφυλάσσω", - "προψές": "προχθές", - "πρύµη": "πρύµνη", - "πταρνίζοµαι": "φταρνίζοµαι", - "πτελέα": "φτελιά", - "πτέρνα": "φτέρνα", - "πτερυγίζω": "φτερουγίζω", - "πτιφούρ": "πετιφούρ", - "πτι-φούρ": "πετιφούρ", - "πτωχαίνω": "φτωχαίνω", - "πτώχεια": "φτώχια", - "πυκνά": "πυκνός", - "πυλωτή": "πιλοτή", - "πύο": "πύον", - "πυρογενής": "πυριγενής", - "πυρογενές": "πυριγενές", - "πυτζάµα": "πιτζάµα", - "ραγκλόν": "ρεγκλάν", - "ραγού": "ραγκού", - "ραΐζω": "ραγίζω", - "ραίντνκεν": "ρέντγκεν", - "ράντζο": "ράντσο", - "ράπτω": "ράβω", - "ρεβανί": "ραβανί", - "ρέγγε": "ρέγκε", - "Ρεγγίνα": "Ρεγκίνα", - "ρεµούλκα": "ρυµούλκα", - "ασκέρι": "ασκέρι", - "ρεοβάση": "ρευµατοβάση", - "ρεπανάκι": "ραπανάκι", - "ρεπάνι": "ραπάνι", - "ρεύω": "ρέβω", - "ρήγα": "ρίγα", - "ρηµοκκλήσι": "ερηµοκκλήσι", - "ριγκ": "ρινγκ", - "ριζότο": "ρυζότο", - "ροβίθι": "ρεβίθι", - "ροβιθιά": "ρεβιθιά", - "ροδακινιά": "ρωδακινιά", - "ροδάκινο": "ρωδάκινο", - "ρόιδι": "ρόδι", - "ροϊδιά": "ροδιά", - "ρόιδο": "ρόδι", - "ροοστάτης": "ρεοστάτης", - "ροφώ": "ρουφώ", - "ρωδιός": "ερωδιός", - "ρωθωνίζω": "ρουθουνίζω", - "ρωµαντισµός": "ροµαντισµός", - "Ρωσσία": "Ρωσία", - "ρωτώ": "ερωτώ", - "σάζω": "σιάζω", - "σαιζλόνγκ": "σεζλόνγκ", - "σαιζόν": "σεζόν", - "σαγολαίφα": "σακολαίβα", - "σάκκα": "σάκα", - "σακκάκι": "σακάκι", - "σακκάς": "σακάς", - "σακκί": "σακί", - "σακκίδιο": "σακίδιο", - "σακκοβελόνα": "σακοβελόνα", - "σακκογκόλιθος": "σακογκόλιθος", - "σακκοειδής": "σακοειδής", - "σακκοειδές": "σακοειδες", - "σακκοράφα": "σακοράφα", - "σάκκος": "σάκος", - "σακκουλα": "σακούλα", - "σακκουλάκι": "σακούλι", - "σακκουλεύοµαι": "σακουλεύοµαι", - "σακκούλι": "σακούλι", - "σακκουλιάζω": "σακουλιάζω", - "σακχαροδιαβήτης": "ζαχαροδιαβήτης", - "σάκχαροκαλάµο": "ζαχαροκάλαµο", - "σακχαροποιία": "ζαχαροποιία", - "σακχαρότευτλον": "ζαχαρότευτλο", - "σαλιαρίστρα": "σαλιάρα", - "σαλπιστής": "σαλπιγκτής", - "σαντακρούτα": "σατακρούτα", - "σαντάλι": "σανδάλι", - "σάνταλο": "σανδάλι", - "σάρρα": "σάρα", - "σαφρίδι": "σαυρίδι", - "σαχάνι": "σαγάνι", - "σβολιάζω": "σβωλιάζω", - "σβώλιασμα": "σβόλιασµα", - "σβόλος": "σβώλος", - "σβύνω": "σβήνω", - "σγουρώνω": "σγουραίνω", - "σενκόντο": "σεκόντο", - "σεγκούνα": "σιγκούνα", - "σεγόντο": "σεκόντο", - "Σειληνός": "Σιληνός", - "σείρακας": "σείρικας", - "σειρήτι": "σιρίτι", - "σεκονταρω": "σιγοντάρω", - "σεγκοντάρω": "σιγοντάρω", - "σελιλόιντ": "σελουλόιντ", - "σέλλα": "σέλα", - "σεξπιριστής": "σαιξπηριστής", - "Σεράγεβο": "Σαράγεβο", - "σεστέτο": "σεξτέτο", - "σετέτο": "σεπτέτο", - "σέχτα": "σέκτα", - "σεχταρισµός": "σεκταρισµός", - "σηµαφόρος": "σηµατοφόρος", 
- "σήριαλ": "σίριαλ", - "σηψίνη": "σηπτίνη", - "σιγάρο": "τσιγάρο", - "σιγαροθήκη": "τσιγαροθήκη", - "σίγλος": "σίκλος", - "σιγόντο": "σεκόντο", - "Σίδνεϊ": "Σύδνεϋ", - "σίελος": "σίαλος", - "σινθεσάιζερ": "συνθεσάιζερ", - "σιντέφι": "σεντέφι", - "σιορ": "σινιόρ", - "σιρυΐάνι": "σεργιάνι", - "σιρµαγιά": "σερµαγιά", - "σίτα": "σήτα", - "σταρέµπορος": "σιτέµπορος", - "σκανδαλιά": "σκανταλιά", - "σκάνταλο": "σκάνδαλο", - "σκάπτω": "σκάβω", - "σκάρα": "σχάρα", - "σκαρµός": "σκαλµός", - "σκάφτω": "σκάβω", - "σκεβρώνω": "σκευρώνω", - "σκερπάνι": "σκεπάρνι", - "σκίζα": "σχίζα", - "σκίζω": "σχίζω", - "σκίνος": "σχίνος", - "σκίσιµο": "σχίσιµο", - "σκισµάδα": "σχισµάδα", - "σκισµή": "σχισµή", - "σκλήρωση": "σκλήρυνση", - "σκοινάκι": "σχοινάκι", - "σκονί": "σχοινί", - "σκοινί": "σχοινί", - "σκοίνος": "σχοίνος", - "σκολάω": "σχολώ", - "σκολειαρόπαιδο": "σχολειαρόπαιδο", - "σκολειαρούδι": "σχολειαρούδι", - "σκολειό": "σχολείο", - "σκόλη": "σχόλη", - "σκολιαρόπαιδο": "σχολειαρόπαιδο", - "σκολιαρούδι": "σχολειαρούδι", - "σκολιό": "σχολειό", - "σκολνώ": "σχολώ", - "σκολώ": "σχολώ", - "Σκοτία": "Σκωτία", - "σκότισµα": "σκοτισµός", - "Σκοτσέζος": "Σκωτσέζος", - "σκουντούφληµα": "σκουντούφλα", - "σκώληξ": "σκουλήκι", - "σκώτι": "συκώτι", - "σοβαντεπί": "σοβατεπί", - "σοβατίζω": "σοβαντίζω", - "σοροκολεβάντες": "σιροκολεβάντες", - "σορόκος": "σιρόκος", - "σοροπιάζω": "σιροπιάζω", - "σουβατίζω": "σοβαντίζω", - "σουβαντίζω": "σοβαντίζω", - "σουβάς": "σοβάς", - "σουβατεπί": "σοβαντεπί", - "σοβατεπί": "σοβαντεπί", - "σουµιέ": "σοµιέ", - "σούρσιµο": "σύρσιµο", - "σουσπασιόν": "σισπανσιόν", - "σοφεράρω": "σοφάρω", - "σπαής": "σπαχής", - "σπαράσσω": "σπαράζω", - "σπερµατσετο": "σπαρµατσέτο", - "σπερµίνη": "σπερµατίνη", - "σπερµοβλάστη": "σπερµατοβλάστη", - "σπερµογονία": "σπερµατογονία", - "σπερµοδότης": "σπερµατοδότης", - "σπερµοδόχος": "σπερµατοδόχος", - "σπερμοδόχο": "σπερµατοδόχο", - "σπερµοθήκη": "σπερµατοθήκη", - "σπερµοκτόνος": "σπερµατοκτόνος", - "σπερμοκτόνο": "σπερµατοκτόνο", - "σπερµοτοξίνη": "σπερµατοτοξίνη", - "σπερµοφάγος": "σπερµατοφάγος", - "σπερμοφάγο": "σπερµατοφάγο", - "σπερµοφόρος": "σπερµατοφόρος", - "σπερμοφόρο": "σπερµατοφόρο", - "σπινάρω": "σπινιάρω", - "σπιράλ": "σπειράλ", - "σπλάχνο": "σπλάγχνο", - "σπογγίζω": "σφουγγίζω", - "σπω": "σπάζω", - "Στάθης": "Ευστάθιος", - "στάλαµα": "στάλαγµα", - "σταλαµατιά": "σταλαγµατιά", - "σταλαξιά": "σταλαγµατιά", - "σταλίτσα": "σταλιά", - "σταρήθρα": "σιταρήθρα", - "στάρι": "σιτάρι", - "σταρότοπος": "σιταρότοπος", - "σταχολογώ": "σταχυολογώ", - "στειρεύω": "στερεύω", - "στειροποιώ": "στειρώνω", - "Στέλιος": "Στυλιανός", - "Στέλλα": "Στυλιανή", - "στεναχώρια": "στενοχώρια", - "στεναχωρώ": "στενοχωρώ", - "στένω": "στήνω", - "στέριωµα": "στερέωµα", - "στεριώνω": "στερεώνω", - "στέρξιµο": "στέργω", - "στιλ": "στυλ", - "στιλάκι": "στυλάκι", - "στιλιζάρω": "στυλιζάρω", - "στιλίστας": "στυλίστας", - "στιλό": "στυλό", - "στιφάδο": "στυφάδο", - "στορίζω": "ιστορώ", - "στόρισµα": "ιστόρηση", - "στραβοµάρα": "στραβωµάρα", - "στραγγουλίζω": "στραγγαλίζω", - "Στρατής": "Ευστράτιος", - "στρατί": "στράτα", - "στρατοποίηση": "στρατιωτικοποίηση", - "Στράτος": "Ευστράτιος", - "στρένω": "στέργω", - "στριµόκωλα": "στρυµόκωλα", - "στριµωξίδι": "στρυµωξίδι", - "στριµώχνω": "στρυµώχνω", - "στύβω": "στείβω", - "στυπώνω": "στουπώνω", - "σύγνεφο": "σύννεφο", - "συγνώµη": "συγγνώµη", - "συδαυλίζω": "συνδαυλίζω", - "συµπαρασέρνω": "συµπαρασύρω", - "συµπεθεριά": "συµπεθεριό", - "δεκαέξι": "δεκάξι", - "συνήθιο": "συνήθειο", - "συντάµω": "συντέµνω", - "συντριβάνι": "σιντριβάνι", - 
"συνυφάδα": "συννυφάδα", - "συφορά": "συµφορά", - "συχώρεση": "συγχώρηση", - "συχωρώ": "συγχωρώ", - "συχωροχάρτι": "συγχωροχάρτι", - "σφαλνώ": "σφαλίζω", - "σφεντάµι": "σφένδαµνος", - "σφερδούκλι": "σπερδούκλι", - "σφόνδυλος": "σπόνδυλος", - "σωβινισµός": "σοβινισµός", - "σωβινιστής": "σοβινιστής", - "σώνω": "σώζω", - "σωρείτης": "σωρίτης", - "σωτάρω": "σοτάρω", - "σωτέ": "σοτέ", - "Σωτήρης": "Σωτήριος", - "σωφέρ": "σοφέρ", - "ταβατούρι": "νταβαντούρι", - "ταβερνούλα": "ταβέρνα", - "ταβλάς": "ταµπλάς", - "ταγιαδόρος": "ταλιαδόρος", - "ταγίζω": "ταΐζω", - "τάγισµα": "τάισµα", - "ταγκό": "τανγκό", - "ταή": "ταγή", - "τάλαρο": "τάλιρο", - "τάλληρο": "τάλιρο", - "ταµίευση": "αποταµίευση", - "ταµιεύω": "αποταµιεύω", - "ταµώ": "τέµνω", - "ταξείδι": "ταξίδι", - "ταπεραµέντο": "ταµπεραµέντο", - "ταράσσω": "ταράζω", - "ταχτοποίηση": "τακτοποίηση", - "ταχτοποιώ": "τακτοποιώ", - "τελάλης": "ντελάλης", - "τελολογία": "τελεολογία", - "τεριρέµ": "τερερέµ", - "τερραίν": "τερέν", - "τέσσαρα": "τέσσερα", - "τετράς": "τετράδα", - "τζέντζερης": "τέντζερης", - "τζετζερέδια": "τεντζερέδια", - "τζιριτζάντζουλα": "τζυριτζάτζουλα", - "τζίρος": "τζύρος", - "τζιτζιµπίρα": "τσιτσιµπίρα", - "τηκ": "τικ", - "τηλοµοιοτύπηµα": "τηλεοµοιοτύπηµα", - "τηλοµοιοτυπία": "τηλεοµοιοτυπία", - "τηλοµοιοτυπώ": "τηλεοµοιοτυπώ", - "τιτιβίζω": "τιττυβίζω", - "τµήθηκα": "τέµνω", - "τµήσω": "τέµνω", - "Τόκιο": "Τόκυο", - "τοµάτα": "ντοµάτα", - "τοµατιά": "ντοµατιά", - "τοµατοπολτός": "ντοµατοπολτός", - "τοµατοσαλάτα": "ντοµατοσαλάτα", - "τονθορύζω": "υποτονθορύζω", - "τορβάς": "ντορβάς", - "τορνάρω": "τορνεύω", - "τορπίλα": "τορπίλη", - "τούνδρα": "τούντρα", - "Τουρκάλα": "Τούρκος", - "τράβαλα": "ντράβαλα", - "τραΐ": "τραγί", - "τραινάρισµα": "τρενάρισµα", - "τραινάρω": "τρενάρω", - "τραίνο": "τρένο", - "τρακόσοι": "τριακόσιοι", - "τραπεζάκι": "τραπέζι", - "τρέµουλο": "τρεµούλα", - "τρέψω": "τρέπω", - "τριάµισι": "τρεισήµισι", - "τρικλίζω": "τρεκλίζω", - "τρίκλισµα": "τρέκλισµα", - "τρίπλα": "ντρίπλα", - "τριπλαδόρος": "ντριπλαδόρος", - "τριπλάρω": "ντριπλάρω", - "τρίπους": "τρίποδας", - "τρόπις": "τρόπιδα", - "τρυκ": "τρικ", - "τσαγγαράδικο": "τσαγκαράδικο", - "τσογγάρης": "τσαγκάρης", - "τσαγγάρικο": "τσαγκάρικο", - "τσαγγαροδευτέρα": "τσαγκαροδευτέρα", - "τσάµπα": "τζάµπα", - "τσαµπατζής": "τζαµπατζής", - "τσαντίζω": "τσατίζω", - "τσαντίλα": "τσατίλα", - "τσαντίλας": "τσατίλας", - "τσάντισµα": "τσάτισµα", - "τσίβα": "τζίβα", - "τσίκλα": "τσίχλα", - "τσιµεντώνω": "τσιµεντάρω", - "τσιπούρα": "τσιππούρα", - "τσιρίζω": "τσυρίζω", - "τσιριτσάντζουλα": "τζιριτζάντζουλα", - "τσιρότο": "τσηρώτο", - "τσίτα": "τσήτα", - "τσιτσιρίζω": "τσυτσυρίζω", - "τσιτσίρισµα": "τσυτσυρίζω", - "τσίτωµα": "τσήτωµα", - "τσοµπάνος": "τσοµπάνης", - "τσοπάνης": "τσοµπάνης", - "τσοπανόπουλο": "τσοµπανόπουλο", - "τσοπάνος": "τσοµπάνης", - "τσύνορο": "τσίνορο", - "τυράγνισµα": "τυράννισµα", - "τυραγνω": "τυραννώ", - "τυφεκίζω": "τουφεκίζω", - "τυφεκισµός": "τουφεκισµός", - "υαλόχαρτον": "γυαλόχαρτο", - "υαλόχαρτο": "γυαλόχαρτο", - "υάρδα": "γιάρδα", - "ύβρη": "ύβρις", - "υδατοσκοπια": "υδροσκοπία", - "υδραέριο": "υδαταέριο", - "ύελος": "ύαλος", - "Υόρκη Νέα": "Νέα Υόρκη", - "υποδείχνω": "υποδεικνύω", - "υπόδεσις": "υπόδηση", - "υποκάµισο": "πουκάµισο", - "φαγκρί": "φαγγρί", - "φαγοκύτωση": "φαγοκυττάρωση", - "ψόγουσα": "φαγέδαινα", - "φαγωµός": "φαγωµάρα", - "φάδι": "υφάδι", - "φαινοµεναλισµός": "φαινοµενοκρατία", - "φαινοµενισµός": "φαινοµενοκρατία", - "φαίνω": "υφαίνω", - "φαλακρώνω": "φαλακραίνω", - "φαµίλια": "φαµελιά", - "φαµφάρα": "φανφάρα", - 
"φαµφαρονισµος": "φανφαρονισµός", - "φαµφαρόνος": "φανφαρόνος", - "φαράκλα": "φαλάκρα", - "φαρµασόνος": "φραµασόνος", - "φαρµπαλάς": "φραµπαλάς", - "φασουλάδα": "φασολάδα", - "φασουλάκια": "φασολάκια", - "φασουλιά": "φασολιά", - "φασούλι": "φασόλι", - "φελόνι": "φαιλόνιο", - "φελώ": "ωφελώ", - "φεουδαλισµός": "φεουδαρχισµός", - "φερµάνι": "φιρµάνι", - "φέτος": "εφέτος", - "φθήνια": "φτήνια", - "Φιλανδία": "Φινλανδία", - "φιλενάδα": "φιλαινάδα", - "φιλιστρίνι": "φινιστρίνι", - "φιλόφρονας": "φιλόφρων", - "φιντάνι": "φυντάνι", - "φιορντ": "φιόρδ", - "φίσκα": "φύσκα", - "φκειάνω": "φτειάχνω", - "φκιάνω": "φτειάχνω", - "φκειασιδι": "φτειασίδι", - "φκειασίδωµα": "φτειασίδωµα", - "φκειασιδώνω": "φτειασιδώνω", - "φκιασιδι": "φτειασίδι", - "φκιασίδωµα": "φτειασίδωµα", - "φκιασιδώνω": "φτειασιδώνω", - "φκυάρι": "φτυάρι", - "Φλάνδρα": "Φλαµανδία", - "φλισκούνι": "φλησκούνι", - "φλοίδα": "φλούδα", - "φλοµιάζω": "φλοµώνω", - "φλορίνι": "φιορίνι", - "φλυτζάνι": "φλιτζάνι", - "φοβούµαι": "φοβάµαι", - "φονεύς": "φονιάς", - "φόντα": "φόντο", - "φουσέκι": "φισέκι", - "φούχτα": "χούφτα", - "φουχτώνω": "χουφτώνω", - "Φραγκφούρτη": "Φρανκφούρτη", - "φράσσω": "φράζω", - "Φρίντα": "Φρειδερίκη", - "Φροσύνη": "Ευφροσύνη", - "Φρόσω": "Ευφροσύνη", - "φροϋδισµος": "φροϊδισµός", - "φρουµάζω": "φριµάζω", - "φρούµασµα": "φρίµασµα", - "φτάνω": "φθάνω", - "φταρνίζοµαι": "φτερνίζοµαι", - "φτειάνω": "φτειάχνω", - "φτηνά": "φθηνά", - "φτηναίνω": "φθηναίνω", - "φτιασίδι": "φτειασίδι", - "φτιασιδώνοµαι": "φτειασιδώνοµαι", - "φτωχοκοµείο": "πτωχοκοµείο", - "φυγάδας": "φυγάς", - "φύγω": "φεύγω", - "φυλάγω": "φυλάσσω", - "φυλλαράκι": "φύλλο", - "φυλλόδεντρο": "φιλόδεντρο", - "φυλώ": "φυλάσσω", - "φυσέκι": "φισέκι", - "φυσεκλίκι": "φισεκλίκι", - "φυσιοθεραπεία": "φυσικοθεραπεία", - "φυστίκι": "φιστίκι", - "φυστικιά": "φιστικιά", - "φύω": "φύοµαι", - "φχαριστώ": "ευχαριστώ", - "φωβισµός": "φοβισµός", - "φωβιστής": "φοβισµός", - "Φώτης": "Φώτιος", - "φωτογραφώ": "φωτογραφίζω", - "φωτοβολή": ", φωτοβολία", - "χάβω": "χάφτω", - "χαΐδεµα": "χαϊδεύω", - "χάιδι": "χάδι", - "χαλνώ": "χαλώ", - "χαλυβώνω": "χαλυβδώνω", - "χάµου": "χάµω", - "χαµψίνι": "χαµσίνι", - "χάνδρα": "χάντρα", - "χαντζής": "χανιτζής", - "χαραµατιά": "χαραγµατιά", - "χάραξ": "χάρακας", - "χάροντας": "χάρος", - "χατζάρα": "χαντζάρα", - "χατζάρι": "χαντζάρι", - "χεγκελιανισµός": "εγελιανισµός", - "χειρόβολο": "χερόβολο", - "χειροµάχηµα": "χεροµαχώ", - "χειροµάχισσα": "χεροµάχος", - "χειροµάχος": "χεροµάχος", - "χειροµαχώ": "χεροµαχώ", - "χέρα": "χέρι", - "χερόµυλος": "χειρόµυλος", - "χεροπόδαρα": "χειροπόδαρα", - "χηνάρι": "χήνα", - "χι": "χει", - "χιµώ": "χυµώ", - "χιών": "χιόνι", - "χλεµπάνια": "πλεµπάγια", - "χλοΐζω": "χλοάζω", - "χλόισµα": "χλόασµα", - "χνώτο": "χνότο", - "χορδίζω": "κουρδίζω", - "χόρδισµα": "κούρδισμα", - "χοχλάζω": "κοχλάζω", - "χοχλακιάζω": "κοχλάζω", - "χοχλακίζω": "κοχλάζω", - "χοχλακώ": "κοχλάζω", - "χρεογραφο": "χρεώγραφο", - "χρεοκοπία": "χρεωκοπία", - "χρεοκοπώ": "χρεωκοπώ", - "χρεολυσία": "χρεωλυσία", - "χρεολύσιο": "χρεωλύσιο", - "χρεόλυτρο": "χρεώλυτρο", - "χρεοπιστώνω": "πιστοχρεώνω", - "χρεοπίστωση": "πιστοχρεώνω", - "χρεοστάσιο": "χρεωστάσιο", - "χρεοφειλέτης": "χρεωφειλέτης", - "Χρήστος": "Χρίστος", - "χρωµατόσωµα": "χρωµόσωµα", - "χρωµογόνος": "χρωµατογόνος", - "χρωµογόνο": "χρωµατογόνο", - "χρωµοφόρος": "χρωµατοφόρος", - "χρωµοφόρο": "χρωµατοφόρο", - "χτες": "χθες", - "χτήµα": "κτήµα", - "χτίζω": "κτίζω", - "χτίσιµο": "κτίσιµο", - "χτίσµα": "κτίσµα", - "χτίστης": "κτίστης", - "χτύπηµα": "κτύπηµα", - "χτύπος": "κτύπος", - "χτυπώ": 
"κτυπώ", - "χυµίζω": "χυµώ", - "χωλ": "χολ", - "χώνεψη": "χώνευση", - "χωριατοσύνη": "χωριατιά", - "ψένω": "ψήνω", - "ψηλαφώ": "ψηλαφίζω", - "ψηφιδοθέτης": "ψηφοθέτης", - "ψιττακίαση": "ψιττάκωση", - "ψίχαλο": "ψίχουλο", - "ψυχεδελισµός": "ψυχεδέλεια", - "ψυχογιός": "ψυχογυιός", - "ψώριασµα": "ψωριάζω", - "ωγκρατέν": "ογκρατέν", - "ωράριο": "οράριο", - "ώς": "έως", - "ωτασπίδα": "ωτοασπίδα", - "ωτοστόπ": "οτοστόπ", - "ωφελιµοκρατία": "ωφελιµισµός", - "ωχαδερφισµός": "οχαδερφισµός", - "ώχου": "όχου", - "άγυρτος": "άγειρτος", - "άγυρτη": "άγειρτη", - "άγυρτο": "άγειρτο", - "ανηµέρευτος": "ανηµέρωτος", - "ανηµέρευτη": "ανηµέρωτη", - "ανηµέρευτο": "ανηµέρωτο", - "ανοικτός": "ανοιχτός", - "ανοικτή": "ανοιχτή", - "ανοικτό": "ανοιχτό", - "αντιελληνικός": "ανθελληνικός", - "αντιελληνική": "ανθελληνική", - "αντιελληνικό": "ανθελληνικό", - "αντιεπιστηµονικος": "αντεπιστηµονικός", - "αντιεπιστηµονικη": "αντεπιστηµονική", - "αντιεπιστηµονικο": "αντεπιστηµονικό", - "αξόφλητος": "ανεξόφλητος", - "αξόφλητη": "ανεξόφλητη", - "αξόφλητο": "ανεξόφλητο", - "άπαιχτος": "άπαικτος", - "άπαιχτη": "άπαικτη", - "άπαιχτο": "άπαικτο", - "απηρχαιωµένος": "απαρχαιωµένος", - "απηρχαιωµένη": "απαρχαιωµένη", - "απηρχαιωµένο": "απαρχαιωµένο", - "άπιωτος": "άπιοτος", - "άπιωτη": "άπιοτη", - "άπιωτο": "άπιοτο", - "άπραχτος": "άπρακτος", - "άπραχτη": "άπρακτη", - "άπραχτο": "άπρακτο", - "άραχλος": "άραχνος", - "άραχλη": "άραχνη", - "άραχλο": "άραχνο", - "αρήγωτος": "αρίγωτος", - "αρήγωτη": "αρίγωτη", - "αρήγωτο": "αρίγωτο", - "αρµενικός": "αρµενιακός", - "αρµενική": "αρµενιακή", - "αρµενικό": "αρµενιακό", - "αρµυρός": "αλµυρός", - "αρµυρή": "αλµυρή", - "αρµυρό": "αλµυρό", - "άσβεστος": "άσβηστος", - "άσβεστη": "άσβηστη", - "άσβεστο": "άσβηστο", - "άσκηµος": "άσχηµος", - "άσκηµη": "άσχηµη", - "άσκηµο": "άσχηµο", - "άστυφτος": "άστειφτος", - "άστυφτη": "άστειφτη", - "άστυφτο": "άστειφτο", - "ασυχώρετος": "ασυγχώρητος", - "ασυχώρετη": "ασυγχώρητη", - "ασυχώρετο": "ασυγχώρητο", - "άταχτος": "άτακτος", - "άταχτη": "άτακτη", - "άταχτο": "άτακτο", - "άφκιαστος": "άφτειαχτος", - "άφκιαστη": "άφτειαχτη", - "άφκιαστο": "άφτειαχτο", - "άφκειαστος": "άφτειαχτος", - "άφκειαστη": "άφτειαχτη", - "άφκειαστο": "άφτειαχτο", - "άφταστος": "άφθαστος", - "άφταστη": "άφθαστη", - "άφταστο": "άφθαστο", - "άφτερος": "άπτερος", - "άφτερη": "άπτερη", - "άφτερο": "άπτερο", - "αχτιδωτος": "ακτινωτός", - "αχτιδωτη": "ακτινωτή", - "αχτιδωτο": "ακτινωτό", - "άχτιστος": "άκτιστος", - "άχτιστη": "άκτιστη", - "άχτιστο": "άκτιστο", - "βιωτικός": "βιοτικός", - "βιωτική": "βιοτική", - "βιωτικό": "βιοτικό", - "βλάστηµος": "βλάσφηµος", - "βλάστηµη": "βλάσφηµη", - "βλάστηµο": "βλάσφηµο", - "βλογηµένος": "ευλογηµένος", - "βλογηµένη": "ευλογηµένη", - "βλογηµένο": "ευλογηµένο", - "βοϊδινός": "βοδινός", - "βοϊδινή": "βοδινή", - "βοϊδινό": "βοδινό", - "βορινός": "βορεινός", - "βορινή": "βορεινή", - "βορινό": "βορεινό", - "βρωµερός": "βροµερός", - "βρωµερή": "βροµερή", - "βρωµερό": "βροµερό", - "βρώµικος": "βρόµικος", - "βρώµικη": "βρόµικη", - "βρώµικο": "βρόµικο", - "γαλατερός": "γαλακτερός", - "γαλατερή": "γαλακτερή", - "γαλατερό": "γαλακτερό", - "γδυµνός": "γυµνός", - "γδυµνή": "γυµνή", - "γδυµνό": "γυµνό", - "γελαδινός": "αγελαδινός", - "γελαδινή": "αγελαδινή", - "γελαδινό": "αγελαδινό", - "γερτός": "γειρτός", - "γερτή": "γειρτή", - "γερτό": "γειρτό", - "γιοµάτος": "γεµάτος", - "γιοµάτη": "γεµάτη", - "γιοµάτο": "γεµάτο", - "γκεµπελικός": "γκαιµπελικός", - "γκεµπελική": "γκαιµπελική", - "γκεµπελικό": "γκαιµπελικό", - "γλήγορος": "γρήγορος", - "γλήγορη": "γρήγορη", - 
"γλήγορο": "γρήγορο", - "γρανίτινος": "γρανιτένιος", - "γρανίτινη": "γρανιτένιη", - "γρανίτινο": "γρανιτένιο", - "γραφτός": "γραπτός", - "γραφτή": "γραπτή", - "γραφτό": "γραπτό", - "γυρτός": "γειρτός", - "γυρτή": "γειρτή", - "γυρτό": "γειρτό", - "δαιµονόπληκτος": "δαιµονιόπληκτος", - "δαιµονόπληκτη": "δαιµονιόπληκτη", - "δαιµονόπληκτο": "δαιµονιόπληκτο", - "δερµικός": "δερµατικός", - "δερµική": "δερµατική", - "δερµικό": "δερµατικό", - "δεχτός": "δεκτός", - "δεχτή": "δεκτή", - "δεχτό": "δεκτό", - "διαλεκτός": "διαλεχτός", - "διαλεκτή": "διαλεχτή", - "διαλεκτό": "διαλεχτό", - "διαολεµένος": "διαβολεµένος", - "διαολεµένη": "διαβολεµένη", - "διαολεµένο": "διαβολεµένο", - "δυσέλεγκτος": "δυσεξέλεγκτος", - "δυσέλεγκτη": "δυσεξέλεγκτη", - "δυσέλεγκτο": "δυσεξέλεγκτο", - "δυσλεκτικός": "δυσλεξικός", - "δυσλεκτική": "δυσλεξική", - "δυσλεκτικό": "δυσλεξικό", - "εκδοµένος": "εκδεδοµένος", - "εκδοµένη": "εκδεδοµένη", - "εκδοµένο": "εκδεδοµένο", - "ελεύτερος": "ελεύθερος", - "ελεύτερη": "ελεύθερη", - "ελεύτερο": "ελεύθερο", - "εξώφθαλµος": "εξόφθαλµος", - "εξώφθαλµη": "εξόφθαλµη", - "εξώφθαλµο": "εξόφθαλµο", - "επανωτός": "απανωτός", - "επανωτή": "απανωτή", - "επανωτό": "απανωτό", - "επεξηγητικος": "επεξηγηµατικός", - "επεξηγητικη": "επεξηγηµατική", - "επεξηγητικο": "επεξηγηµατικό", - "έρµος": "έρηµος", - "έρµη": "έρηµη", - "έρµο": "έρηµο", - "ετερόκλητος": "ετερόκλιτος", - "ετερόκλητη": "ετερόκλιτη", - "ετερόκλητο": "ετερόκλιτο", - "ετούτος": "τούτος", - "ετούτη": "τούτη", - "ετούτο": "τούτο", - "εφετεινός": "εφετινός", - "εφετεινή": "εφετινή", - "εφετεινό": "εφετινό", - "εφταήµερος": "επταήµερος", - "εφταήµερη": "επταήµερη", - "εφταήµερο": "επταήµερο", - "ζάµπλουτος": "ζάπλουτος", - "ζάµπλουτη": "ζάπλουτη", - "ζάµπλουτο": "ζάπλουτο", - "ζαχαράτος": "ζαχαρωτός", - "ζαχαράτη": "ζαχαρωτή", - "ζαχαράτο": "ζαχαρωτό", - "θαµβός": "θαµπός", - "θαµβή": "θαµπή", - "θαµβό": "θαµπό", - "θραψερός": "θρεψερός", - "θραψερή": "θρεψερή", - "θραψερό": "θρεψερό", - "ιονικός": "ιοντικός", - "ιονική": "ιοντική", - "ιονικό": "ιοντικό", - "καββαλιστικός": "καβαλιστικός", - "καββαλιστική": "καβαλιστική", - "καββαλιστικό": "καβαλιστικό", - "καλλίτερος": "καλύτερος", - "καλλίτερη": "καλύτερη", - "καλλίτερο": "καλύτερο", - "καταχτητικός": "κατακτητικός", - "καταχτητική": "κατακτητική", - "καταχτητικό": "κατακτητικό", - "καταψυγµένος": "κατεψυγµένος", - "καταψυγµένη": "κατεψυγµένη", - "καταψυγµένο": "κατεψυγµένο", - "καυδιανός": "καβδιανός", - "καυδιανή": "καβδιανή", - "καυδιανό": "καβδιανό", - "καϋµένος": "καηµένος", - "καϋµένη": "καηµένη", - "καϋµένο": "καηµένο", - "κέδρινος": "κέδρος", - "κέδρινη": "κέδρη", - "κέδρινο": "κέδρο", - "κεραµεικος": "κεραµικός", - "κεραµεικη": "κεραµική", - "κεραµεικο": "κεραµικό", - "κλασσικός": "κλασικός", - "κλασσική": "κλασική", - "κλασσικό": "κλασικό", - "κόλαριστός": "κολλαριστός", - "κόλαριστή": "κολλαριστή", - "κόλαριστό": "κολλαριστό", - "κοµµουνιστικός": "κοµουνιστικός", - "κοµµουνιστική": "κοµουνιστική", - "κοµµουνιστικό": "κοµουνιστικό", - "κοράλλινος": "κοραλλένιος", - "κοράλλινη": "κοραλλένιη", - "κοράλλινο": "κοραλλένιο", - "κτυπητός": "χτυπητός", - "κτυπητή": "χτυπητή", - "κτυπητό": "χτυπητό", - "κωφός": "κουφός", - "κωφή": "κουφή", - "κωφό": "κουφό", - "λειπανάβατος": "λειψανάβατος", - "λειπανάβατη": "λειψανάβατη", - "λειπανάβατο": "λειψανάβατο", - "λιανικός": "λειανικός", - "λιανική": "λειανική", - "λιανικό": "λειανικό", - "λιανός": "λειανός", - "λιανή": "λειανή", - "λιανό": "λειανό", - "λιγοήµερος": "ολιγοήµερος", - "λιγοήµερη": "ολιγοήµερη", - "λιγοήµερο": "ολιγοήµερο", - 
"λιγόκαρδος": "ολιγόκαρδος", - "λιγόκαρδη": "ολιγόκαρδη", - "λιγόκαρδο": "ολιγόκαρδο", - "λιγόλογος": "ολιγόλογος", - "λιγόλογη": "ολιγόλογη", - "λιγόλογο": "ολιγόλογο", - "λιγόπιστος": "ολιγόπιστος", - "λιγόπιστη": "ολιγόπιστη", - "λιγόπιστο": "ολιγόπιστο", - "λιγόψυχος": "ολιγοψυχία", - "λιγόψυχοςή": "ολιγοψυχίαη", - "λιγόψυχοςό": "ολιγοψυχίαο", - "λιόλουστος": "ηλιόλουστος", - "λιόλουστη": "ηλιόλουστη", - "λιόλουστο": "ηλιόλουστο", - "λιόµορφος": "ηλιόµορφος", - "λιόµορφη": "ηλιόµορφη", - "λιόµορφο": "ηλιόµορφο", - "λιόχαρος": "ηλιόχαρος", - "λιόχαρη": "ηλιόχαρη", - "λιόχαρο": "ηλιόχαρο", - "λιπανάβατος": "λειψανάβατος", - "λιπανάβατη": "λειψανάβατη", - "λιπανάβατο": "λειψανάβατο", - "λυµφατικός": "λεµφατικός", - "λυµφατική": "λεµφατική", - "λυµφατικό": "λεµφατικό", - "µαυριδερός": "µαυρειδερός", - "µαυριδερή": "µαυρειδερή", - "µαυριδερό": "µαυρειδερό", - "µεικτός": "µικτός", - "µεικτή": "µικτή", - "µεικτό": "µικτό", - "µελαψός": "µελαµψός", - "µελαψή": "µελαµψή", - "µελαψό": "µελαµψό", - "µετάξινος": "µεταξένιος", - "µετάξινη": "µεταξένιη", - "µετάξινο": "µεταξένιο", - "µιξοβάρβαρος": "µειξοβάρβαρος", - "µιξοβάρβαρη": "µειξοβάρβαρη", - "µιξοβάρβαρο": "µειξοβάρβαρο", - "µοσκαναθρεµµένος": "µοσχαναθρεµµένος", - "µοσκαναθρεµµένη": "µοσχαναθρεµµένη", - "µοσκαναθρεµµένο": "µοσχαναθρεµµένο", - "µουλωχτός": "µουλλωχτός", - "µουλωχτή": "µουλλωχτή", - "µουλωχτό": "µουλλωχτό", - "µπαµπακερός": "βαµβακερός", - "µπαµπακερή": "βαµβακερή", - "µπαµπακερό": "βαµβακερό", - "νεόχτιστος": "νεόκτιστος", - "νεόχτιστη": "νεόκτιστη", - "νεόχτιστο": "νεόκτιστο", - "νηστίσιµος": "νηστήσιµος", - "νηστίσιµη": "νηστήσιµη", - "νηστίσιµο": "νηστήσιµο", - "νιογέννητος": "νεογέννητος", - "νιογέννητη": "νεογέννητη", - "νιογέννητο": "νεογέννητο", - "νυκτερινός": "νυχτερινός", - "νυκτερινή": "νυχτερινή", - "νυκτερινό": "νυχτερινό", - "ξιπόλητος": "ξυπόλυτος", - "ξιπόλητη": "ξυπόλυτη", - "ξιπόλητο": "ξυπόλυτο", - "ξυνός": "ξινός", - "ξυνή": "ξινή", - "ξυνό": "ξινό", - "ξωτικός": "εξωτικός", - "ξωτική": "εξωτική", - "ξωτικό": "εξωτικό", - "οικονοµίστικος": "οικονοµικίστικος", - "οικονοµίστικη": "οικονοµικίστικη", - "οικονοµίστικο": "οικονοµικίστικο", - "οκταγωνικός": "οχταγωνικός", - "οκταγωνική": "οχταγωνική", - "οκταγωνικό": "οχταγωνικό", - "οκτάγωνος": "οχτάγωνος", - "οκτάγωνη": "οχτάγωνη", - "οκτάγωνο": "οχτάγωνο", - "οκτάεδρος": "οχτάεδρος", - "οκτάεδρη": "οχτάεδρη", - "οκτάεδρο": "οχτάεδρο", - "οκτάκιλος": "οχτάκιλος", - "οκτάκιλη": "οχτάκιλη", - "οκτάκιλο": "οχτάκιλο", - "οξειδώσιµος": "οξιδώσιµος", - "οξειδώσιµη": "οξιδώσιµη", - "οξειδώσιµο": "οξιδώσιµο", - "ορεχτικός": "ορεκτικός", - "ορεχτική": "ορεκτική", - "ορεχτικό": "ορεκτικό", - "οχταγωνικός": "οκταγωνικός", - "οχταγωνική": "οκταγωνική", - "οχταγωνικό": "οκταγωνικό", - "οχτάγωνος": "οκτάγωνος", - "οχτάγωνη": "οκτάγωνη", - "οχτάγωνο": "οκτάγωνο", - "οχτάεδρος": "οκτάεδρος", - "οχτάεδρη": "οκτάεδρη", - "οχτάεδρο": "οκτάεδρο", - "οχτακοσιοστός": "οκτακοσιοστός", - "οχτακοσιοστή": "οκτακοσιοστή", - "οχτακοσιοστό": "οκτακοσιοστό", - "οχτάπλευρος": "οκτάπλευρος", - "οχτάπλευρη": "οκτάπλευρη", - "οχτάπλευρο": "οκτάπλευρο", - "οχτάστηλος": "οκτάστηλος", - "οχτάστηλη": "οκτάστηλη", - "οχτάστηλο": "οκτάστηλο", - "οχτάστιχος": "οκτάστιχος", - "οχτάστιχη": "οκτάστιχη", - "οχτάστιχο": "οκτάστιχο", - "οχτάωρος": "οκτάωρος", - "οχτάωρη": "οκτάωρη", - "οχτάωρο": "οκτάωρο", - "οχτωβριανός": "οκτωβριανός", - "οχτωβριανή": "οκτωβριανή", - "οχτωβριανό": "οκτωβριανό", - "παιδιακίστικος": "παιδιάστικος", - "παιδιακίστικη": "παιδιάστικη", - "παιδιακίστικο": "παιδιάστικο", - 
"πανέρµος": "πανέρηµος", - "πανέρµη": "πανέρηµη", - "πανέρµο": "πανέρηµο", - "παπαδικός": "παππαδικός", - "παπαδική": "παππαδική", - "παπαδικό": "παππαδικό", - "παπαδίστικος": "παππαδίστικος", - "παπαδίστικη": "παππαδίστικη", - "παπαδίστικο": "παππαδίστικο", - "παραεκκλησιαστικός": "παρεκκλησιαστικός", - "παραεκκλησιαστική": "παρεκκλησιαστική", - "παραεκκλησιαστικό": "παρεκκλησιαστικό", - "πειρακτικός": "πειραχτικός", - "πειρακτική": "πειραχτική", - "πειρακτικό": "πειραχτικό", - "περήφανος": "υπερήφανος", - "περήφανη": "υπερήφανη", - "περήφανο": "υπερήφανο", - "περσότερος": "περισσότερος", - "περσότερη": "περισσότερη", - "περσότερο": "περισσότερο", - "πεταγµένος": "πεταµένος", - "πεταγµένη": "πεταµένη", - "πεταγµένο": "πεταµένο", - "πηκτός": "πηχτός", - "πηκτή": "πηχτή", - "πηκτό": "πηχτό", - "πιτσιλιστός": "πιτσυλιστός", - "πιτσιλιστή": "πιτσυλιστή", - "πιτσιλιστό": "πιτσυλιστό", - "πλεχτικός": "πλεκτικός", - "πλεχτική": "πλεκτική", - "πλεχτικό": "πλεκτικό", - "πλεχτός": "πλεκτός", - "πλεχτή": "πλεκτή", - "πλεχτό": "πλεκτό", - "προσεχτικός": "προσεκτικός", - "προσεχτική": "προσεκτική", - "προσεχτικό": "προσεκτικό", - "προψεσινός": "προχθεσινός", - "προψεσινή": "προχθεσινή", - "προψεσινό": "προχθεσινό", - "πτερωτός": "φτερωτός", - "πτερωτή": "φτερωτή", - "πτερωτό": "φτερωτό", - "πτωχικός": "φτωχικός", - "πτωχική": "φτωχική", - "πτωχικό": "φτωχικό", - "ραφτικός": "ραπτικός", - "ραφτική": "ραπτική", - "ραφτικό": "ραπτικό", - "ραφτός": "ραπτός", - "ραφτή": "ραπτή", - "ραφτό": "ραπτό", - "ρούσικος": "ρωσικός", - "ρούσικη": "ρωσική", - "ρούσικο": "ρωσικό", - "ρωµαντικός": "ροµαντικός", - "ρωµαντική": "ροµαντική", - "ρωµαντικό": "ροµαντικό", - "σειληνικός": "σιληνικός", - "σειληνική": "σιληνική", - "σειληνικό": "σιληνικό", - "σειριακός": "σειραϊκός", - "σειριακή": "σειραϊκή", - "σειριακό": "σειραϊκό", - "σεξπιρικός": "σαιξπηρικός", - "σεξπιρική": "σαιξπηρική", - "σεξπιρικό": "σαιξπηρικό", - "σιδηρόφρακτος": "σιδερόφραχτος", - "σιδηρόφρακτη": "σιδερόφραχτη", - "σιδηρόφρακτο": "σιδερόφραχτο", - "σκεβρός": "σκευρός", - "σκεβρή": "σκευρή", - "σκεβρό": "σκευρό", - "σκεφτικός": "σκεπτικός", - "σκεφτική": "σκεπτική", - "σκεφτικό": "σκεπτικό", - "σκιστός": "σχιστός", - "σκιστή": "σχιστή", - "σκιστό": "σχιστό", - "σκολιανός": "σχολιανός", - "σκολιανή": "σχολιανή", - "σκολιανό": "σχολιανό", - "σκοτσέζικος": "σκοτσέζικος", - "σκοτσέζικη": "σκοτσέζικη", - "σκοτσέζικο": "σκοτσέζικο", - "σµυρνιώτικος": "σµυρναίικος", - "σµυρνιώτικη": "σµυρναίικη", - "σµυρνιώτικο": "σµυρναίικο", - "σοροπιαστός": "σιροπιαστός", - "σοροπιαστή": "σιροπιαστή", - "σοροπιαστό": "σιροπιαστό", - "σπερνός": "εσπερινός", - "σπερνή": "εσπερινή", - "σπερνό": "εσπερινό", - "σταρόχρωµος": "σιταρόχρωµος", - "σταρόχρωµη": "σιταρόχρωµη", - "σταρόχρωµο": "σιταρόχρωµο", - "στενάχωρος": "στενόχωρος", - "στενάχωρη": "στενόχωρη", - "στενάχωρο": "στενόχωρο", - "στιλιστικός": "στυλιστικός", - "στιλιστική": "στυλιστική", - "στιλιστικό": "στυλιστικό", - "στριµόκωλος": "στρυµόκωλος", - "στριµόκωλη": "στρυµόκωλη", - "στριµόκωλο": "στρυµόκωλο", - "στριµωχτός": "στρυµωχτός", - "στριµωχτή": "στρυµωχτή", - "στριµωχτό": "στρυµωχτό", - "στριφνός": "στρυφνός", - "στριφνή": "στρυφνή", - "στριφνό": "στρυφνό", - "σύµµεικτος": "σύµµικτος", - "σύµµεικτη": "σύµµικτη", - "σύµµεικτο": "σύµµικτο", - "σύµψυχος": "σύψυχος", - "σύµψυχη": "σύψυχη", - "σύµψυχο": "σύψυχο", - "συντεθειµένος": "συνθέτω", - "συντεθειµένοςή": "συνθέτωη", - "συντεθειµένοςό": "συνθέτωο", - "συφοριασµένος": "συμφοριασμένος", - "συφοριασµένη": "συμφοριασμένη", - "συφοριασµένο": "συμφοριασμένο", - 
"συχωριανός": "συγχωριανός", - "συχωριανή": "συγχωριανή", - "συχωριανό": "συγχωριανό", - "ταγκός": "ταγγός", - "ταγκή": "ταγγή", - "ταµιευτικός": "αποταµιευτικός", - "ταµιευτική": "αποταµιευτική", - "ταµιευτικό": "αποταµιευτικό", - "ταχτικός": "τακτικός", - "ταχτική": "τακτική", - "ταχτικό": "τακτικό", - "τελολογικός": "τελεολογικός", - "τελολογική": "τελεολογική", - "τελολογικό": "τελεολογικό", - "τραγικοκωµικός": "κωµικοτραγικός", - "τραγικοκωµική": "κωµικοτραγική", - "τραγικοκωµικό": "κωµικοτραγικό", - "τρελλός": "τρελός", - "τρελλή": "τρελή", - "τρελλό": "τρελό", - "τσεβδός": "τσευδός", - "τσεβδή": "τσευδή", - "τσεβδό": "τσευδό", - "τσιριχτός": "τσυριχτός", - "τσιριχτή": "τσυριχτή", - "τσιριχτό": "τσυριχτό", - "τσιτωτός": "τσητωτός", - "τσιτωτή": "τσητωτή", - "τσιτωτό": "τσητωτό", - "υποµονητικός": "υποµονετικός", - "υποµονητική": "υποµονετική", - "υποµονητικό": "υποµονετικό", - "φαµφαρονικός": "φανφαρονίστικος", - "φαµφαρονική": "φανφαρονίστικη", - "φαµφαρονικό": "φανφαρονίστικο", - "φαµφαρονίστικος": "φανφαρονίστικος", - "φαµφαρονίστικη": "φανφαρονίστικη", - "φαµφαρονίστικο": "φανφαρονίστικο", - "φαντός": "υφαντός", - "φαντή": "υφαντή", - "φαντό": "υφαντό", - "φανφαρονικός": "φανφαρονιστικός", - "φανφαρονική": "φανφαρονιστική", - "φανφαρονικό": "φανφαρονιστικό", - "φαρακλός": "φαλακρός", - "φαρακλή": "φαλακρή", - "φαρακλό": "φαλακρό", - "φεγγαροφώτιστος": "φεγγαρόφωτος", - "φεγγαροφώτιστη": "φεγγαρόφωτη", - "φεγγαροφώτιστο": "φεγγαρόφωτο", - "φεουδαλικός": "φεουδαρχικός", - "φεουδαλική": "φεουδαρχική", - "φεουδαλικό": "φεουδαρχικό", - "φλοκάτος": "φλοκωτός", - "φλοκάτη": "φλοκωτή", - "φλοκάτο": "φλοκωτό", - "φριχτός": "φρικτός", - "φριχτή": "φρικτή", - "φριχτό": "φρικτό", - "φροϋδικός": "φροϊδικός", - "φροϋδική": "φροϊδική", - "φροϋδικό": "φροϊδικό", - "φτειαστός": "φτειαχτός", - "φτειαστή": "φτειαχτή", - "φτειαστό": "φτειαχτό", - "φτηνός": "φθηνός", - "φτηνή": "φθηνή", - "φτηνό": "φθηνό", - "φυσιοθεραπευτικός": "φυσιοθεραπευτικός", - "φυσιοθεραπευτική": "φυσιοθεραπευτική", - "φυσιοθεραπευτικό": "φυσιοθεραπευτικό", - "φωβιστικός": "φοβιστικός", - "φωβιστική": "φοβιστική", - "φωβιστικό": "φοβιστικό", - "χαδεµένος": "χαϊδεµένος", - "χαδεµένη": "χαϊδεµένη", - "χαδεµένο": "χαϊδεµένο", - "χειλόφωνος": "χειλεόφωνος", - "χειλόφωνη": "χειλεόφωνη", - "χειλόφωνο": "χειλεόφωνο", - "χειροδύναµος": "χεροδύναµος", - "χειροδύναµη": "χεροδύναµη", - "χειροδύναµο": "χεροδύναµο", - "χηράµενος": "χηρευάµενος", - "χηράµενη": "χηρευάµενη", - "χηράµενο": "χηρευάµενο", - "χλωµός": "χλοµός", - "χλωµή": "χλοµή", - "χλωµό": "χλοµό", - "χνουδάτος": "χνουδωτός", - "χνουδάτη": "χνουδωτή", - "χνουδάτο": "χνουδωτό", - "χονδρός": "χοντρός", - "χονδρή": "χοντρή", - "χονδρό": "χοντρό", - "χουβαρντάδικος": "χουβαρντάς", - "χουβαρντάδικοςή": "χουβαρντάςη", - "χουβαρντάδικοςό": "χουβαρντάςο", - "χρεολυτικός": "χρεωλυτικός", - "χρεολυτική": "χρεωλυτική", - "χρεολυτικό": "χρεωλυτικό", - "χρησµοδοτικός": "χρησµοδοσία", - "χρησµοδοτική": "χρησµοδοσίαη", - "χρησµοδοτικό": "χρησµοδοσίαο", - "χρυσόπλεχτος": "χρυσόπλεκτος", - "χρυσόπλεχτη": "χρυσόπλεκτη", - "χρυσόπλεχτο": "χρυσόπλεκτο", - "χτεσινός": "χθεσινός", - "χτεσινή": "χθεσινή", - "χτεσινό": "χθεσινό", - "χτιστός": "κτιστός", - "χτιστή": "κτιστή", - "χτιστό": "κτιστό", - "αντρείος": "ανδρείος", - "αντρεία": "ανδρεία", - "αντρείο": "ανδρείο", - "αποποµπαίος": "αποδιοποµπαίος", - "αποποµπαία": "αποδιοποµπαία", - "αποποµπαίο": "αποδιοποµπαίο", - "γεραλεος": "γηραλέος", - "γεραλεα": "γηραλέα", - "γεραλεο": "γηραλέο", - "εντόπιος": "ντόπιος", - "εντόπια": "ντόπια", - 
"εντόπιο": "ντόπιο", - "εφταπλάσιος": "επταπλάσιος", - "εφταπλάσια": "επταπλάσια", - "εφταπλάσιο": "επταπλάσιο", - "ζούφιος": "τζούφιος", - "ζούφια": "τζούφια", - "ζούφιο": "τζούφιο", - "καθάριος": "καθάρειος", - "καθάρια": "καθάρεια", - "καθάριο": "καθάρειο", - "λαφήσιος": "ελαφήσιος", - "λαφήσια": "ελαφήσια", - "λαφήσιο": "ελαφήσιο", - "οκταθέσιος": "οχταθέσιος", - "οκταθέσια": "οχταθέσια", - "οκταθέσιο": "οχταθέσιο", - "ονυχαίος": "ονυχιαίος", - "ονυχαία": "ονυχιαία", - "ονυχαίο": "ονυχιαίο", - "οχταπλάσιος": "οκταπλάσιος", - "οχταπλάσια": "οκταπλάσια", - "οχταπλάσιο": "οκταπλάσιο", - "βοϊδήσιος": "βοδινός", - "βοϊδήσια": "βοδινή", - "βοϊδήσιο": "βοδινό", - "καλαµποκίσιος": "καλαµποκήσιος", - "καλαµποκίσια": "καλαµποκήσια", - "καλαµποκίσιο": "καλαµποκήσιο", - "κεφαλίσιος": "κεφαλήσιος", - "κεφαλίσια": "κεφαλήσια", - "κεφαλίσιο": "κεφαλήσιο", - "κρουσταλλένιος": "κρυσταλλένιος", - "κρουσταλλένια": "κρυσταλλένια", - "κρουσταλλένιο": "κρυσταλλένιο", - "µοσκαρήσιος": "µοσχαρήσιος", - "µοσκαρήσια": "µοσχαρήσια", - "µοσκαρήσιο": "µοσχαρήσιο", - "παλικαρήσιος": "παλληκαρήσιος", - "παλικαρήσια": "παλληκαρήσια", - "παλικαρήσιο": "παλληκαρήσιο", - "πετρένιος": "πέτρινος", - "πετρένια": "πέτρινη", - "πετρένιο": "πέτρινο", - "σιταρένιος": "σταρένιος", - "σιταρένια": "σταρένια", - "σιταρένιο": "σταρένιο", - "σκυλίσιος": "σκυλήσιος", - "σκυλίσια": "σκυλήσια", - "σκυλίσιο": "σκυλήσιο", - "χελίσιος": "χελήσιος", - "χελίσια": "χελήσια", - "χελίσιο": "χελήσιο", - "χελωνίσιος": "χελωνήσιος", - "χελωνίσια": "χελωνήσια", - "χελωνίσιο": "χελωνήσιο", - "γουρσούζης": "γρουσούζης", - "γουρσούζα": "γρουσούζα", - "γουρσούζικο": "γρουσούζικο", - "γρινιάρης": "γκρινιάρης", - "γρινιάρα": "γκρινιάρα", - "γρινιάρικο": "γκρινιάρικο", - "λιχούδης": "λειχούδης", - "λιχούδα": "λειχούδα", - "λιχούδικο": "λειχούδικο", - "µαργιόλής": "µαριόλης", - "µαργιόλήςα": "µαριόλα", - "µαργιόλήςικο": "µαριόλικο", - "ξεκουτιάρης": "ξεκούτης", - "ξεκουτιάρα": "ξεκούτα", - "ξεκουτιάρικο": "ξεκούτικο", - "σκανδαλιάρης": "σκανταλιάρης", - "σκανδαλιάρα": "σκανταλιάρα", - "σκανδαλιάρικο": "σκανταλιάρικο", - "τσιγκούνης": "τσιγγούνης", - "τσιγκούνα": "τσιγγούνα", - "τσιγκούνικο": "τσιγγούνικο", -} - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index fca4e01e7..4304b3c6a 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -10,10 +9,9 @@ from .morph_rules import MORPH_RULES from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc def _return_en(_): @@ -24,9 +22,6 @@ class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = _return_en - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/lang/en/norm_exceptions.py 
b/spacy/lang/en/norm_exceptions.py deleted file mode 100644 index a2cf58b8a..000000000 --- a/spacy/lang/en/norm_exceptions.py +++ /dev/null @@ -1,1768 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -_exc = { - # Slang and abbreviations - "cos": "because", - "cuz": "because", - "fav": "favorite", - "fave": "favorite", - "misc": "miscellaneous", - "plz": "please", - "pls": "please", - "thx": "thanks", - # US vs. UK spelling - "accessorise": "accessorize", - "accessorised": "accessorized", - "accessorises": "accessorizes", - "accessorising": "accessorizing", - "acclimatisation": "acclimatization", - "acclimatise": "acclimatize", - "acclimatised": "acclimatized", - "acclimatises": "acclimatizes", - "acclimatising": "acclimatizing", - "accoutrements": "accouterments", - "aeon": "eon", - "aeons": "eons", - "aerogramme": "aerogram", - "aerogrammes": "aerograms", - "aeroplane": "airplane", - "aeroplanes ": "airplanes ", - "aesthete": "esthete", - "aesthetes": "esthetes", - "aesthetic": "esthetic", - "aesthetically": "esthetically", - "aesthetics": "esthetics", - "aetiology": "etiology", - "ageing": "aging", - "aggrandisement": "aggrandizement", - "agonise": "agonize", - "agonised": "agonized", - "agonises": "agonizes", - "agonising": "agonizing", - "agonisingly": "agonizingly", - "almanack": "almanac", - "almanacks": "almanacs", - "aluminium": "aluminum", - "amortisable": "amortizable", - "amortisation": "amortization", - "amortisations": "amortizations", - "amortise": "amortize", - "amortised": "amortized", - "amortises": "amortizes", - "amortising": "amortizing", - "amphitheatre": "amphitheater", - "amphitheatres": "amphitheaters", - "anaemia": "anemia", - "anaemic": "anemic", - "anaesthesia": "anesthesia", - "anaesthetic": "anesthetic", - "anaesthetics": "anesthetics", - "anaesthetise": "anesthetize", - "anaesthetised": "anesthetized", - "anaesthetises": "anesthetizes", - "anaesthetising": "anesthetizing", - "anaesthetist": "anesthetist", - "anaesthetists": "anesthetists", - "anaesthetize": "anesthetize", - "anaesthetized": "anesthetized", - "anaesthetizes": "anesthetizes", - "anaesthetizing": "anesthetizing", - "analogue": "analog", - "analogues": "analogs", - "analyse": "analyze", - "analysed": "analyzed", - "analyses": "analyzes", - "analysing": "analyzing", - "anglicise": "anglicize", - "anglicised": "anglicized", - "anglicises": "anglicizes", - "anglicising": "anglicizing", - "annualised": "annualized", - "antagonise": "antagonize", - "antagonised": "antagonized", - "antagonises": "antagonizes", - "antagonising": "antagonizing", - "apologise": "apologize", - "apologised": "apologized", - "apologises": "apologizes", - "apologising": "apologizing", - "appal": "appall", - "appals": "appalls", - "appetiser": "appetizer", - "appetisers": "appetizers", - "appetising": "appetizing", - "appetisingly": "appetizingly", - "arbour": "arbor", - "arbours": "arbors", - "archaeological": "archeological", - "archaeologically": "archeologically", - "archaeologist": "archeologist", - "archaeologists": "archeologists", - "archaeology": "archeology", - "ardour": "ardor", - "armour": "armor", - "armoured": "armored", - "armourer": "armorer", - "armourers": "armorers", - "armouries": "armories", - "armoury": "armory", - "artefact": "artifact", - "artefacts": "artifacts", - "authorise": "authorize", - "authorised": "authorized", - "authorises": "authorizes", - "authorising": "authorizing", - "axe": "ax", - "backpedalled": "backpedaled", - "backpedalling": "backpedaling", - "bannister": 
"banister", - "bannisters": "banisters", - "baptise": "baptize", - "baptised": "baptized", - "baptises": "baptizes", - "baptising": "baptizing", - "bastardise": "bastardize", - "bastardised": "bastardized", - "bastardises": "bastardizes", - "bastardising": "bastardizing", - "battleaxe": "battleax", - "baulk": "balk", - "baulked": "balked", - "baulking": "balking", - "baulks": "balks", - "bedevilled": "bedeviled", - "bedevilling": "bedeviling", - "behaviour": "behavior", - "behavioural": "behavioral", - "behaviourism": "behaviorism", - "behaviourist": "behaviorist", - "behaviourists": "behaviorists", - "behaviours": "behaviors", - "behove": "behoove", - "behoved": "behooved", - "behoves": "behooves", - "bejewelled": "bejeweled", - "belabour": "belabor", - "belaboured": "belabored", - "belabouring": "belaboring", - "belabours": "belabors", - "bevelled": "beveled", - "bevvies": "bevies", - "bevvy": "bevy", - "biassed": "biased", - "biassing": "biasing", - "bingeing": "binging", - "bougainvillaea": "bougainvillea", - "bougainvillaeas": "bougainvilleas", - "bowdlerise": "bowdlerize", - "bowdlerised": "bowdlerized", - "bowdlerises": "bowdlerizes", - "bowdlerising": "bowdlerizing", - "breathalyse": "breathalyze", - "breathalysed": "breathalyzed", - "breathalyser": "breathalyzer", - "breathalysers": "breathalyzers", - "breathalyses": "breathalyzes", - "breathalysing": "breathalyzing", - "brutalise": "brutalize", - "brutalised": "brutalized", - "brutalises": "brutalizes", - "brutalising": "brutalizing", - "buses": "busses", - "busing": "bussing", - "caesarean": "cesarean", - "caesareans": "cesareans", - "calibre": "caliber", - "calibres": "calibers", - "calliper": "caliper", - "callipers": "calipers", - "callisthenics": "calisthenics", - "canalise": "canalize", - "canalised": "canalized", - "canalises": "canalizes", - "canalising": "canalizing", - "cancellation": "cancelation", - "cancellations": "cancelations", - "cancelled": "canceled", - "cancelling": "canceling", - "candour": "candor", - "cannibalise": "cannibalize", - "cannibalised": "cannibalized", - "cannibalises": "cannibalizes", - "cannibalising": "cannibalizing", - "canonise": "canonize", - "canonised": "canonized", - "canonises": "canonizes", - "canonising": "canonizing", - "capitalise": "capitalize", - "capitalised": "capitalized", - "capitalises": "capitalizes", - "capitalising": "capitalizing", - "caramelise": "caramelize", - "caramelised": "caramelized", - "caramelises": "caramelizes", - "caramelising": "caramelizing", - "carbonise": "carbonize", - "carbonised": "carbonized", - "carbonises": "carbonizes", - "carbonising": "carbonizing", - "carolled": "caroled", - "carolling": "caroling", - "catalogue": "catalog", - "catalogued": "cataloged", - "catalogues": "catalogs", - "cataloguing": "cataloging", - "catalyse": "catalyze", - "catalysed": "catalyzed", - "catalyses": "catalyzes", - "catalysing": "catalyzing", - "categorise": "categorize", - "categorised": "categorized", - "categorises": "categorizes", - "categorising": "categorizing", - "cauterise": "cauterize", - "cauterised": "cauterized", - "cauterises": "cauterizes", - "cauterising": "cauterizing", - "cavilled": "caviled", - "cavilling": "caviling", - "centigramme": "centigram", - "centigrammes": "centigrams", - "centilitre": "centiliter", - "centilitres": "centiliters", - "centimetre": "centimeter", - "centimetres": "centimeters", - "centralise": "centralize", - "centralised": "centralized", - "centralises": "centralizes", - "centralising": "centralizing", - "centre": "center", 
- "centred": "centered", - "centrefold": "centerfold", - "centrefolds": "centerfolds", - "centrepiece": "centerpiece", - "centrepieces": "centerpieces", - "centres": "centers", - "channelled": "channeled", - "channelling": "channeling", - "characterise": "characterize", - "characterised": "characterized", - "characterises": "characterizes", - "characterising": "characterizing", - "cheque": "check", - "chequebook": "checkbook", - "chequebooks": "checkbooks", - "chequered": "checkered", - "cheques": "checks", - "chilli": "chili", - "chimaera": "chimera", - "chimaeras": "chimeras", - "chiselled": "chiseled", - "chiselling": "chiseling", - "circularise": "circularize", - "circularised": "circularized", - "circularises": "circularizes", - "circularising": "circularizing", - "civilise": "civilize", - "civilised": "civilized", - "civilises": "civilizes", - "civilising": "civilizing", - "clamour": "clamor", - "clamoured": "clamored", - "clamouring": "clamoring", - "clamours": "clamors", - "clangour": "clangor", - "clarinettist": "clarinetist", - "clarinettists": "clarinetists", - "collectivise": "collectivize", - "collectivised": "collectivized", - "collectivises": "collectivizes", - "collectivising": "collectivizing", - "colonisation": "colonization", - "colonise": "colonize", - "colonised": "colonized", - "coloniser": "colonizer", - "colonisers": "colonizers", - "colonises": "colonizes", - "colonising": "colonizing", - "colour": "color", - "colourant": "colorant", - "colourants": "colorants", - "coloured": "colored", - "coloureds": "coloreds", - "colourful": "colorful", - "colourfully": "colorfully", - "colouring": "coloring", - "colourize": "colorize", - "colourized": "colorized", - "colourizes": "colorizes", - "colourizing": "colorizing", - "colourless": "colorless", - "colours": "colors", - "commercialise": "commercialize", - "commercialised": "commercialized", - "commercialises": "commercializes", - "commercialising": "commercializing", - "compartmentalise": "compartmentalize", - "compartmentalised": "compartmentalized", - "compartmentalises": "compartmentalizes", - "compartmentalising": "compartmentalizing", - "computerise": "computerize", - "computerised": "computerized", - "computerises": "computerizes", - "computerising": "computerizing", - "conceptualise": "conceptualize", - "conceptualised": "conceptualized", - "conceptualises": "conceptualizes", - "conceptualising": "conceptualizing", - "connexion": "connection", - "connexions": "connections", - "contextualise": "contextualize", - "contextualised": "contextualized", - "contextualises": "contextualizes", - "contextualising": "contextualizing", - "cosier": "cozier", - "cosies": "cozies", - "cosiest": "coziest", - "cosily": "cozily", - "cosiness": "coziness", - "cosy": "cozy", - "councillor": "councilor", - "councillors": "councilors", - "counselled": "counseled", - "counselling": "counseling", - "counsellor": "counselor", - "counsellors": "counselors", - "crenellated": "crenelated", - "criminalise": "criminalize", - "criminalised": "criminalized", - "criminalises": "criminalizes", - "criminalising": "criminalizing", - "criticise": "criticize", - "criticised": "criticized", - "criticises": "criticizes", - "criticising": "criticizing", - "crueller": "crueler", - "cruellest": "cruelest", - "crystallisation": "crystallization", - "crystallise": "crystallize", - "crystallised": "crystallized", - "crystallises": "crystallizes", - "crystallising": "crystallizing", - "cudgelled": "cudgeled", - "cudgelling": "cudgeling", - "customise": 
"customize", - "customised": "customized", - "customises": "customizes", - "customising": "customizing", - "cypher": "cipher", - "cyphers": "ciphers", - "decentralisation": "decentralization", - "decentralise": "decentralize", - "decentralised": "decentralized", - "decentralises": "decentralizes", - "decentralising": "decentralizing", - "decriminalisation": "decriminalization", - "decriminalise": "decriminalize", - "decriminalised": "decriminalized", - "decriminalises": "decriminalizes", - "decriminalising": "decriminalizing", - "defence": "defense", - "defenceless": "defenseless", - "defences": "defenses", - "dehumanisation": "dehumanization", - "dehumanise": "dehumanize", - "dehumanised": "dehumanized", - "dehumanises": "dehumanizes", - "dehumanising": "dehumanizing", - "demeanour": "demeanor", - "demilitarisation": "demilitarization", - "demilitarise": "demilitarize", - "demilitarised": "demilitarized", - "demilitarises": "demilitarizes", - "demilitarising": "demilitarizing", - "demobilisation": "demobilization", - "demobilise": "demobilize", - "demobilised": "demobilized", - "demobilises": "demobilizes", - "demobilising": "demobilizing", - "democratisation": "democratization", - "democratise": "democratize", - "democratised": "democratized", - "democratises": "democratizes", - "democratising": "democratizing", - "demonise": "demonize", - "demonised": "demonized", - "demonises": "demonizes", - "demonising": "demonizing", - "demoralisation": "demoralization", - "demoralise": "demoralize", - "demoralised": "demoralized", - "demoralises": "demoralizes", - "demoralising": "demoralizing", - "denationalisation": "denationalization", - "denationalise": "denationalize", - "denationalised": "denationalized", - "denationalises": "denationalizes", - "denationalising": "denationalizing", - "deodorise": "deodorize", - "deodorised": "deodorized", - "deodorises": "deodorizes", - "deodorising": "deodorizing", - "depersonalise": "depersonalize", - "depersonalised": "depersonalized", - "depersonalises": "depersonalizes", - "depersonalising": "depersonalizing", - "deputise": "deputize", - "deputised": "deputized", - "deputises": "deputizes", - "deputising": "deputizing", - "desensitisation": "desensitization", - "desensitise": "desensitize", - "desensitised": "desensitized", - "desensitises": "desensitizes", - "desensitising": "desensitizing", - "destabilisation": "destabilization", - "destabilise": "destabilize", - "destabilised": "destabilized", - "destabilises": "destabilizes", - "destabilising": "destabilizing", - "dialled": "dialed", - "dialling": "dialing", - "dialogue": "dialog", - "dialogues": "dialogs", - "diarrhoea": "diarrhea", - "digitise": "digitize", - "digitised": "digitized", - "digitises": "digitizes", - "digitising": "digitizing", - "disc": "disk", - "discolour": "discolor", - "discoloured": "discolored", - "discolouring": "discoloring", - "discolours": "discolors", - "discs": "disks", - "disembowelled": "disemboweled", - "disembowelling": "disemboweling", - "disfavour": "disfavor", - "dishevelled": "disheveled", - "dishonour": "dishonor", - "dishonourable": "dishonorable", - "dishonourably": "dishonorably", - "dishonoured": "dishonored", - "dishonouring": "dishonoring", - "dishonours": "dishonors", - "disorganisation": "disorganization", - "disorganised": "disorganized", - "distil": "distill", - "distils": "distills", - "doin": "doing", - "doin'": "doing", - "dramatisation": "dramatization", - "dramatisations": "dramatizations", - "dramatise": "dramatize", - "dramatised": "dramatized", 
- "dramatises": "dramatizes", - "dramatising": "dramatizing", - "draught": "draft", - "draughtboard": "draftboard", - "draughtboards": "draftboards", - "draughtier": "draftier", - "draughtiest": "draftiest", - "draughts": "drafts", - "draughtsman": "draftsman", - "draughtsmanship": "draftsmanship", - "draughtsmen": "draftsmen", - "draughtswoman": "draftswoman", - "draughtswomen": "draftswomen", - "draughty": "drafty", - "drivelled": "driveled", - "drivelling": "driveling", - "duelled": "dueled", - "duelling": "dueling", - "economise": "economize", - "economised": "economized", - "economises": "economizes", - "economising": "economizing", - "edoema": "edema ", - "editorialise": "editorialize", - "editorialised": "editorialized", - "editorialises": "editorializes", - "editorialising": "editorializing", - "empathise": "empathize", - "empathised": "empathized", - "empathises": "empathizes", - "empathising": "empathizing", - "emphasise": "emphasize", - "emphasised": "emphasized", - "emphasises": "emphasizes", - "emphasising": "emphasizing", - "enamelled": "enameled", - "enamelling": "enameling", - "enamoured": "enamored", - "encyclopaedia": "encyclopedia", - "encyclopaedias": "encyclopedias", - "encyclopaedic": "encyclopedic", - "endeavour": "endeavor", - "endeavoured": "endeavored", - "endeavouring": "endeavoring", - "endeavours": "endeavors", - "energise": "energize", - "energised": "energized", - "energises": "energizes", - "energising": "energizing", - "enrol": "enroll", - "enrols": "enrolls", - "enthral": "enthrall", - "enthrals": "enthralls", - "epaulette": "epaulet", - "epaulettes": "epaulets", - "epicentre": "epicenter", - "epicentres": "epicenters", - "epilogue": "epilog", - "epilogues": "epilogs", - "epitomise": "epitomize", - "epitomised": "epitomized", - "epitomises": "epitomizes", - "epitomising": "epitomizing", - "equalisation": "equalization", - "equalise": "equalize", - "equalised": "equalized", - "equaliser": "equalizer", - "equalisers": "equalizers", - "equalises": "equalizes", - "equalising": "equalizing", - "eulogise": "eulogize", - "eulogised": "eulogized", - "eulogises": "eulogizes", - "eulogising": "eulogizing", - "evangelise": "evangelize", - "evangelised": "evangelized", - "evangelises": "evangelizes", - "evangelising": "evangelizing", - "exorcise": "exorcize", - "exorcised": "exorcized", - "exorcises": "exorcizes", - "exorcising": "exorcizing", - "extemporisation": "extemporization", - "extemporise": "extemporize", - "extemporised": "extemporized", - "extemporises": "extemporizes", - "extemporising": "extemporizing", - "externalisation": "externalization", - "externalisations": "externalizations", - "externalise": "externalize", - "externalised": "externalized", - "externalises": "externalizes", - "externalising": "externalizing", - "factorise": "factorize", - "factorised": "factorized", - "factorises": "factorizes", - "factorising": "factorizing", - "faecal": "fecal", - "faeces": "feces", - "familiarisation": "familiarization", - "familiarise": "familiarize", - "familiarised": "familiarized", - "familiarises": "familiarizes", - "familiarising": "familiarizing", - "fantasise": "fantasize", - "fantasised": "fantasized", - "fantasises": "fantasizes", - "fantasising": "fantasizing", - "favour": "favor", - "favourable": "favorable", - "favourably": "favorably", - "favoured": "favored", - "favouring": "favoring", - "favourite": "favorite", - "favourites": "favorites", - "favouritism": "favoritism", - "favours": "favors", - "feminise": "feminize", - "feminised": "feminized", 
- "feminises": "feminizes", - "feminising": "feminizing", - "fertilisation": "fertilization", - "fertilise": "fertilize", - "fertilised": "fertilized", - "fertiliser": "fertilizer", - "fertilisers": "fertilizers", - "fertilises": "fertilizes", - "fertilising": "fertilizing", - "fervour": "fervor", - "fibre": "fiber", - "fibreglass": "fiberglass", - "fibres": "fibers", - "fictionalisation": "fictionalization", - "fictionalisations": "fictionalizations", - "fictionalise": "fictionalize", - "fictionalised": "fictionalized", - "fictionalises": "fictionalizes", - "fictionalising": "fictionalizing", - "fillet": "filet", - "filleted ": "fileted ", - "filleting": "fileting", - "fillets ": "filets ", - "finalisation": "finalization", - "finalise": "finalize", - "finalised": "finalized", - "finalises": "finalizes", - "finalising": "finalizing", - "flautist": "flutist", - "flautists": "flutists", - "flavour": "flavor", - "flavoured": "flavored", - "flavouring": "flavoring", - "flavourings": "flavorings", - "flavourless": "flavorless", - "flavours": "flavors", - "flavoursome": "flavorsome", - "flyer / flier ": "flier / flyer ", - "foetal": "fetal", - "foetid": "fetid", - "foetus": "fetus", - "foetuses": "fetuses", - "formalisation": "formalization", - "formalise": "formalize", - "formalised": "formalized", - "formalises": "formalizes", - "formalising": "formalizing", - "fossilisation": "fossilization", - "fossilise": "fossilize", - "fossilised": "fossilized", - "fossilises": "fossilizes", - "fossilising": "fossilizing", - "fraternisation": "fraternization", - "fraternise": "fraternize", - "fraternised": "fraternized", - "fraternises": "fraternizes", - "fraternising": "fraternizing", - "fulfil": "fulfill", - "fulfilment": "fulfillment", - "fulfils": "fulfills", - "funnelled": "funneled", - "funnelling": "funneling", - "galvanise": "galvanize", - "galvanised": "galvanized", - "galvanises": "galvanizes", - "galvanising": "galvanizing", - "gambolled": "gamboled", - "gambolling": "gamboling", - "gaol": "jail", - "gaolbird": "jailbird", - "gaolbirds": "jailbirds", - "gaolbreak": "jailbreak", - "gaolbreaks": "jailbreaks", - "gaoled": "jailed", - "gaoler": "jailer", - "gaolers": "jailers", - "gaoling": "jailing", - "gaols": "jails", - "gases": "gasses", - "gauge": "gage", - "gauged": "gaged", - "gauges": "gages", - "gauging": "gaging", - "generalisation": "generalization", - "generalisations": "generalizations", - "generalise": "generalize", - "generalised": "generalized", - "generalises": "generalizes", - "generalising": "generalizing", - "ghettoise": "ghettoize", - "ghettoised": "ghettoized", - "ghettoises": "ghettoizes", - "ghettoising": "ghettoizing", - "gipsies": "gypsies", - "glamorise": "glamorize", - "glamorised": "glamorized", - "glamorises": "glamorizes", - "glamorising": "glamorizing", - "glamour": "glamor", - "globalisation": "globalization", - "globalise": "globalize", - "globalised": "globalized", - "globalises": "globalizes", - "globalising": "globalizing", - "glueing ": "gluing ", - "goin": "going", - "goin'": "going", - "goitre": "goiter", - "goitres": "goiters", - "gonorrhoea": "gonorrhea", - "gramme": "gram", - "grammes": "grams", - "gravelled": "graveled", - "grey": "gray", - "greyed": "grayed", - "greying": "graying", - "greyish": "grayish", - "greyness": "grayness", - "greys": "grays", - "grovelled": "groveled", - "grovelling": "groveling", - "groyne": "groin", - "groynes ": "groins", - "gruelling": "grueling", - "gruellingly": "gruelingly", - "gryphon": "griffin", - "gryphons": 
"griffins", - "gynaecological": "gynecological", - "gynaecologist": "gynecologist", - "gynaecologists": "gynecologists", - "gynaecology": "gynecology", - "haematological": "hematological", - "haematologist": "hematologist", - "haematologists": "hematologists", - "haematology": "hematology", - "haemoglobin": "hemoglobin", - "haemophilia": "hemophilia", - "haemophiliac": "hemophiliac", - "haemophiliacs": "hemophiliacs", - "haemorrhage": "hemorrhage", - "haemorrhaged": "hemorrhaged", - "haemorrhages": "hemorrhages", - "haemorrhaging": "hemorrhaging", - "haemorrhoids": "hemorrhoids", - "harbour": "harbor", - "harboured": "harbored", - "harbouring": "harboring", - "harbours": "harbors", - "harmonisation": "harmonization", - "harmonise": "harmonize", - "harmonised": "harmonized", - "harmonises": "harmonizes", - "harmonising": "harmonizing", - "havin": "having", - "havin'": "having", - "homoeopath": "homeopath", - "homoeopathic": "homeopathic", - "homoeopaths": "homeopaths", - "homoeopathy": "homeopathy", - "homogenise": "homogenize", - "homogenised": "homogenized", - "homogenises": "homogenizes", - "homogenising": "homogenizing", - "honour": "honor", - "honourable": "honorable", - "honourably": "honorably", - "honoured": "honored", - "honouring": "honoring", - "honours": "honors", - "hospitalisation": "hospitalization", - "hospitalise": "hospitalize", - "hospitalised": "hospitalized", - "hospitalises": "hospitalizes", - "hospitalising": "hospitalizing", - "humanise": "humanize", - "humanised": "humanized", - "humanises": "humanizes", - "humanising": "humanizing", - "humour": "humor", - "humoured": "humored", - "humouring": "humoring", - "humourless": "humorless", - "humours": "humors", - "hybridise": "hybridize", - "hybridised": "hybridized", - "hybridises": "hybridizes", - "hybridising": "hybridizing", - "hypnotise": "hypnotize", - "hypnotised": "hypnotized", - "hypnotises": "hypnotizes", - "hypnotising": "hypnotizing", - "hypothesise": "hypothesize", - "hypothesised": "hypothesized", - "hypothesises": "hypothesizes", - "hypothesising": "hypothesizing", - "idealisation": "idealization", - "idealise": "idealize", - "idealised": "idealized", - "idealises": "idealizes", - "idealising": "idealizing", - "idolise": "idolize", - "idolised": "idolized", - "idolises": "idolizes", - "idolising": "idolizing", - "immobilisation": "immobilization", - "immobilise": "immobilize", - "immobilised": "immobilized", - "immobiliser": "immobilizer", - "immobilisers": "immobilizers", - "immobilises": "immobilizes", - "immobilising": "immobilizing", - "immortalise": "immortalize", - "immortalised": "immortalized", - "immortalises": "immortalizes", - "immortalising": "immortalizing", - "immunisation": "immunization", - "immunise": "immunize", - "immunised": "immunized", - "immunises": "immunizes", - "immunising": "immunizing", - "impanelled": "impaneled", - "impanelling": "impaneling", - "imperilled": "imperiled", - "imperilling": "imperiling", - "individualise": "individualize", - "individualised": "individualized", - "individualises": "individualizes", - "individualising": "individualizing", - "industrialise": "industrialize", - "industrialised": "industrialized", - "industrialises": "industrializes", - "industrialising": "industrializing", - "inflexion": "inflection", - "inflexions": "inflections", - "initialise": "initialize", - "initialised": "initialized", - "initialises": "initializes", - "initialising": "initializing", - "initialled": "initialed", - "initialling": "initialing", - "instal": "install", - 
"instalment": "installment", - "instalments": "installments", - "instals": "installs", - "instil": "instill", - "instils": "instills", - "institutionalisation": "institutionalization", - "institutionalise": "institutionalize", - "institutionalised": "institutionalized", - "institutionalises": "institutionalizes", - "institutionalising": "institutionalizing", - "intellectualise": "intellectualize", - "intellectualised": "intellectualized", - "intellectualises": "intellectualizes", - "intellectualising": "intellectualizing", - "internalisation": "internalization", - "internalise": "internalize", - "internalised": "internalized", - "internalises": "internalizes", - "internalising": "internalizing", - "internationalisation": "internationalization", - "internationalise": "internationalize", - "internationalised": "internationalized", - "internationalises": "internationalizes", - "internationalising": "internationalizing", - "ionisation": "ionization", - "ionise": "ionize", - "ionised": "ionized", - "ioniser": "ionizer", - "ionisers": "ionizers", - "ionises": "ionizes", - "ionising": "ionizing", - "italicise": "italicize", - "italicised": "italicized", - "italicises": "italicizes", - "italicising": "italicizing", - "itemise": "itemize", - "itemised": "itemized", - "itemises": "itemizes", - "itemising": "itemizing", - "jeopardise": "jeopardize", - "jeopardised": "jeopardized", - "jeopardises": "jeopardizes", - "jeopardising": "jeopardizing", - "jewelled": "jeweled", - "jeweller": "jeweler", - "jewellers": "jewelers", - "jewellery": "jewelry", - "judgement ": "judgment", - "kilogramme": "kilogram", - "kilogrammes": "kilograms", - "kilometre": "kilometer", - "kilometres": "kilometers", - "labelled": "labeled", - "labelling": "labeling", - "labour": "labor", - "laboured": "labored", - "labourer": "laborer", - "labourers": "laborers", - "labouring": "laboring", - "labours": "labors", - "lacklustre": "lackluster", - "legalisation": "legalization", - "legalise": "legalize", - "legalised": "legalized", - "legalises": "legalizes", - "legalising": "legalizing", - "legitimise": "legitimize", - "legitimised": "legitimized", - "legitimises": "legitimizes", - "legitimising": "legitimizing", - "leukaemia": "leukemia", - "levelled": "leveled", - "leveller": "leveler", - "levellers": "levelers", - "levelling": "leveling", - "libelled": "libeled", - "libelling": "libeling", - "libellous": "libelous", - "liberalisation": "liberalization", - "liberalise": "liberalize", - "liberalised": "liberalized", - "liberalises": "liberalizes", - "liberalising": "liberalizing", - "licence": "license", - "licenced": "licensed", - "licences": "licenses", - "licencing": "licensing", - "likeable": "likable ", - "lionisation": "lionization", - "lionise": "lionize", - "lionised": "lionized", - "lionises": "lionizes", - "lionising": "lionizing", - "liquidise": "liquidize", - "liquidised": "liquidized", - "liquidiser": "liquidizer", - "liquidisers": "liquidizers", - "liquidises": "liquidizes", - "liquidising": "liquidizing", - "litre": "liter", - "litres": "liters", - "localise": "localize", - "localised": "localized", - "localises": "localizes", - "localising": "localizing", - "lovin": "loving", - "lovin'": "loving", - "louvre": "louver", - "louvred": "louvered", - "louvres": "louvers ", - "lustre": "luster", - "magnetise": "magnetize", - "magnetised": "magnetized", - "magnetises": "magnetizes", - "magnetising": "magnetizing", - "manoeuvrability": "maneuverability", - "manoeuvrable": "maneuverable", - "manoeuvre": "maneuver", - 
"manoeuvred": "maneuvered", - "manoeuvres": "maneuvers", - "manoeuvring": "maneuvering", - "manoeuvrings": "maneuverings", - "marginalisation": "marginalization", - "marginalise": "marginalize", - "marginalised": "marginalized", - "marginalises": "marginalizes", - "marginalising": "marginalizing", - "marshalled": "marshaled", - "marshalling": "marshaling", - "marvelled": "marveled", - "marvelling": "marveling", - "marvellous": "marvelous", - "marvellously": "marvelously", - "materialisation": "materialization", - "materialise": "materialize", - "materialised": "materialized", - "materialises": "materializes", - "materialising": "materializing", - "maximisation": "maximization", - "maximise": "maximize", - "maximised": "maximized", - "maximises": "maximizes", - "maximising": "maximizing", - "meagre": "meager", - "mechanisation": "mechanization", - "mechanise": "mechanize", - "mechanised": "mechanized", - "mechanises": "mechanizes", - "mechanising": "mechanizing", - "mediaeval": "medieval", - "memorialise": "memorialize", - "memorialised": "memorialized", - "memorialises": "memorializes", - "memorialising": "memorializing", - "memorise": "memorize", - "memorised": "memorized", - "memorises": "memorizes", - "memorising": "memorizing", - "mesmerise": "mesmerize", - "mesmerised": "mesmerized", - "mesmerises": "mesmerizes", - "mesmerising": "mesmerizing", - "metabolise": "metabolize", - "metabolised": "metabolized", - "metabolises": "metabolizes", - "metabolising": "metabolizing", - "metre": "meter", - "metres": "meters", - "micrometre": "micrometer", - "micrometres": "micrometers", - "militarise": "militarize", - "militarised": "militarized", - "militarises": "militarizes", - "militarising": "militarizing", - "milligramme": "milligram", - "milligrammes": "milligrams", - "millilitre": "milliliter", - "millilitres": "milliliters", - "millimetre": "millimeter", - "millimetres": "millimeters", - "miniaturisation": "miniaturization", - "miniaturise": "miniaturize", - "miniaturised": "miniaturized", - "miniaturises": "miniaturizes", - "miniaturising": "miniaturizing", - "minibuses": "minibusses ", - "minimise": "minimize", - "minimised": "minimized", - "minimises": "minimizes", - "minimising": "minimizing", - "misbehaviour": "misbehavior", - "misdemeanour": "misdemeanor", - "misdemeanours": "misdemeanors", - "misspelt": "misspelled ", - "mitre": "miter", - "mitres": "miters", - "mobilisation": "mobilization", - "mobilise": "mobilize", - "mobilised": "mobilized", - "mobilises": "mobilizes", - "mobilising": "mobilizing", - "modelled": "modeled", - "modeller": "modeler", - "modellers": "modelers", - "modelling": "modeling", - "modernise": "modernize", - "modernised": "modernized", - "modernises": "modernizes", - "modernising": "modernizing", - "moisturise": "moisturize", - "moisturised": "moisturized", - "moisturiser": "moisturizer", - "moisturisers": "moisturizers", - "moisturises": "moisturizes", - "moisturising": "moisturizing", - "monologue": "monolog", - "monologues": "monologs", - "monopolisation": "monopolization", - "monopolise": "monopolize", - "monopolised": "monopolized", - "monopolises": "monopolizes", - "monopolising": "monopolizing", - "moralise": "moralize", - "moralised": "moralized", - "moralises": "moralizes", - "moralising": "moralizing", - "motorised": "motorized", - "mould": "mold", - "moulded": "molded", - "moulder": "molder", - "mouldered": "moldered", - "mouldering": "moldering", - "moulders": "molders", - "mouldier": "moldier", - "mouldiest": "moldiest", - "moulding": "molding", 
- "mouldings": "moldings", - "moulds": "molds", - "mouldy": "moldy", - "moult": "molt", - "moulted": "molted", - "moulting": "molting", - "moults": "molts", - "moustache": "mustache", - "moustached": "mustached", - "moustaches": "mustaches", - "moustachioed": "mustachioed", - "multicoloured": "multicolored", - "nationalisation": "nationalization", - "nationalisations": "nationalizations", - "nationalise": "nationalize", - "nationalised": "nationalized", - "nationalises": "nationalizes", - "nationalising": "nationalizing", - "naturalisation": "naturalization", - "naturalise": "naturalize", - "naturalised": "naturalized", - "naturalises": "naturalizes", - "naturalising": "naturalizing", - "neighbour": "neighbor", - "neighbourhood": "neighborhood", - "neighbourhoods": "neighborhoods", - "neighbouring": "neighboring", - "neighbourliness": "neighborliness", - "neighbourly": "neighborly", - "neighbours": "neighbors", - "neutralisation": "neutralization", - "neutralise": "neutralize", - "neutralised": "neutralized", - "neutralises": "neutralizes", - "neutralising": "neutralizing", - "normalisation": "normalization", - "normalise": "normalize", - "normalised": "normalized", - "normalises": "normalizes", - "normalising": "normalizing", - "odour": "odor", - "odourless": "odorless", - "odours": "odors", - "oesophagus": "esophagus", - "oesophaguses": "esophaguses", - "oestrogen": "estrogen", - "offence": "offense", - "offences": "offenses", - "omelette": "omelet", - "omelettes": "omelets", - "optimise": "optimize", - "optimised": "optimized", - "optimises": "optimizes", - "optimising": "optimizing", - "organisation": "organization", - "organisational": "organizational", - "organisations": "organizations", - "organise": "organize", - "organised": "organized", - "organiser": "organizer", - "organisers": "organizers", - "organises": "organizes", - "organising": "organizing", - "orthopaedic": "orthopedic", - "orthopaedics": "orthopedics", - "ostracise": "ostracize", - "ostracised": "ostracized", - "ostracises": "ostracizes", - "ostracising": "ostracizing", - "outmanoeuvre": "outmaneuver", - "outmanoeuvred": "outmaneuvered", - "outmanoeuvres": "outmaneuvers", - "outmanoeuvring": "outmaneuvering", - "overemphasise": "overemphasize", - "overemphasised": "overemphasized", - "overemphasises": "overemphasizes", - "overemphasising": "overemphasizing", - "oxidisation": "oxidization", - "oxidise": "oxidize", - "oxidised": "oxidized", - "oxidises": "oxidizes", - "oxidising": "oxidizing", - "paederast": "pederast", - "paederasts": "pederasts", - "paediatric": "pediatric", - "paediatrician": "pediatrician", - "paediatricians": "pediatricians", - "paediatrics": "pediatrics", - "paedophile": "pedophile", - "paedophiles": "pedophiles", - "paedophilia": "pedophilia", - "palaeolithic": "paleolithic", - "palaeontologist": "paleontologist", - "palaeontologists": "paleontologists", - "palaeontology": "paleontology", - "panelled": "paneled", - "panelling": "paneling", - "panellist": "panelist", - "panellists": "panelists", - "paralyse": "paralyze", - "paralysed": "paralyzed", - "paralyses": "paralyzes", - "paralysing": "paralyzing", - "parcelled": "parceled", - "parcelling": "parceling", - "parlour": "parlor", - "parlours": "parlors", - "particularise": "particularize", - "particularised": "particularized", - "particularises": "particularizes", - "particularising": "particularizing", - "passivisation": "passivization", - "passivise": "passivize", - "passivised": "passivized", - "passivises": "passivizes", - "passivising": 
"passivizing", - "pasteurisation": "pasteurization", - "pasteurise": "pasteurize", - "pasteurised": "pasteurized", - "pasteurises": "pasteurizes", - "pasteurising": "pasteurizing", - "patronise": "patronize", - "patronised": "patronized", - "patronises": "patronizes", - "patronising": "patronizing", - "patronisingly": "patronizingly", - "pedalled": "pedaled", - "pedalling": "pedaling", - "pedestrianisation": "pedestrianization", - "pedestrianise": "pedestrianize", - "pedestrianised": "pedestrianized", - "pedestrianises": "pedestrianizes", - "pedestrianising": "pedestrianizing", - "penalise": "penalize", - "penalised": "penalized", - "penalises": "penalizes", - "penalising": "penalizing", - "pencilled": "penciled", - "pencilling": "penciling", - "personalise": "personalize", - "personalised": "personalized", - "personalises": "personalizes", - "personalising": "personalizing", - "pharmacopoeia": "pharmacopeia", - "pharmacopoeias": "pharmacopeias", - "philosophise": "philosophize", - "philosophised": "philosophized", - "philosophises": "philosophizes", - "philosophising": "philosophizing", - "philtre": "filter", - "philtres": "filters", - "phoney ": "phony ", - "plagiarise": "plagiarize", - "plagiarised": "plagiarized", - "plagiarises": "plagiarizes", - "plagiarising": "plagiarizing", - "plough": "plow", - "ploughed": "plowed", - "ploughing": "plowing", - "ploughman": "plowman", - "ploughmen": "plowmen", - "ploughs": "plows", - "ploughshare": "plowshare", - "ploughshares": "plowshares", - "polarisation": "polarization", - "polarise": "polarize", - "polarised": "polarized", - "polarises": "polarizes", - "polarising": "polarizing", - "politicisation": "politicization", - "politicise": "politicize", - "politicised": "politicized", - "politicises": "politicizes", - "politicising": "politicizing", - "popularisation": "popularization", - "popularise": "popularize", - "popularised": "popularized", - "popularises": "popularizes", - "popularising": "popularizing", - "pouffe": "pouf", - "pouffes": "poufs", - "practise": "practice", - "practised": "practiced", - "practises": "practices", - "practising ": "practicing ", - "praesidium": "presidium", - "praesidiums ": "presidiums ", - "pressurisation": "pressurization", - "pressurise": "pressurize", - "pressurised": "pressurized", - "pressurises": "pressurizes", - "pressurising": "pressurizing", - "pretence": "pretense", - "pretences": "pretenses", - "primaeval": "primeval", - "prioritisation": "prioritization", - "prioritise": "prioritize", - "prioritised": "prioritized", - "prioritises": "prioritizes", - "prioritising": "prioritizing", - "privatisation": "privatization", - "privatisations": "privatizations", - "privatise": "privatize", - "privatised": "privatized", - "privatises": "privatizes", - "privatising": "privatizing", - "professionalisation": "professionalization", - "professionalise": "professionalize", - "professionalised": "professionalized", - "professionalises": "professionalizes", - "professionalising": "professionalizing", - "programme": "program", - "programmes": "programs", - "prologue": "prolog", - "prologues": "prologs", - "propagandise": "propagandize", - "propagandised": "propagandized", - "propagandises": "propagandizes", - "propagandising": "propagandizing", - "proselytise": "proselytize", - "proselytised": "proselytized", - "proselytiser": "proselytizer", - "proselytisers": "proselytizers", - "proselytises": "proselytizes", - "proselytising": "proselytizing", - "psychoanalyse": "psychoanalyze", - "psychoanalysed": 
"psychoanalyzed", - "psychoanalyses": "psychoanalyzes", - "psychoanalysing": "psychoanalyzing", - "publicise": "publicize", - "publicised": "publicized", - "publicises": "publicizes", - "publicising": "publicizing", - "pulverisation": "pulverization", - "pulverise": "pulverize", - "pulverised": "pulverized", - "pulverises": "pulverizes", - "pulverising": "pulverizing", - "pummelled": "pummel", - "pummelling": "pummeled", - "pyjama": "pajama", - "pyjamas": "pajamas", - "pzazz": "pizzazz", - "quarrelled": "quarreled", - "quarrelling": "quarreling", - "radicalise": "radicalize", - "radicalised": "radicalized", - "radicalises": "radicalizes", - "radicalising": "radicalizing", - "rancour": "rancor", - "randomise": "randomize", - "randomised": "randomized", - "randomises": "randomizes", - "randomising": "randomizing", - "rationalisation": "rationalization", - "rationalisations": "rationalizations", - "rationalise": "rationalize", - "rationalised": "rationalized", - "rationalises": "rationalizes", - "rationalising": "rationalizing", - "ravelled": "raveled", - "ravelling": "raveling", - "realisable": "realizable", - "realisation": "realization", - "realisations": "realizations", - "realise": "realize", - "realised": "realized", - "realises": "realizes", - "realising": "realizing", - "recognisable": "recognizable", - "recognisably": "recognizably", - "recognisance": "recognizance", - "recognise": "recognize", - "recognised": "recognized", - "recognises": "recognizes", - "recognising": "recognizing", - "reconnoitre": "reconnoiter", - "reconnoitred": "reconnoitered", - "reconnoitres": "reconnoiters", - "reconnoitring": "reconnoitering", - "refuelled": "refueled", - "refuelling": "refueling", - "regularisation": "regularization", - "regularise": "regularize", - "regularised": "regularized", - "regularises": "regularizes", - "regularising": "regularizing", - "remodelled": "remodeled", - "remodelling": "remodeling", - "remould": "remold", - "remoulded": "remolded", - "remoulding": "remolding", - "remoulds": "remolds", - "reorganisation": "reorganization", - "reorganisations": "reorganizations", - "reorganise": "reorganize", - "reorganised": "reorganized", - "reorganises": "reorganizes", - "reorganising": "reorganizing", - "revelled": "reveled", - "reveller": "reveler", - "revellers": "revelers", - "revelling": "reveling", - "revitalise": "revitalize", - "revitalised": "revitalized", - "revitalises": "revitalizes", - "revitalising": "revitalizing", - "revolutionise": "revolutionize", - "revolutionised": "revolutionized", - "revolutionises": "revolutionizes", - "revolutionising": "revolutionizing", - "rhapsodise": "rhapsodize", - "rhapsodised": "rhapsodized", - "rhapsodises": "rhapsodizes", - "rhapsodising": "rhapsodizing", - "rigour": "rigor", - "rigours": "rigors", - "ritualised": "ritualized", - "rivalled": "rivaled", - "rivalling": "rivaling", - "romanticise": "romanticize", - "romanticised": "romanticized", - "romanticises": "romanticizes", - "romanticising": "romanticizing", - "rumour": "rumor", - "rumoured": "rumored", - "rumours": "rumors", - "sabre": "saber", - "sabres": "sabers", - "saltpetre": "saltpeter", - "sanitise": "sanitize", - "sanitised": "sanitized", - "sanitises": "sanitizes", - "sanitising": "sanitizing", - "satirise": "satirize", - "satirised": "satirized", - "satirises": "satirizes", - "satirising": "satirizing", - "saviour": "savior", - "saviours": "saviors", - "savour": "savor", - "savoured": "savored", - "savouries": "savories", - "savouring": "savoring", - "savours": "savors", 
- "savoury": "savory", - "scandalise": "scandalize", - "scandalised": "scandalized", - "scandalises": "scandalizes", - "scandalising": "scandalizing", - "sceptic": "skeptic", - "sceptical": "skeptical", - "sceptically": "skeptically", - "scepticism": "skepticism", - "sceptics": "skeptics", - "sceptre": "scepter", - "sceptres": "scepters", - "scrutinise": "scrutinize", - "scrutinised": "scrutinized", - "scrutinises": "scrutinizes", - "scrutinising": "scrutinizing", - "secularisation": "secularization", - "secularise": "secularize", - "secularised": "secularized", - "secularises": "secularizes", - "secularising": "secularizing", - "sensationalise": "sensationalize", - "sensationalised": "sensationalized", - "sensationalises": "sensationalizes", - "sensationalising": "sensationalizing", - "sensitise": "sensitize", - "sensitised": "sensitized", - "sensitises": "sensitizes", - "sensitising": "sensitizing", - "sentimentalise": "sentimentalize", - "sentimentalised": "sentimentalized", - "sentimentalises": "sentimentalizes", - "sentimentalising": "sentimentalizing", - "sepulchre": "sepulcher", - "sepulchres": "sepulchers ", - "serialisation": "serialization", - "serialisations": "serializations", - "serialise": "serialize", - "serialised": "serialized", - "serialises": "serializes", - "serialising": "serializing", - "sermonise": "sermonize", - "sermonised": "sermonized", - "sermonises": "sermonizes", - "sermonising": "sermonizing", - "sheikh ": "sheik ", - "shovelled": "shoveled", - "shovelling": "shoveling", - "shrivelled": "shriveled", - "shrivelling": "shriveling", - "signalise": "signalize", - "signalised": "signalized", - "signalises": "signalizes", - "signalising": "signalizing", - "signalled": "signaled", - "signalling": "signaling", - "smoulder": "smolder", - "smouldered": "smoldered", - "smouldering": "smoldering", - "smoulders": "smolders", - "snivelled": "sniveled", - "snivelling": "sniveling", - "snorkelled": "snorkeled", - "snorkelling": "snorkeling", - "snowplough": "snowplow", - "snowploughs": "snowplow", - "socialisation": "socialization", - "socialise": "socialize", - "socialised": "socialized", - "socialises": "socializes", - "socialising": "socializing", - "sodomise": "sodomize", - "sodomised": "sodomized", - "sodomises": "sodomizes", - "sodomising": "sodomizing", - "solemnise": "solemnize", - "solemnised": "solemnized", - "solemnises": "solemnizes", - "solemnising": "solemnizing", - "sombre": "somber", - "specialisation": "specialization", - "specialisations": "specializations", - "specialise": "specialize", - "specialised": "specialized", - "specialises": "specializes", - "specialising": "specializing", - "spectre": "specter", - "spectres": "specters", - "spiralled": "spiraled", - "spiralling": "spiraling", - "splendour": "splendor", - "splendours": "splendors", - "squirrelled": "squirreled", - "squirrelling": "squirreling", - "stabilisation": "stabilization", - "stabilise": "stabilize", - "stabilised": "stabilized", - "stabiliser": "stabilizer", - "stabilisers": "stabilizers", - "stabilises": "stabilizes", - "stabilising": "stabilizing", - "standardisation": "standardization", - "standardise": "standardize", - "standardised": "standardized", - "standardises": "standardizes", - "standardising": "standardizing", - "stencilled": "stenciled", - "stencilling": "stenciling", - "sterilisation": "sterilization", - "sterilisations": "sterilizations", - "sterilise": "sterilize", - "sterilised": "sterilized", - "steriliser": "sterilizer", - "sterilisers": "sterilizers", - "sterilises": 
"sterilizes", - "sterilising": "sterilizing", - "stigmatisation": "stigmatization", - "stigmatise": "stigmatize", - "stigmatised": "stigmatized", - "stigmatises": "stigmatizes", - "stigmatising": "stigmatizing", - "storey": "story", - "storeys": "stories", - "subsidisation": "subsidization", - "subsidise": "subsidize", - "subsidised": "subsidized", - "subsidiser": "subsidizer", - "subsidisers": "subsidizers", - "subsidises": "subsidizes", - "subsidising": "subsidizing", - "succour": "succor", - "succoured": "succored", - "succouring": "succoring", - "succours": "succors", - "sulphate": "sulfate", - "sulphates": "sulfates", - "sulphide": "sulfide", - "sulphides": "sulfides", - "sulphur": "sulfur", - "sulphurous": "sulfurous", - "summarise": "summarize", - "summarised": "summarized", - "summarises": "summarizes", - "summarising": "summarizing", - "swivelled": "swiveled", - "swivelling": "swiveling", - "symbolise": "symbolize", - "symbolised": "symbolized", - "symbolises": "symbolizes", - "symbolising": "symbolizing", - "sympathise": "sympathize", - "sympathised": "sympathized", - "sympathiser": "sympathizer", - "sympathisers": "sympathizers", - "sympathises": "sympathizes", - "sympathising": "sympathizing", - "synchronisation": "synchronization", - "synchronise": "synchronize", - "synchronised": "synchronized", - "synchronises": "synchronizes", - "synchronising": "synchronizing", - "synthesise": "synthesize", - "synthesised": "synthesized", - "synthesiser": "synthesizer", - "synthesisers": "synthesizers", - "synthesises": "synthesizes", - "synthesising": "synthesizing", - "syphon": "siphon", - "syphoned": "siphoned", - "syphoning": "siphoning", - "syphons": "siphons", - "systematisation": "systematization", - "systematise": "systematize", - "systematised": "systematized", - "systematises": "systematizes", - "systematising": "systematizing", - "tantalise": "tantalize", - "tantalised": "tantalized", - "tantalises": "tantalizes", - "tantalising": "tantalizing", - "tantalisingly": "tantalizingly", - "tasselled": "tasseled", - "technicolour": "technicolor", - "temporise": "temporize", - "temporised": "temporized", - "temporises": "temporizes", - "temporising": "temporizing", - "tenderise": "tenderize", - "tenderised": "tenderized", - "tenderises": "tenderizes", - "tenderising": "tenderizing", - "terrorise": "terrorize", - "terrorised": "terrorized", - "terrorises": "terrorizes", - "terrorising": "terrorizing", - "theatre": "theater", - "theatregoer": "theatergoer", - "theatregoers": "theatergoers", - "theatres": "theaters", - "theorise": "theorize", - "theorised": "theorized", - "theorises": "theorizes", - "theorising": "theorizing", - "tonne": "ton", - "tonnes": "tons", - "towelled": "toweled", - "towelling": "toweling", - "toxaemia": "toxemia", - "tranquillise": "tranquilize", - "tranquillised": "tranquilized", - "tranquilliser": "tranquilizer", - "tranquillisers": "tranquilizers", - "tranquillises": "tranquilizes", - "tranquillising": "tranquilizing", - "tranquillity": "tranquility", - "tranquillize": "tranquilize", - "tranquillized": "tranquilized", - "tranquillizer": "tranquilizer", - "tranquillizers": "tranquilizers", - "tranquillizes": "tranquilizes", - "tranquillizing": "tranquilizing", - "tranquilly": "tranquility", - "transistorised": "transistorized", - "traumatise": "traumatize", - "traumatised": "traumatized", - "traumatises": "traumatizes", - "traumatising": "traumatizing", - "travelled": "traveled", - "traveller": "traveler", - "travellers": "travelers", - "travelling": 
"traveling", - "travelogue": "travelog", - "travelogues ": "travelogs ", - "trialled": "trialed", - "trialling": "trialing", - "tricolour": "tricolor", - "tricolours": "tricolors", - "trivialise": "trivialize", - "trivialised": "trivialized", - "trivialises": "trivializes", - "trivialising": "trivializing", - "tumour": "tumor", - "tumours": "tumors", - "tunnelled": "tunneled", - "tunnelling": "tunneling", - "tyrannise": "tyrannize", - "tyrannised": "tyrannized", - "tyrannises": "tyrannizes", - "tyrannising": "tyrannizing", - "tyre": "tire", - "tyres": "tires", - "unauthorised": "unauthorized", - "uncivilised": "uncivilized", - "underutilised": "underutilized", - "unequalled": "unequaled", - "unfavourable": "unfavorable", - "unfavourably": "unfavorably", - "unionisation": "unionization", - "unionise": "unionize", - "unionised": "unionized", - "unionises": "unionizes", - "unionising": "unionizing", - "unorganised": "unorganized", - "unravelled": "unraveled", - "unravelling": "unraveling", - "unrecognisable": "unrecognizable", - "unrecognised": "unrecognized", - "unrivalled": "unrivaled", - "unsavoury": "unsavory", - "untrammelled": "untrammeled", - "urbanisation": "urbanization", - "urbanise": "urbanize", - "urbanised": "urbanized", - "urbanises": "urbanizes", - "urbanising": "urbanizing", - "utilisable": "utilizable", - "utilisation": "utilization", - "utilise": "utilize", - "utilised": "utilized", - "utilises": "utilizes", - "utilising": "utilizing", - "valour": "valor", - "vandalise": "vandalize", - "vandalised": "vandalized", - "vandalises": "vandalizes", - "vandalising": "vandalizing", - "vaporisation": "vaporization", - "vaporise": "vaporize", - "vaporised": "vaporized", - "vaporises": "vaporizes", - "vaporising": "vaporizing", - "vapour": "vapor", - "vapours": "vapors", - "verbalise": "verbalize", - "verbalised": "verbalized", - "verbalises": "verbalizes", - "verbalising": "verbalizing", - "victimisation": "victimization", - "victimise": "victimize", - "victimised": "victimized", - "victimises": "victimizes", - "victimising": "victimizing", - "videodisc": "videodisk", - "videodiscs": "videodisks", - "vigour": "vigor", - "visualisation": "visualization", - "visualisations": "visualizations", - "visualise": "visualize", - "visualised": "visualized", - "visualises": "visualizes", - "visualising": "visualizing", - "vocalisation": "vocalization", - "vocalisations": "vocalizations", - "vocalise": "vocalize", - "vocalised": "vocalized", - "vocalises": "vocalizes", - "vocalising": "vocalizing", - "vulcanised": "vulcanized", - "vulgarisation": "vulgarization", - "vulgarise": "vulgarize", - "vulgarised": "vulgarized", - "vulgarises": "vulgarizes", - "vulgarising": "vulgarizing", - "waggon": "wagon", - "waggons": "wagons", - "watercolour": "watercolor", - "watercolours": "watercolors", - "weaselled": "weaseled", - "weaselling": "weaseling", - "westernisation": "westernization", - "westernise": "westernize", - "westernised": "westernized", - "westernises": "westernizes", - "westernising": "westernizing", - "womanise": "womanize", - "womanised": "womanized", - "womaniser": "womanizer", - "womanisers": "womanizers", - "womanises": "womanizes", - "womanising": "womanizing", - "woollen": "woolen", - "woollens": "woolens", - "woollies": "woolies", - "woolly": "wooly", - "worshipped ": "worshiped", - "worshipping ": "worshiping ", - "worshipper": "worshiper", - "yodelled": "yodeled", - "yodelling": "yodeling", - "yoghourt": "yogurt", - "yoghourts": "yogurts", - "yoghurt": "yogurt", - "yoghurts": 
"yogurts", -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index ea8e355ac..8e2266a40 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -4,25 +4,20 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from .tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class IndonesianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: "id" lex_attr_getters.update(LEX_ATTRS) - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES diff --git a/spacy/lang/id/norm_exceptions.py b/spacy/lang/id/norm_exceptions.py deleted file mode 100644 index 09ac6a6d3..000000000 --- a/spacy/lang/id/norm_exceptions.py +++ /dev/null @@ -1,532 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# Daftar kosakata yang sering salah dieja -# https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja -_exc = { - # Slang and abbreviations - "silahkan": "silakan", - "yg": "yang", - "kalo": "kalau", - "cawu": "caturwulan", - "ok": "oke", - "gak": "tidak", - "enggak": "tidak", - "nggak": "tidak", - "ndak": "tidak", - "ngga": "tidak", - "dgn": "dengan", - "tdk": "tidak", - "jg": "juga", - "klo": "kalau", - "denger": "dengar", - "pinter": "pintar", - "krn": "karena", - "nemuin": "menemukan", - "jgn": "jangan", - "udah": "sudah", - "sy": "saya", - "udh": "sudah", - "dapetin": "mendapatkan", - "ngelakuin": "melakukan", - "ngebuat": "membuat", - "membikin": "membuat", - "bikin": "buat", - # Daftar kosakata yang sering salah dieja - "malpraktik": "malapraktik", - "malfungsi": "malafungsi", - "malserap": "malaserap", - "maladaptasi": "malaadaptasi", - "malsuai": "malasuai", - "maldistribusi": "maladistribusi", - "malgizi": "malagizi", - "malsikap": "malasikap", - "memperhatikan": "memerhatikan", - "akte": "akta", - "cemilan": "camilan", - "esei": "esai", - "frase": "frasa", - "kafeteria": "kafetaria", - "ketapel": "katapel", - "kenderaan": "kendaraan", - "menejemen": "manajemen", - "menejer": "manajer", - "mesjid": "masjid", - "rebo": "rabu", - "seksama": "saksama", - "senggama": "sanggama", - "sekedar": "sekadar", - "seprei": "seprai", - "semedi": "semadi", - "samadi": "semadi", - "amandemen": "amendemen", - "algoritma": "algoritme", - "aritmatika": "aritmetika", - "metoda": "metode", - "materai": "meterai", - "meterei": "meterai", - "kalendar": "kalender", - "kadaluwarsa": "kedaluwarsa", - "katagori": "kategori", - "parlamen": "parlemen", - "sekular": "sekuler", - "selular": "seluler", - "sirkular": "sirkuler", - "survai": "survei", - "survey": "survei", - "aktuil": "aktual", - "formil": "formal", - "trotoir": "trotoar", - 
"komersiil": "komersial", - "komersil": "komersial", - "tradisionil": "tradisionial", - "orisinil": "orisinal", - "orijinil": "orisinal", - "afdol": "afdal", - "antri": "antre", - "apotik": "apotek", - "atlit": "atlet", - "atmosfir": "atmosfer", - "cidera": "cedera", - "cendikiawan": "cendekiawan", - "cepet": "cepat", - "cinderamata": "cenderamata", - "debet": "debit", - "difinisi": "definisi", - "dekrit": "dekret", - "disain": "desain", - "diskripsi": "deskripsi", - "diskotik": "diskotek", - "eksim": "eksem", - "exim": "eksem", - "faidah": "faedah", - "ekstrim": "ekstrem", - "ekstrimis": "ekstremis", - "komplit": "komplet", - "konkrit": "konkret", - "kongkrit": "konkret", - "kongkret": "konkret", - "kridit": "kredit", - "musium": "museum", - "pinalti": "penalti", - "piranti": "peranti", - "pinsil": "pensil", - "personil": "personel", - "sistim": "sistem", - "teoritis": "teoretis", - "vidio": "video", - "cengkeh": "cengkih", - "desertasi": "disertasi", - "hakekat": "hakikat", - "intelejen": "intelijen", - "kaedah": "kaidah", - "kempes": "kempis", - "kementrian": "kementerian", - "ledeng": "leding", - "nasehat": "nasihat", - "penasehat": "penasihat", - "praktek": "praktik", - "praktekum": "praktikum", - "resiko": "risiko", - "retsleting": "ritsleting", - "senen": "senin", - "amuba": "ameba", - "punggawa": "penggawa", - "surban": "serban", - "nomer": "nomor", - "sorban": "serban", - "bis": "bus", - "agribisnis": "agrobisnis", - "kantung": "kantong", - "khutbah": "khotbah", - "mandur": "mandor", - "rubuh": "roboh", - "pastur": "pastor", - "supir": "sopir", - "goncang": "guncang", - "goa": "gua", - "kaos": "kaus", - "kokoh": "kukuh", - "komulatif": "kumulatif", - "kolomnis": "kolumnis", - "korma": "kurma", - "lobang": "lubang", - "limo": "limusin", - "limosin": "limusin", - "mangkok": "mangkuk", - "saos": "saus", - "sop": "sup", - "sorga": "surga", - "tegor": "tegur", - "telor": "telur", - "obrak-abrik": "ubrak-abrik", - "ekwivalen": "ekuivalen", - "frekwensi": "frekuensi", - "konsekwensi": "konsekuensi", - "kwadran": "kuadran", - "kwadrat": "kuadrat", - "kwalifikasi": "kualifikasi", - "kwalitas": "kualitas", - "kwalitet": "kualitas", - "kwalitatif": "kualitatif", - "kwantitas": "kuantitas", - "kwantitatif": "kuantitatif", - "kwantum": "kuantum", - "kwartal": "kuartal", - "kwintal": "kuintal", - "kwitansi": "kuitansi", - "kwatir": "khawatir", - "kuatir": "khawatir", - "jadual": "jadwal", - "hirarki": "hierarki", - "karir": "karier", - "aktip": "aktif", - "daptar": "daftar", - "efektip": "efektif", - "epektif": "efektif", - "epektip": "efektif", - "Pebruari": "Februari", - "pisik": "fisik", - "pondasi": "fondasi", - "photo": "foto", - "photokopi": "fotokopi", - "hapal": "hafal", - "insap": "insaf", - "insyaf": "insaf", - "konperensi": "konferensi", - "kreatip": "kreatif", - "kreativ": "kreatif", - "maap": "maaf", - "napsu": "nafsu", - "negatip": "negatif", - "negativ": "negatif", - "objektip": "objektif", - "obyektip": "objektif", - "obyektif": "objektif", - "pasip": "pasif", - "pasiv": "pasif", - "positip": "positif", - "positiv": "positif", - "produktip": "produktif", - "produktiv": "produktif", - "sarap": "saraf", - "sertipikat": "sertifikat", - "subjektip": "subjektif", - "subyektip": "subjektif", - "subyektif": "subjektif", - "tarip": "tarif", - "transitip": "transitif", - "transitiv": "transitif", - "faham": "paham", - "fikir": "pikir", - "berfikir": "berpikir", - "telefon": "telepon", - "telfon": "telepon", - "telpon": "telepon", - "tilpon": "telepon", - "nafas": "napas", - "bernafas": 
"bernapas", - "pernafasan": "pernapasan", - "vermak": "permak", - "vulpen": "pulpen", - "aktifis": "aktivis", - "konfeksi": "konveksi", - "motifasi": "motivasi", - "Nopember": "November", - "propinsi": "provinsi", - "babtis": "baptis", - "jerembab": "jerembap", - "lembab": "lembap", - "sembab": "sembap", - "saptu": "sabtu", - "tekat": "tekad", - "bejad": "bejat", - "nekad": "nekat", - "otoped": "otopet", - "skuad": "skuat", - "jenius": "genius", - "marjin": "margin", - "marjinal": "marginal", - "obyek": "objek", - "subyek": "subjek", - "projek": "proyek", - "azas": "asas", - "ijasah": "ijazah", - "jenasah": "jenazah", - "plasa": "plaza", - "bathin": "batin", - "Katholik": "Katolik", - "orthografi": "ortografi", - "pathogen": "patogen", - "theologi": "teologi", - "ijin": "izin", - "rejeki": "rezeki", - "rejim": "rezim", - "jaman": "zaman", - "jamrud": "zamrud", - "jinah": "zina", - "perjinahan": "perzinaan", - "anugrah": "anugerah", - "cendrawasih": "cenderawasih", - "jendral": "jenderal", - "kripik": "keripik", - "krupuk": "kerupuk", - "ksatria": "kesatria", - "mentri": "menteri", - "negri": "negeri", - "Prancis": "Perancis", - "sebrang": "seberang", - "menyebrang": "menyeberang", - "Sumatra": "Sumatera", - "trampil": "terampil", - "isteri": "istri", - "justeru": "justru", - "perajurit": "prajurit", - "putera": "putra", - "puteri": "putri", - "samudera": "samudra", - "sastera": "sastra", - "sutera": "sutra", - "terompet": "trompet", - "iklas": "ikhlas", - "iktisar": "ikhtisar", - "kafilah": "khafilah", - "kawatir": "khawatir", - "kotbah": "khotbah", - "kusyuk": "khusyuk", - "makluk": "makhluk", - "mahluk": "makhluk", - "mahkluk": "makhluk", - "nahkoda": "nakhoda", - "nakoda": "nakhoda", - "tahta": "takhta", - "takhyul": "takhayul", - "tahyul": "takhayul", - "tahayul": "takhayul", - "akhli": "ahli", - "anarkhi": "anarki", - "kharisma": "karisma", - "kharismatik": "karismatik", - "mahsud": "maksud", - "makhsud": "maksud", - "rakhmat": "rahmat", - "tekhnik": "teknik", - "tehnik": "teknik", - "tehnologi": "teknologi", - "ikhwal": "ihwal", - "expor": "ekspor", - "extra": "ekstra", - "komplex": "komplek", - "sex": "seks", - "taxi": "taksi", - "extasi": "ekstasi", - "syaraf": "saraf", - "syurga": "surga", - "mashur": "masyhur", - "masyur": "masyhur", - "mahsyur": "masyhur", - "mashyur": "masyhur", - "muadzin": "muazin", - "adzan": "azan", - "ustadz": "ustaz", - "ustad": "ustaz", - "ustadzah": "ustaz", - "dzikir": "zikir", - "dzuhur": "zuhur", - "dhuhur": "zuhur", - "zhuhur": "zuhur", - "analisa": "analisis", - "diagnosa": "diagnosis", - "hipotesa": "hipotesis", - "sintesa": "sintesis", - "aktiviti": "aktivitas", - "aktifitas": "aktivitas", - "efektifitas": "efektivitas", - "komuniti": "komunitas", - "kreatifitas": "kreativitas", - "produktifitas": "produktivitas", - "realiti": "realitas", - "realita": "realitas", - "selebriti": "selebritas", - "spotifitas": "sportivitas", - "universiti": "universitas", - "utiliti": "utilitas", - "validiti": "validitas", - "dilokalisir": "dilokalisasi", - "didramatisir": "didramatisasi", - "dipolitisir": "dipolitisasi", - "dinetralisir": "dinetralisasi", - "dikonfrontir": "dikonfrontasi", - "mendominir": "mendominasi", - "koordinir": "koordinasi", - "proklamir": "proklamasi", - "terorganisir": "terorganisasi", - "terealisir": "terealisasi", - "robah": "ubah", - "dirubah": "diubah", - "merubah": "mengubah", - "terlanjur": "telanjur", - "terlantar": "telantar", - "penglepasan": "pelepasan", - "pelihatan": "penglihatan", - "pemukiman": "permukiman", - "pengrumahan": 
"perumahan", - "penyewaan": "persewaan", - "menyintai": "mencintai", - "menyolok": "mencolok", - "contek": "sontek", - "mencontek": "menyontek", - "pungkir": "mungkir", - "dipungkiri": "dimungkiri", - "kupungkiri": "kumungkiri", - "kaupungkiri": "kaumungkiri", - "nampak": "tampak", - "nampaknya": "tampaknya", - "nongkrong": "tongkrong", - "berternak": "beternak", - "berterbangan": "beterbangan", - "berserta": "beserta", - "berperkara": "beperkara", - "berpergian": "bepergian", - "berkerja": "bekerja", - "berberapa": "beberapa", - "terbersit": "tebersit", - "terpercaya": "tepercaya", - "terperdaya": "teperdaya", - "terpercik": "tepercik", - "terpergok": "tepergok", - "aksesoris": "aksesori", - "handal": "andal", - "hantar": "antar", - "panutan": "anutan", - "atsiri": "asiri", - "bhakti": "bakti", - "china": "cina", - "dharma": "darma", - "diktaktor": "diktator", - "eksport": "ekspor", - "hembus": "embus", - "hadits": "hadis", - "hadist": "hadits", - "harafiah": "harfiah", - "himbau": "imbau", - "import": "impor", - "inget": "ingat", - "hisap": "isap", - "interprestasi": "interpretasi", - "kangker": "kanker", - "konggres": "kongres", - "lansekap": "lanskap", - "maghrib": "magrib", - "emak": "mak", - "moderen": "modern", - "pasport": "paspor", - "perduli": "peduli", - "ramadhan": "ramadan", - "rapih": "rapi", - "Sansekerta": "Sanskerta", - "shalat": "salat", - "sholat": "salat", - "silahkan": "silakan", - "standard": "standar", - "hutang": "utang", - "zinah": "zina", - "ambulan": "ambulans", - "antartika": "sntarktika", - "arteri": "arteria", - "asik": "asyik", - "australi": "australia", - "denga": "dengan", - "depo": "depot", - "detil": "detail", - "ensiklopedi": "ensiklopedia", - "elit": "elite", - "frustasi": "frustrasi", - "gladi": "geladi", - "greget": "gereget", - "itali": "italia", - "karna": "karena", - "klenteng": "kelenteng", - "erling": "kerling", - "kontruksi": "konstruksi", - "masal": "massal", - "merk": "merek", - "respon": "respons", - "diresponi": "direspons", - "skak": "sekak", - "stir": "setir", - "singapur": "singapura", - "standarisasi": "standardisasi", - "varitas": "varietas", - "amphibi": "amfibi", - "anjlog": "anjlok", - "alpukat": "avokad", - "alpokat": "avokad", - "bolpen": "pulpen", - "cabe": "cabai", - "cabay": "cabai", - "ceret": "cerek", - "differensial": "diferensial", - "duren": "durian", - "faksimili": "faksimile", - "faksimil": "faksimile", - "graha": "gerha", - "goblog": "goblok", - "gombrong": "gombroh", - "horden": "gorden", - "korden": "gorden", - "gubug": "gubuk", - "imaginasi": "imajinasi", - "jerigen": "jeriken", - "jirigen": "jeriken", - "carut-marut": "karut-marut", - "kwota": "kuota", - "mahzab": "mazhab", - "mempesona": "memesona", - "milyar": "miliar", - "missi": "misi", - "nenas": "nanas", - "negoisasi": "negosiasi", - "automotif": "otomotif", - "pararel": "paralel", - "paska": "pasca", - "prosen": "persen", - "pete": "petai", - "petay": "petai", - "proffesor": "profesor", - "rame": "ramai", - "rapot": "rapor", - "rileks": "relaks", - "rileksasi": "relaksasi", - "renumerasi": "remunerasi", - "seketaris": "sekretaris", - "sekertaris": "sekretaris", - "sensorik": "sensoris", - "sentausa": "sentosa", - "strawberi": "stroberi", - "strawbery": "stroberi", - "taqwa": "takwa", - "tauco": "taoco", - "tauge": "taoge", - "toge": "taoge", - "tauladan": "teladan", - "taubat": "tobat", - "trilyun": "triliun", - "vissi": "visi", - "coklat": "cokelat", - "narkotika": "narkotik", - "oase": "oasis", - "politisi": "politikus", - "terong": "terung", - "wool": 
"wol", - "himpit": "impit", - "mujizat": "mukjizat", - "mujijat": "mukjizat", - "yag": "yang", -} - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index 4fcfaddb4..8d85b8fc7 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -2,26 +2,21 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class LuxembourgishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "lb" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP diff --git a/spacy/lang/lb/norm_exceptions.py b/spacy/lang/lb/norm_exceptions.py deleted file mode 100644 index 7063e6863..000000000 --- a/spacy/lang/lb/norm_exceptions.py +++ /dev/null @@ -1,16 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# TODO -# norm execptions: find a possibility to deal with the zillions of spelling -# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.) 
-# here one could include the most common spelling mistakes - -_exc = {"dass": "datt", "viläicht": "vläicht"} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py index 7c0ed8a04..c9cd82d7b 100644 --- a/spacy/lang/lex_attrs.py +++ b/spacy/lang/lex_attrs.py @@ -186,10 +186,6 @@ def suffix(string): return string[-3:] -def cluster(string): - return 0 - - def is_alpha(string): return string.isalpha() @@ -218,20 +214,11 @@ def is_stop(string, stops=set()): return string.lower() in stops -def is_oov(string): - return True - - -def get_prob(string): - return -20.0 - - LEX_ATTRS = { attrs.LOWER: lower, attrs.NORM: lower, attrs.PREFIX: prefix, attrs.SUFFIX: suffix, - attrs.CLUSTER: cluster, attrs.IS_ALPHA: is_alpha, attrs.IS_DIGIT: is_digit, attrs.IS_LOWER: is_lower, @@ -239,8 +226,6 @@ LEX_ATTRS = { attrs.IS_TITLE: is_title, attrs.IS_UPPER: is_upper, attrs.IS_STOP: is_stop, - attrs.IS_OOV: is_oov, - attrs.PROB: get_prob, attrs.LIKE_EMAIL: like_email, attrs.LIKE_NUM: like_num, attrs.IS_PUNCT: is_punct, diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index f786d6542..c09996126 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -5,22 +5,17 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP -from .norm_exceptions import NORM_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class PortugueseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: "pt" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) lex_attr_getters.update(LEX_ATTRS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS diff --git a/spacy/lang/pt/norm_exceptions.py b/spacy/lang/pt/norm_exceptions.py deleted file mode 100644 index ea650cb31..000000000 --- a/spacy/lang/pt/norm_exceptions.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# These exceptions are used to add NORM values based on a token's ORTH value. -# Individual languages can also add their own exceptions and overwrite them - -# for example, British vs. American spelling in English. - -# Norms are only set if no alternative is provided in the tokenizer exceptions. -# Note that this does not change any other token attributes. Its main purpose -# is to normalise the word representations so that equivalent tokens receive -# similar representations. For example: $ and € are very different, but they're -# both currency symbols. By normalising currency symbols to $, all symbols are -# seen as similar, no matter how common they are in the training data. 
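(To make the idea above concrete: a minimal, hypothetical sketch of how such a table feeds the NORM attribute — a plain dict lookup that falls back to the lowercased surface form, which is the default NORM getter in lex_attrs.py; the names norms and get_norm are illustrative only, not spaCy API.)

    norms = {"R$": "$", "Cz$": "$", "NCz$": "$"}

    def get_norm(orth):
        # Known currency variants collapse to "$"; everything else falls
        # back to the default norm, the lowercased surface form.
        return norms.get(orth, orth.lower())

    assert get_norm("R$") == "$"
    assert get_norm("Real") == "real"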
- - -NORM_EXCEPTIONS = { - "R$": "$", # Real - "r$": "$", # Real - "Cz$": "$", # Cruzado - "cz$": "$", # Cruzado - "NCz$": "$", # Cruzado Novo - "ncz$": "$", # Cruzado Novo -} diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index f34fc5435..f0e77d811 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -3,26 +3,21 @@ from __future__ import unicode_literals, print_function from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP from .lemmatizer import RussianLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS -from ...util import update_exc, add_lookups +from ...util import update_exc from ...language import Language from ...lookups import Lookups -from ...attrs import LANG, NORM +from ...attrs import LANG class RussianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "ru" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py deleted file mode 100644 index 43e08948c..000000000 --- a/spacy/lang/ru/norm_exceptions.py +++ /dev/null @@ -1,36 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -_exc = { - # Slang - "прив": "привет", - "дарова": "привет", - "дак": "так", - "дык": "так", - "здарова": "привет", - "пакедава": "пока", - "пакедаво": "пока", - "ща": "сейчас", - "спс": "спасибо", - "пжлст": "пожалуйста", - "плиз": "пожалуйста", - "ладненько": "ладно", - "лады": "ладно", - "лан": "ладно", - "ясн": "ясно", - "всм": "всмысле", - "хош": "хочешь", - "хаюшки": "привет", - "оч": "очень", - "че": "что", - "чо": "что", - "шо": "что", -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index f27b87102..286d6693b 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -3,22 +3,17 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .norm_exceptions import NORM_EXCEPTIONS from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...attrs import LANG +from ...util import update_exc class SerbianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "sr" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py deleted file mode 100644 index 69f2c3173..000000000 --- a/spacy/lang/sr/norm_exceptions.py +++ /dev/null @@ -1,26 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -_exc = { - # Slang - "ћале": 
"отац", - "кева": "мајка", - "смор": "досада", - "кец": "јединица", - "тебра": "брат", - "штребер": "ученик", - "факс": "факултет", - "профа": "професор", - "бус": "аутобус", - "пискарало": "службеник", - "бакутанер": "бака", - "џибер": "простак", -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/ta/norm_exceptions.py b/spacy/lang/ta/norm_exceptions.py deleted file mode 100644 index fbdceb98c..000000000 --- a/spacy/lang/ta/norm_exceptions.py +++ /dev/null @@ -1,139 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -_exc = { - # Regional words normal - # Sri Lanka - wikipeadia - "இங்க": "இங்கே", - "வாங்க": "வாருங்கள்", - "ஒண்டு": "ஒன்று", - "கண்டு": "கன்று", - "கொண்டு": "கொன்று", - "பண்டி": "பன்றி", - "பச்ச": "பச்சை", - "அம்பது": "ஐம்பது", - "வெச்ச": "வைத்து", - "வச்ச": "வைத்து", - "வச்சி": "வைத்து", - "வாளைப்பழம்": "வாழைப்பழம்", - "மண்ணு": "மண்", - "பொன்னு": "பொன்", - "சாவல்": "சேவல்", - "அங்கால": "அங்கு ", - "அசுப்பு": "நடமாட்டம்", - "எழுவான் கரை": "எழுவான்கரை", - "ஓய்யாரம்": "எழில் ", - "ஒளும்பு": "எழும்பு", - "ஓர்மை": "துணிவு", - "கச்சை": "கோவணம்", - "கடப்பு": "தெருவாசல்", - "சுள்ளி": "காய்ந்த குச்சி", - "திறாவுதல்": "தடவுதல்", - "நாசமறுப்பு": "தொல்லை", - "பரிசாரி": "வைத்தியன்", - "பறவாதி": "பேராசைக்காரன்", - "பிசினி": "உலோபி ", - "விசர்": "பைத்தியம்", - "ஏனம்": "பாத்திரம்", - "ஏலா": "இயலாது", - "ஒசில்": "அழகு", - "ஒள்ளுப்பம்": "கொஞ்சம்", - # Srilankan and indian - "குத்துமதிப்பு": "", - "நூனாயம்": "நூல்நயம்", - "பைய": "மெதுவாக", - "மண்டை": "தலை", - "வெள்ளனே": "சீக்கிரம்", - "உசுப்பு": "எழுப்பு", - "ஆணம்": "குழம்பு", - "உறக்கம்": "தூக்கம்", - "பஸ்": "பேருந்து", - "களவு": "திருட்டு ", - # relationship - "புருசன்": "கணவன்", - "பொஞ்சாதி": "மனைவி", - "புள்ள": "பிள்ளை", - "பிள்ள": "பிள்ளை", - "ஆம்பிளப்புள்ள": "ஆண் பிள்ளை", - "பொம்பிளப்புள்ள": "பெண் பிள்ளை", - "அண்ணாச்சி": "அண்ணா", - "அக்காச்சி": "அக்கா", - "தங்கச்சி": "தங்கை", - # difference words - "பொடியன்": "சிறுவன்", - "பொட்டை": "சிறுமி", - "பிறகு": "பின்பு", - "டக்கென்டு": "விரைவாக", - "கெதியா": "விரைவாக", - "கிறுகி": "திரும்பி", - "போயித்து வாறன்": "போய் வருகிறேன்", - "வருவாங்களா": "வருவார்களா", - # regular spokens - "சொல்லு": "சொல்", - "கேளு": "கேள்", - "சொல்லுங்க": "சொல்லுங்கள்", - "கேளுங்க": "கேளுங்கள்", - "நீங்கள்": "நீ", - "உன்": "உன்னுடைய", - # Portugeese formal words - "அலவாங்கு": "கடப்பாரை", - "ஆசுப்பத்திரி": "மருத்துவமனை", - "உரோதை": "சில்லு", - "கடுதாசி": "கடிதம்", - "கதிரை": "நாற்காலி", - "குசினி": "அடுக்களை", - "கோப்பை": "கிண்ணம்", - "சப்பாத்து": "காலணி", - "தாச்சி": "இரும்புச் சட்டி", - "துவாய்": "துவாலை", - "தவறணை": "மதுக்கடை", - "பீப்பா": "மரத்தாழி", - "யன்னல்": "சாளரம்", - "வாங்கு": "மரஇருக்கை", - # Dutch formal words - "இறாக்கை": "பற்சட்டம்", - "இலாட்சி": "இழுப்பறை", - "கந்தோர்": "பணிமனை", - "நொத்தாரிசு": "ஆவண எழுத்துபதிவாளர்", - # English formal words - "இஞ்சினியர்": "பொறியியலாளர்", - "சூப்பு": "ரசம்", - "செக்": "காசோலை", - "சேட்டு": "மேற்ச்சட்டை", - "மார்க்கட்டு": "சந்தை", - "விண்ணன்": "கெட்டிக்காரன்", - # Arabic formal words - "ஈமான்": "நம்பிக்கை", - "சுன்னத்து": "விருத்தசேதனம்", - "செய்த்தான்": "பிசாசு", - "மவுத்து": "இறப்பு", - "ஹலால்": "அங்கீகரிக்கப்பட்டது", - "கறாம்": "நிராகரிக்கப்பட்டது", - # Persian, Hindustanian and hindi formal words - "சுமார்": "கிட்டத்தட்ட", - "சிப்பாய்": "போர்வீரன்", - "சிபார்சு": "சிபாரிசு", - "ஜமீன்": "பணக்காரா்", - "அசல்": "மெய்யான", - "அந்தஸ்து": "கௌரவம்", - "ஆஜர்": "சமா்ப்பித்தல்", - "உசார்": "எச்சரிக்கை", - "அச்சா": "நல்ல", - # English words used in text 
conversations - "bcoz": "ஏனெனில்", - "bcuz": "ஏனெனில்", - "fav": "விருப்பமான", - "morning": "காலை வணக்கம்", - "gdeveng": "மாலை வணக்கம்", - "gdnyt": "இரவு வணக்கம்", - "gdnit": "இரவு வணக்கம்", - "plz": "தயவு செய்து", - "pls": "தயவு செய்து", - "thx": "நன்றி", - "thanx": "நன்றி", -} - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 06970fbd7..512be0c59 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -4,14 +4,12 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS -from .norm_exceptions import NORM_EXCEPTIONS from .lex_attrs import LEX_ATTRS -from ..norm_exceptions import BASE_NORMS -from ...attrs import LANG, NORM +from ...attrs import LANG from ...language import Language from ...tokens import Doc -from ...util import DummyTokenizer, add_lookups +from ...util import DummyTokenizer class ThaiTokenizer(DummyTokenizer): @@ -37,9 +35,6 @@ class ThaiDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda _text: "th" - lex_attr_getters[NORM] = add_lookups( - Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS - ) tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py deleted file mode 100644 index ed1b3e760..000000000 --- a/spacy/lang/th/norm_exceptions.py +++ /dev/null @@ -1,113 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -_exc = { - # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์) - "สนุ๊กเกอร์": "สนุกเกอร์", - "โน้ต": "โน้ต", - # Misspelled because of being lazy or hustle (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ) - "โทสับ": "โทรศัพท์", - "พุ่งนี้": "พรุ่งนี้", - # Strange (ให้ดูแปลกตา) - "ชะมะ": "ใช่ไหม", - "ชิมิ": "ใช่ไหม", - "ชะ": "ใช่ไหม", - "ช่ายมะ": "ใช่ไหม", - "ป่าว": "เปล่า", - "ป่ะ": "เปล่า", - "ปล่าว": "เปล่า", - "คัย": "ใคร", - "ไค": "ใคร", - "คราย": "ใคร", - "เตง": "ตัวเอง", - "ตะเอง": "ตัวเอง", - "รึ": "หรือ", - "เหรอ": "หรือ", - "หรา": "หรือ", - "หรอ": "หรือ", - "ชั้น": "ฉัน", - "ชั้ล": "ฉัน", - "ช้าน": "ฉัน", - "เทอ": "เธอ", - "เทอร์": "เธอ", - "เทอว์": "เธอ", - "แกร": "แก", - "ป๋ม": "ผม", - "บ่องตง": "บอกตรงๆ", - "ถ่ามตง": "ถามตรงๆ", - "ต่อมตง": "ตอบตรงๆ", - "เพิ่ล": "เพื่อน", - "จอบอ": "จอบอ", - "ดั้ย": "ได้", - "ขอบคุง": "ขอบคุณ", - "ยังงัย": "ยังไง", - "Inw": "เทพ", - "uou": "นอน", - "Lกรีeu": "เกรียน", - # Misspelled to express emotions (คำที่สะกดผิดเพื่อแสดงอารมณ์) - "เปงราย": "เป็นอะไร", - "เปนรัย": "เป็นอะไร", - "เปงรัย": "เป็นอะไร", - "เป็นอัลไล": "เป็นอะไร", - "ทามมาย": "ทำไม", - "ทามมัย": "ทำไม", - "จังรุย": "จังเลย", - "จังเยย": "จังเลย", - "จุงเบย": "จังเลย", - "ไม่รู้": "มะรุ", - "เฮ่ย": "เฮ้ย", - "เห้ย": "เฮ้ย", - "น่าร็อค": "น่ารัก", - "น่าร๊าก": "น่ารัก", - "ตั้ลล๊าก": "น่ารัก", - "คือร๊ะ": "คืออะไร", - "โอป่ะ": "โอเคหรือเปล่า", - "น่ามคาน": "น่ารำคาญ", - "น่ามสาร": "น่าสงสาร", - "วงวาร": "สงสาร", - "บับว่า": "แบบว่า", - "อัลไล": "อะไร", - "อิจ": "อิจฉา", - # Reduce rough words or Avoid to software filter (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์) - "กรู": "กู", - "กุ": "กู", - "กรุ": "กู", - "ตู": "กู", - "ตรู": "กู", - "มรึง": "มึง", - "เมิง": "มึง", - "มืง": "มึง", - "มุง": "มึง", - 
"สาด": "สัตว์", - "สัส": "สัตว์", - "สัก": "สัตว์", - "แสรด": "สัตว์", - "โคโตะ": "โคตร", - "โคด": "โคตร", - "โครต": "โคตร", - "โคตะระ": "โคตร", - "พ่อง": "พ่อมึง", - "แม่เมิง": "แม่มึง", - "เชี่ย": "เหี้ย", - # Imitate words (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร) - "แอร๊ยย": "อ๊าย", - "อร๊ายยย": "อ๊าย", - "มันส์": "มัน", - "วู๊วววววววว์": "วู้", - # Acronym (แบบคำย่อ) - "หมาลัย": "มหาวิทยาลัย", - "วิดวะ": "วิศวะ", - "สินสาด ": "ศิลปศาสตร์", - "สินกำ ": "ศิลปกรรมศาสตร์", - "เสารีย์ ": "อนุเสาวรีย์ชัยสมรภูมิ", - "เมกา ": "อเมริกา", - "มอไซค์ ": "มอเตอร์ไซค์", -} - - -NORM_EXCEPTIONS = {} - -for string, norm in _exc.items(): - NORM_EXCEPTIONS[string] = norm - NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/language.py b/spacy/language.py index f23776def..703806627 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -28,10 +28,11 @@ from .compat import izip, basestring_, is_python2, class_types from .gold import GoldParse from .scorer import Scorer from ._ml import link_vectors_to_models, create_default_optimizer -from .attrs import IS_STOP, LANG +from .attrs import IS_STOP, LANG, NORM from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH +from .lang.norm_exceptions import BASE_NORMS from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop @@ -77,6 +78,9 @@ class BaseDefaults(object): lemmatizer=lemmatizer, lookups=lookups, ) + vocab.lex_attr_getters[NORM] = util.add_lookups( + vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm") + ) for tag_str, exc in cls.morph_rules.items(): for orth_str, attrs in exc.items(): vocab.morphology.add_special_case(tag_str, orth_str, attrs) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index f31733374..167f57462 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,8 +1,8 @@ from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t from .attrs cimport attr_id_t -from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG +from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG -from .structs cimport LexemeC, SerializedLexemeC +from .structs cimport LexemeC from .strings cimport StringStore from .vocab cimport Vocab @@ -24,22 +24,6 @@ cdef class Lexeme: self.vocab = vocab self.orth = lex.orth - @staticmethod - cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil: - cdef SerializedLexemeC lex_data - buff = &lex.flags - end = &lex.sentiment + sizeof(lex.sentiment) - for i in range(sizeof(lex_data.data)): - lex_data.data[i] = buff[i] - return lex_data - - @staticmethod - cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil: - buff = &lex.flags - end = &lex.sentiment + sizeof(lex.sentiment) - for i in range(sizeof(lex_data.data)): - buff[i] = lex_data.data[i] - @staticmethod cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: if name < (sizeof(flags_t) * 8): @@ -56,8 +40,6 @@ cdef class Lexeme: lex.prefix = value elif name == SUFFIX: lex.suffix = value - elif name == CLUSTER: - lex.cluster = value elif name == LANG: lex.lang = value @@ -84,8 +66,6 @@ cdef class Lexeme: return lex.suffix elif feat_name == LENGTH: return lex.length - elif feat_name == CLUSTER: - return lex.cluster elif feat_name == LANG: return lex.lang else: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 
a081ffe42..dec2993fa 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -17,7 +17,7 @@ from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT -from .attrs cimport IS_CURRENCY, IS_OOV, PROB +from .attrs cimport IS_CURRENCY from .attrs import intify_attrs from .errors import Errors, Warnings @@ -89,12 +89,11 @@ cdef class Lexeme: cdef attr_id_t attr attrs = intify_attrs(attrs) for attr, value in attrs.items(): - if attr == PROB: - self.c.prob = value - elif attr == CLUSTER: - self.c.cluster = int(value) - elif isinstance(value, int) or isinstance(value, long): - Lexeme.set_struct_attr(self.c, attr, value) + # skip PROB, e.g. from lexemes.jsonl + if isinstance(value, float): + continue + elif isinstance(value, (int, long)): + Lexeme.set_struct_attr(self.c, attr, value) else: Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value)) @@ -137,34 +136,6 @@ cdef class Lexeme: xp = get_array_module(vector) return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - def to_bytes(self): - lex_data = Lexeme.c_to_bytes(self.c) - start = &self.c.flags - end = &self.c.sentiment + sizeof(self.c.sentiment) - if (end-start) != sizeof(lex_data.data): - raise ValueError(Errors.E072.format(length=end-start, - bad_length=sizeof(lex_data.data))) - byte_string = b"\0" * sizeof(lex_data.data) - byte_chars = byte_string - for i in range(sizeof(lex_data.data)): - byte_chars[i] = lex_data.data[i] - if len(byte_string) != sizeof(lex_data.data): - raise ValueError(Errors.E072.format(length=len(byte_string), - bad_length=sizeof(lex_data.data))) - return byte_string - - def from_bytes(self, bytes byte_string): - # This method doesn't really have a use-case --- wrote it for testing. - # Possibly delete? It puts the Lexeme out of synch with the vocab. - cdef SerializedLexemeC lex_data - if len(byte_string) != sizeof(lex_data.data): - raise ValueError(Errors.E072.format(length=len(byte_string), - bad_length=sizeof(lex_data.data))) - for i in range(len(byte_string)): - lex_data.data[i] = byte_string[i] - Lexeme.c_from_bytes(self.c, lex_data) - self.orth = self.c.orth - @property def has_vector(self): """RETURNS (bool): Whether a word vector is associated with the object. @@ -208,10 +179,14 @@ cdef class Lexeme: """RETURNS (float): A scalar value indicating the positivity or negativity of the lexeme.""" def __get__(self): - return self.c.sentiment + sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) + return sentiment_table.get(self.c.orth, 0.0) - def __set__(self, float sentiment): - self.c.sentiment = sentiment + def __set__(self, float x): + if "lexeme_sentiment" not in self.vocab.lookups: + self.vocab.lookups.add_table("lexeme_sentiment") + sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") + sentiment_table[self.c.orth] = x @property def orth_(self): @@ -238,9 +213,13 @@ cdef class Lexeme: lexeme text. 
""" def __get__(self): - return self.c.norm + return self.c.norm def __set__(self, attr_t x): + if "lexeme_norm" not in self.vocab.lookups: + self.vocab.lookups.add_table("lexeme_norm") + norm_table = self.vocab.lookups.get_table("lexeme_norm") + norm_table[self.c.orth] = self.vocab.strings[x] self.c.norm = x property shape: @@ -276,10 +255,12 @@ cdef class Lexeme: property cluster: """RETURNS (int): Brown cluster ID.""" def __get__(self): - return self.c.cluster + cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + return cluster_table.get(self.c.orth, 0) - def __set__(self, attr_t x): - self.c.cluster = x + def __set__(self, int x): + cluster_table = self.vocab.load_extra_lookups("lexeme_cluster") + cluster_table[self.c.orth] = x property lang: """RETURNS (uint64): Language of the parent vocabulary.""" @@ -293,10 +274,14 @@ cdef class Lexeme: """RETURNS (float): Smoothed log probability estimate of the lexeme's type.""" def __get__(self): - return self.c.prob + prob_table = self.vocab.load_extra_lookups("lexeme_prob") + settings_table = self.vocab.load_extra_lookups("lexeme_settings") + default_oov_prob = settings_table.get("oov_prob", -20.0) + return prob_table.get(self.c.orth, default_oov_prob) def __set__(self, float x): - self.c.prob = x + prob_table = self.vocab.load_extra_lookups("lexeme_prob") + prob_table[self.c.orth] = x property lower_: """RETURNS (unicode): Lowercase form of the word.""" @@ -314,7 +299,7 @@ cdef class Lexeme: return self.vocab.strings[self.c.norm] def __set__(self, unicode x): - self.c.norm = self.vocab.strings.add(x) + self.norm = self.vocab.strings.add(x) property shape_: """RETURNS (unicode): Transform of the word's string, to show @@ -362,13 +347,10 @@ cdef class Lexeme: def __set__(self, flags_t x): self.c.flags = x - property is_oov: + @property + def is_oov(self): """RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_OOV) - - def __set__(self, attr_t x): - Lexeme.c_set_flag(self.c, IS_OOV, x) + return self.orth in self.vocab.vectors property is_stop: """RETURNS (bool): Whether the lexeme is a stop word.""" diff --git a/spacy/lookups.py b/spacy/lookups.py index bf250b4b4..1fa29bdfe 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -124,7 +124,7 @@ class Lookups(object): self._tables[key].update(value) return self - def to_disk(self, path, **kwargs): + def to_disk(self, path, filename="lookups.bin", **kwargs): """Save the lookups to a directory as lookups.bin. Expects a path to a directory, which will be created if it doesn't exist. @@ -136,11 +136,11 @@ class Lookups(object): path = ensure_path(path) if not path.exists(): path.mkdir() - filepath = path / "lookups.bin" + filepath = path / filename with filepath.open("wb") as file_: file_.write(self.to_bytes()) - def from_disk(self, path, **kwargs): + def from_disk(self, path, filename="lookups.bin", **kwargs): """Load lookups from a directory containing a lookups.bin. Will skip loading if the file doesn't exist. 
@@ -150,7 +150,7 @@ class Lookups(object): DOCS: https://spacy.io/api/lookups#from_disk """ path = ensure_path(path) - filepath = path / "lookups.bin" + filepath = path / filename if filepath.exists(): with filepath.open("rb") as file_: data = file_.read() diff --git a/spacy/structs.pxd b/spacy/structs.pxd index b8e63a725..1f5f32675 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -23,29 +23,6 @@ cdef struct LexemeC: attr_t prefix attr_t suffix - attr_t cluster - - float prob - float sentiment - - -cdef struct SerializedLexemeC: - unsigned char[8 + 8*10 + 4 + 4] data - # sizeof(flags_t) # flags - # + sizeof(attr_t) # lang - # + sizeof(attr_t) # id - # + sizeof(attr_t) # length - # + sizeof(attr_t) # orth - # + sizeof(attr_t) # lower - # + sizeof(attr_t) # norm - # + sizeof(attr_t) # shape - # + sizeof(attr_t) # prefix - # + sizeof(attr_t) # suffix - # + sizeof(attr_t) # cluster - # + sizeof(float) # prob - # + sizeof(float) # cluster - # + sizeof(float) # l2_norm - cdef struct SpanC: hash_t id diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 9229c9970..ebb87c8d2 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -12,7 +12,7 @@ cdef enum symbol_t: LIKE_NUM LIKE_EMAIL IS_STOP - IS_OOV + IS_OOV_DEPRECATED IS_BRACKET IS_QUOTE IS_LEFT_PUNCT diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index e438caba5..83a9d0482 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -17,7 +17,7 @@ IDS = { "LIKE_NUM": LIKE_NUM, "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, - "IS_OOV": IS_OOV, + "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED, "IS_BRACKET": IS_BRACKET, "IS_QUOTE": IS_QUOTE, "IS_LEFT_PUNCT": IS_LEFT_PUNCT, diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index 837ceb323..503399ee4 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -37,14 +37,6 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer): assert tokens[7].text == "." -@pytest.mark.parametrize( - "text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")] -) -def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm): - tokens = da_tokenizer(text) - assert tokens[0].norm_ == norm - - @pytest.mark.parametrize( "text,n_tokens", [ diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py index 2e065870e..3b464e1ae 100644 --- a/spacy/tests/lang/de/test_exceptions.py +++ b/spacy/tests/lang/de/test_exceptions.py @@ -22,17 +22,3 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer): assert len(tokens) == 6 assert tokens[2].text == "z.Zt." 
assert tokens[2].lemma_ == "zur Zeit" - - -@pytest.mark.parametrize( - "text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])] -) -def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms): - tokens = de_tokenizer(text) - assert [token.norm_ for token in tokens] == norms - - -@pytest.mark.parametrize("text,norm", [("daß", "dass")]) -def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm): - tokens = de_tokenizer(text) - assert tokens[0].norm_ == norm diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index 6285a9408..a78e1815f 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -118,6 +118,7 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms): assert [token.norm_ for token in tokens] == norms +@pytest.mark.skip @pytest.mark.parametrize( "text,norm", [("radicalised", "radicalized"), ("cuz", "because")] ) diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py index 7ca2394b7..ebfab75cf 100644 --- a/spacy/tests/lang/lb/test_exceptions.py +++ b/spacy/tests/lang/lb/test_exceptions.py @@ -22,9 +22,3 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): assert len(tokens) == 9 assert tokens[1].text == "'t" assert tokens[1].lemma_ == "et" - - -@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")]) -def test_lb_norm_exceptions(lb_tokenizer, text, norm): - tokens = lb_tokenizer(text) - assert tokens[0].norm_ == norm diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 1671845ee..63faf44fc 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import pickle from spacy.vocab import Vocab from spacy.strings import StringStore @@ -36,8 +37,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1) == len(strings1) - assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1) + assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP + assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"]) @pytest.mark.parametrize("strings1,strings2", test_strings) @@ -51,12 +52,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): vocab2.to_disk(file_path2) vocab1_d = Vocab().from_disk(file_path1) vocab2_d = Vocab().from_disk(file_path2) - assert list(vocab1_d) == list(vocab1) - assert list(vocab2_d) == list(vocab2) + # check strings rather than lexemes, which are only reloaded on demand + assert strings1 == [s for s in vocab1_d.strings if s != "_SP"] + assert strings2 == [s for s in vocab2_d.strings if s != "_SP"] if strings1 == strings2: - assert list(vocab1_d) == list(vocab2_d) + assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"] else: - assert list(vocab1_d) != list(vocab2_d) + assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"] @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -76,7 +78,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr): vocab = Vocab(strings=strings) length = len(vocab) vocab.from_bytes(vocab.to_bytes()) - assert len(vocab) == length + assert len(vocab.strings) == 
len(strings) + 1 # adds _SP @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -127,3 +129,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): assert list(sstore1_d) == list(sstore2_d) else: assert list(sstore1_d) != list(sstore2_d) + +@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) +def test_pickle_vocab(strings, lex_attr): + vocab = Vocab(strings=strings) + vocab[strings[0]].norm_ = lex_attr + vocab_pickled = pickle.dumps(vocab) + vocab_unpickled = pickle.loads(vocab_pickled) + assert vocab.to_bytes() == vocab_unpickled.to_bytes() diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index 701222afc..bcda2999a 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -26,7 +26,7 @@ def test_lemmatizer_reflects_lookups_changes(): nlp_bytes = nlp.to_bytes() new_nlp.from_bytes(nlp_bytes) # Make sure we have the previously saved lookup table - assert len(new_nlp.vocab.lookups) == 1 + assert "lemma_lookup" in new_nlp.vocab.lookups assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2 assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world" assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar" diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index b57c6705a..af73a79bf 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -60,19 +60,6 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab): assert en_vocab["dogs"].check_flag(is_len4) is True -def test_lexeme_bytes_roundtrip(en_vocab): - one = en_vocab["one"] - alpha = en_vocab["alpha"] - assert one.orth != alpha.orth - assert one.lower != alpha.lower - alpha.from_bytes(one.to_bytes()) - - assert one.orth_ == alpha.orth_ - assert one.orth == alpha.orth - assert one.lower == alpha.lower - assert one.lower_ == alpha.lower_ - - def test_vocab_lexeme_oov_rank(en_vocab): """Test that default rank is OOV_RANK.""" lex = en_vocab["word"] diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index f78dd33c4..af15e9e91 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -119,12 +119,11 @@ def test_lookups_to_from_bytes_via_vocab(): table_name = "test" vocab = Vocab() vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"}) - assert len(vocab.lookups) == 1 assert table_name in vocab.lookups vocab_bytes = vocab.to_bytes() new_vocab = Vocab() new_vocab.from_bytes(vocab_bytes) - assert len(new_vocab.lookups) == 1 + assert len(new_vocab.lookups) == len(vocab.lookups) assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 @@ -137,13 +136,12 @@ def test_lookups_to_from_disk_via_vocab(): table_name = "test" vocab = Vocab() vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"}) - assert len(vocab.lookups) == 1 assert table_name in vocab.lookups with make_tempdir() as tmpdir: vocab.to_disk(tmpdir) new_vocab = Vocab() new_vocab.from_disk(tmpdir) - assert len(new_vocab.lookups) == 1 + assert len(new_vocab.lookups) == len(vocab.lookups) assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 322ef462a..16d9801ab 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ 
b/spacy/tests/vocab_vectors/test_vectors.py @@ -329,3 +329,15 @@ def test_vocab_prune_vectors(): neighbour, similarity = list(remap.values())[0] assert neighbour == "cat", remap assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) + + +def test_vector_is_oov(): + vocab = Vocab(vectors_name="test_vocab_is_oov") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + assert vocab["cat"].is_oov is False + assert vocab["dog"].is_oov is False + assert vocab["hamster"].is_oov is True diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index b79d2d805..45deebc93 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -17,7 +17,7 @@ from ..typedefs cimport hash_t from ..lexeme cimport Lexeme from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT -from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL +from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP from ..symbols cimport conj @@ -259,7 +259,7 @@ cdef class Token: @property def prob(self): """RETURNS (float): Smoothed log probability estimate of token type.""" - return self.c.lex.prob + return self.vocab[self.c.lex.orth].prob @property def sentiment(self): @@ -267,7 +267,7 @@ cdef class Token: negativity of the token.""" if "sentiment" in self.doc.user_token_hooks: return self.doc.user_token_hooks["sentiment"](self) - return self.c.lex.sentiment + return self.vocab[self.c.lex.orth].sentiment @property def lang(self): @@ -286,7 +286,7 @@ cdef class Token: @property def cluster(self): """RETURNS (int): Brown cluster ID.""" - return self.c.lex.cluster + return self.vocab[self.c.lex.orth].cluster @property def orth(self): @@ -923,7 +923,7 @@ cdef class Token: @property def is_oov(self): """RETURNS (bool): Whether the token is out-of-vocabulary.""" - return Lexeme.c_check_flag(self.c.lex, IS_OOV) + return self.c.lex.orth not in self.vocab.vectors @property def is_stop(self): diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index d989d6c40..73754eb02 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -30,6 +30,7 @@ cdef class Vocab: cpdef public Morphology morphology cpdef public object vectors cpdef public object lookups + cpdef public object lookups_extra cdef readonly int length cdef public object data_dir cdef public object lex_attr_getters diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ef2e86bcc..68f0ac0db 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -11,8 +11,7 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK from .lexeme cimport Lexeme from .typedefs cimport attr_t from .tokens.token cimport Token -from .attrs cimport PROB, LANG, ORTH, TAG, POS -from .structs cimport SerializedLexemeC +from .attrs cimport LANG, ORTH, TAG, POS from .compat import copy_reg, basestring_ from .errors import Errors @@ -22,6 +21,8 @@ from .vectors import Vectors from ._ml import link_vectors_to_models from .lookups import Lookups from . 
import util +from .lang.norm_exceptions import BASE_NORMS +from .lang.lex_attrs import LEX_ATTRS cdef class Vocab: @@ -32,8 +33,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, - strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None, - **deprecated_kwargs): + strings=tuple(), lookups=None, lookups_extra=None, + oov_prob=-20., vectors_name=None, **deprecated_kwargs): """Create the vocabulary. lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -44,6 +45,7 @@ cdef class Vocab: strings (StringStore): StringStore that maps strings to integers, and vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. + lookups_extra (Lookups): Container for optional lookup tables and dictionaries. name (unicode): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. """ @@ -51,8 +53,12 @@ cdef class Vocab: tag_map = tag_map if tag_map is not None else {} if lookups in (None, True, False): lookups = Lookups() + if "lexeme_norm" not in lookups: + lookups.add_table("lexeme_norm") if lemmatizer in (None, True, False): lemmatizer = Lemmatizer(lookups) + if lookups_extra in (None, True, False): + lookups_extra = Lookups() self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_orth = PreshMap() @@ -65,6 +71,7 @@ cdef class Vocab: self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.vectors = Vectors(name=vectors_name) self.lookups = lookups + self.lookups_extra = lookups_extra @property def lang(self): @@ -173,9 +180,7 @@ cdef class Vocab: value = func(string) if isinstance(value, unicode): value = self.strings.add(value) - if attr == PROB: - lex.prob = value - elif value is not None: + if value is not None: Lexeme.set_struct_attr(lex, attr, value) if not is_oov: self._add_lex_to_vocab(lex.orth, lex) @@ -435,17 +440,16 @@ cdef class Vocab: path = util.ensure_path(path) if not path.exists(): path.mkdir() - setters = ["strings", "lexemes", "vectors"] + setters = ["strings", "vectors"] exclude = util.get_serialization_exclude(setters, exclude, kwargs) if "strings" not in exclude: self.strings.to_disk(path / "strings.json") - if "lexemes" not in exclude: - with (path / "lexemes.bin").open("wb") as file_: - file_.write(self.lexemes_to_bytes()) if "vectors" not in "exclude" and self.vectors is not None: self.vectors.to_disk(path) if "lookups" not in "exclude" and self.lookups is not None: self.lookups.to_disk(path) + if "lookups_extra" not in exclude and self.lookups_extra is not None: + self.lookups_extra.to_disk(path, filename="lookups_extra.bin") def from_disk(self, path, exclude=tuple(), **kwargs): """Loads state from a directory. Modifies the object in place and @@ -458,13 +462,10 @@ cdef class Vocab: path = util.ensure_path(path) - getters = ["strings", "lexemes", "vectors"] + getters = ["strings", "vectors"] exclude = util.get_serialization_exclude(getters, exclude, kwargs) if "strings" not in exclude: self.strings.from_disk(path / "strings.json") # TODO: add exclude? 
- if "lexemes" not in exclude: - with (path / "lexemes.bin").open("rb") as file_: - self.lexemes_from_bytes(file_.read()) if "vectors" not in exclude: if self.vectors is not None: self.vectors.from_disk(path, exclude=["strings"]) @@ -472,6 +473,14 @@ cdef class Vocab: link_vectors_to_models(self) if "lookups" not in exclude: self.lookups.from_disk(path) + if "lookups_extra" not in exclude: + self.lookups_extra.from_disk(path, filename="lookups_extra.bin") + if "lexeme_norm" in self.lookups: + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm") + ) + self.length = 0 + self._by_orth = PreshMap() return self def to_bytes(self, exclude=tuple(), **kwargs): @@ -490,9 +499,9 @@ cdef class Vocab: getters = OrderedDict(( ("strings", lambda: self.strings.to_bytes()), - ("lexemes", lambda: self.lexemes_to_bytes()), ("vectors", deserialize_vectors), - ("lookups", lambda: self.lookups.to_bytes()) + ("lookups", lambda: self.lookups.to_bytes()), + ("lookups_extra", lambda: self.lookups_extra.to_bytes()) )) exclude = util.get_serialization_exclude(getters, exclude, kwargs) return util.to_bytes(getters, exclude) @@ -514,99 +523,62 @@ cdef class Vocab: setters = OrderedDict(( ("strings", lambda b: self.strings.from_bytes(b)), - ("lexemes", lambda b: self.lexemes_from_bytes(b)), ("vectors", lambda b: serialize_vectors(b)), - ("lookups", lambda b: self.lookups.from_bytes(b)) + ("lookups", lambda b: self.lookups.from_bytes(b)), + ("lookups_extra", lambda b: self.lookups_extra.from_bytes(b)) )) exclude = util.get_serialization_exclude(setters, exclude, kwargs) util.from_bytes(bytes_data, setters, exclude) + if "lexeme_norm" in self.lookups: + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm") + ) + self.length = 0 + self._by_orth = PreshMap() if self.vectors.name is not None: link_vectors_to_models(self) return self - def lexemes_to_bytes(self): - cdef hash_t key - cdef size_t addr - cdef LexemeC* lexeme = NULL - cdef SerializedLexemeC lex_data - cdef int size = 0 - for key, addr in self._by_orth.items(): - if addr == 0: - continue - size += sizeof(lex_data.data) - byte_string = b"\0" * size - byte_ptr = byte_string - cdef int j - cdef int i = 0 - for key, addr in self._by_orth.items(): - if addr == 0: - continue - lexeme = addr - lex_data = Lexeme.c_to_bytes(lexeme) - for j in range(sizeof(lex_data.data)): - byte_ptr[i] = lex_data.data[j] - i += 1 - return byte_string - - def lexemes_from_bytes(self, bytes bytes_data): - """Load the binary vocabulary data from the given string.""" - cdef LexemeC* lexeme - cdef hash_t key - cdef unicode py_str - cdef int i = 0 - cdef int j = 0 - cdef SerializedLexemeC lex_data - chunk_size = sizeof(lex_data.data) - cdef void* ptr - cdef unsigned char* bytes_ptr = bytes_data - for i in range(0, len(bytes_data), chunk_size): - lexeme = self.mem.alloc(1, sizeof(LexemeC)) - for j in range(sizeof(lex_data.data)): - lex_data.data[j] = bytes_ptr[i+j] - Lexeme.c_from_bytes(lexeme, lex_data) - prev_entry = self._by_orth.get(lexeme.orth) - if prev_entry != NULL: - memcpy(prev_entry, lexeme, sizeof(LexemeC)) - continue - ptr = self.strings._map.get(lexeme.orth) - if ptr == NULL: - continue - py_str = self.strings[lexeme.orth] - if self.strings[py_str] != lexeme.orth: - raise ValueError(Errors.E086.format(string=py_str, - orth_id=lexeme.orth, - hash_id=self.strings[py_str])) - self._by_orth.set(lexeme.orth, lexeme) - 
self.length += 1 - def _reset_cache(self, keys, strings): # I'm not sure this made sense. Disable it for now. raise NotImplementedError + def load_extra_lookups(self, table_name): + if table_name not in self.lookups_extra: + if self.lang + "_extra" in util.registry.lookups: + tables = util.registry.lookups.get(self.lang + "_extra") + for name, filename in tables.items(): + if table_name == name: + data = util.load_language_data(filename) + self.lookups_extra.add_table(name, data) + if table_name not in self.lookups_extra: + self.lookups_extra.add_table(table_name) + return self.lookups_extra.get_table(table_name) + + def pickle_vocab(vocab): sstore = vocab.strings vectors = vocab.vectors morph = vocab.morphology - length = vocab.length data_dir = vocab.data_dir lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) - lexemes_data = vocab.lexemes_to_bytes() + lookups = vocab.lookups + lookups_extra = vocab.lookups_extra return (unpickle_vocab, - (sstore, vectors, morph, data_dir, lex_attr_getters, lexemes_data, length)) + (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra)) def unpickle_vocab(sstore, vectors, morphology, data_dir, - lex_attr_getters, bytes lexemes_data, int length): + lex_attr_getters, lookups, lookups_extra): cdef Vocab vocab = Vocab() - vocab.length = length vocab.vectors = vectors vocab.strings = sstore vocab.morphology = morphology vocab.data_dir = data_dir vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) - vocab.lexemes_from_bytes(lexemes_data) - vocab.length = length + vocab.lookups = lookups + vocab.lookups_extra = lookups_extra return vocab From 0061992d958ca56b2e5ba1ebd1fadee402fd65d7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 19 May 2020 15:59:55 +0200 Subject: [PATCH 103/131] Update Polish tokenizer for UD_Polish-PDB (#5432) Update Polish tokenizer for UD_Polish-PDB, which is a relatively major change from the existing tokenizer. Unused exceptions files and conflicting test cases removed. 
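To make the new exception handling concrete, here is a minimal standalone sketch of the filtering now done in spacy/lang/pl/__init__.py (the dict comprehension is taken from the diff below; the absolute import path is only assumed for the purpose of a self-contained snippet):

    from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS

    # Keep the shared base exceptions, but drop period-final abbreviations,
    # which would otherwise conflict with the new Polish suffix rules that
    # split off a trailing period.
    mod_base_exceptions = {
        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
    }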
Co-authored-by: Matthew Honnibal --- spacy/lang/pl/__init__.py | 11 +- spacy/lang/pl/_tokenizer_exceptions_list.py | 1443 ------------------- spacy/lang/pl/polish_srx_rules_LICENSE.txt | 23 - spacy/lang/pl/punctuation.py | 36 +- spacy/lang/pl/tokenizer_exceptions.py | 26 - spacy/tests/lang/pl/test_tokenizer.py | 36 +- 6 files changed, 39 insertions(+), 1536 deletions(-) delete mode 100644 spacy/lang/pl/_tokenizer_exceptions_list.py delete mode 100644 spacy/lang/pl/polish_srx_rules_LICENSE.txt delete mode 100644 spacy/lang/pl/tokenizer_exceptions.py diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 0540bf535..61608a3d9 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,8 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_SUFFIXES from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -23,10 +23,15 @@ class PolishDefaults(Language.Defaults): lex_attr_getters[NORM] = add_lookups( Language.Defaults.lex_attr_getters[NORM], BASE_NORMS ) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + mod_base_exceptions = { + exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") + } + tokenizer_exceptions = mod_base_exceptions stop_words = STOP_WORDS tag_map = TAG_MAP + prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES @classmethod def create_lemmatizer(cls, nlp=None, lookups=None): diff --git a/spacy/lang/pl/_tokenizer_exceptions_list.py b/spacy/lang/pl/_tokenizer_exceptions_list.py deleted file mode 100644 index 839eccb83..000000000 --- a/spacy/lang/pl/_tokenizer_exceptions_list.py +++ /dev/null @@ -1,1443 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - -# The following list consists of: -# - exceptions generated from polish_srx_rules [1] -# (https://github.com/milekpl/polish_srx_rules) -# - abbreviations parsed from Wikipedia -# - some manually added exceptions -# -# [1] M. Miłkowski and J. Lipski, -# "Using SRX Standard for Sentence Segmentation," in LTC 2009, -# Lecture Notes in Artificial Intelligence 6562, -# Z. Vetulani, Ed. Berlin Heidelberg: Springer-Verlag, 2011, pp. 172–182. 
-PL_BASE_EXCEPTIONS = [ - "0.", - "1.", - "10.", - "2.", - "3.", - "4.", - "5.", - "6.", - "7.", - "8.", - "9.", - "A.A.", - "A.B.", - "A.C.", - "A.D.", - "A.E.", - "A.F.", - "A.G.", - "A.H.", - "A.I.", - "A.J.", - "A.K.", - "A.L.", - "A.M.", - "A.N.", - "A.O.", - "A.P.", - "A.R.", - "A.S.", - "A.T.", - "A.U.", - "A.W.", - "A.Y.", - "A.Z.", - "A.Ó.", - "A.Ą.", - "A.Ć.", - "A.Ę.", - "A.Ł.", - "A.Ń.", - "A.Ś.", - "A.Ź.", - "A.Ż.", - "Ad.", - "Adw.", - "Al.", - "Art.", - "B.A.", - "B.B.", - "B.C.", - "B.D.", - "B.E.", - "B.F.", - "B.G.", - "B.H.", - "B.I.", - "B.J.", - "B.K.", - "B.L.", - "B.M.", - "B.N.", - "B.O.", - "B.P.", - "B.R.", - "B.S.", - "B.T.", - "B.U.", - "B.W.", - "B.Y.", - "B.Z.", - "B.Ó.", - "B.Ą.", - "B.Ć.", - "B.Ę.", - "B.Ł.", - "B.Ń.", - "B.Ś.", - "B.Ź.", - "B.Ż.", - "D.A.", - "D.B.", - "D.C.", - "D.D.", - "D.E.", - "D.F.", - "D.G.", - "D.H.", - "D.I.", - "D.J.", - "D.K.", - "D.L.", - "D.M.", - "D.N.", - "D.O.", - "D.P.", - "D.R.", - "D.S.", - "D.T.", - "D.U.", - "D.W.", - "D.Y.", - "D.Z.", - "D.Ó.", - "D.Ą.", - "D.Ć.", - "D.Ę.", - "D.Ł.", - "D.Ń.", - "D.Ś.", - "D.Ź.", - "D.Ż.", - "Dh.", - "Doc.", - "Dr.", - "Dyr.", - "Dyw.", - "Dz.U.", - "E.A.", - "E.B.", - "E.C.", - "E.D.", - "E.E.", - "E.F.", - "E.G.", - "E.H.", - "E.I.", - "E.J.", - "E.K.", - "E.L.", - "E.M.", - "E.N.", - "E.O.", - "E.P.", - "E.R.", - "E.S.", - "E.T.", - "E.U.", - "E.W.", - "E.Y.", - "E.Z.", - "E.Ó.", - "E.Ą.", - "E.Ć.", - "E.Ę.", - "E.Ł.", - "E.Ń.", - "E.Ś.", - "E.Ź.", - "E.Ż.", - "F.A.", - "F.B.", - "F.C.", - "F.D.", - "F.E.", - "F.F.", - "F.G.", - "F.H.", - "F.I.", - "F.J.", - "F.K.", - "F.L.", - "F.M.", - "F.N.", - "F.O.", - "F.P.", - "F.R.", - "F.S.", - "F.T.", - "F.U.", - "F.W.", - "F.Y.", - "F.Z.", - "F.Ó.", - "F.Ą.", - "F.Ć.", - "F.Ę.", - "F.Ł.", - "F.Ń.", - "F.Ś.", - "F.Ź.", - "F.Ż.", - "G.A.", - "G.B.", - "G.C.", - "G.D.", - "G.E.", - "G.F.", - "G.G.", - "G.H.", - "G.I.", - "G.J.", - "G.K.", - "G.L.", - "G.M.", - "G.N.", - "G.O.", - "G.P.", - "G.R.", - "G.S.", - "G.T.", - "G.U.", - "G.W.", - "G.Y.", - "G.Z.", - "G.Ó.", - "G.Ą.", - "G.Ć.", - "G.Ę.", - "G.Ł.", - "G.Ń.", - "G.Ś.", - "G.Ź.", - "G.Ż.", - "H.A.", - "H.B.", - "H.C.", - "H.D.", - "H.E.", - "H.F.", - "H.G.", - "H.H.", - "H.I.", - "H.J.", - "H.K.", - "H.L.", - "H.M.", - "H.N.", - "H.O.", - "H.P.", - "H.R.", - "H.S.", - "H.T.", - "H.U.", - "H.W.", - "H.Y.", - "H.Z.", - "H.Ó.", - "H.Ą.", - "H.Ć.", - "H.Ę.", - "H.Ł.", - "H.Ń.", - "H.Ś.", - "H.Ź.", - "H.Ż.", - "Hr.", - "I.A.", - "I.B.", - "I.C.", - "I.D.", - "I.E.", - "I.F.", - "I.G.", - "I.H.", - "I.I.", - "I.J.", - "I.K.", - "I.L.", - "I.M.", - "I.N.", - "I.O.", - "I.P.", - "I.R.", - "I.S.", - "I.T.", - "I.U.", - "I.W.", - "I.Y.", - "I.Z.", - "I.Ó.", - "I.Ą.", - "I.Ć.", - "I.Ę.", - "I.Ł.", - "I.Ń.", - "I.Ś.", - "I.Ź.", - "I.Ż.", - "Inż.", - "J.A.", - "J.B.", - "J.C.", - "J.D.", - "J.E.", - "J.F.", - "J.G.", - "J.H.", - "J.I.", - "J.J.", - "J.K.", - "J.L.", - "J.M.", - "J.N.", - "J.O.", - "J.P.", - "J.R.", - "J.S.", - "J.T.", - "J.U.", - "J.W.", - "J.Y.", - "J.Z.", - "J.Ó.", - "J.Ą.", - "J.Ć.", - "J.Ę.", - "J.Ł.", - "J.Ń.", - "J.Ś.", - "J.Ź.", - "J.Ż.", - "K.A.", - "K.B.", - "K.C.", - "K.D.", - "K.E.", - "K.F.", - "K.G.", - "K.H.", - "K.I.", - "K.J.", - "K.K.", - "K.L.", - "K.M.", - "K.N.", - "K.O.", - "K.P.", - "K.R.", - "K.S.", - "K.T.", - "K.U.", - "K.W.", - "K.Y.", - "K.Z.", - "K.Ó.", - "K.Ą.", - "K.Ć.", - "K.Ę.", - "K.Ł.", - "K.Ń.", - "K.Ś.", - "K.Ź.", - "K.Ż.", - "Ks.", - "L.A.", - "L.B.", - "L.C.", - "L.D.", - "L.E.", - "L.F.", - "L.G.", - "L.H.", - "L.I.", - "L.J.", - "L.K.", - 
"L.L.", - "L.M.", - "L.N.", - "L.O.", - "L.P.", - "L.R.", - "L.S.", - "L.T.", - "L.U.", - "L.W.", - "L.Y.", - "L.Z.", - "L.Ó.", - "L.Ą.", - "L.Ć.", - "L.Ę.", - "L.Ł.", - "L.Ń.", - "L.Ś.", - "L.Ź.", - "L.Ż.", - "Lek.", - "M.A.", - "M.B.", - "M.C.", - "M.D.", - "M.E.", - "M.F.", - "M.G.", - "M.H.", - "M.I.", - "M.J.", - "M.K.", - "M.L.", - "M.M.", - "M.N.", - "M.O.", - "M.P.", - "M.R.", - "M.S.", - "M.T.", - "M.U.", - "M.W.", - "M.Y.", - "M.Z.", - "M.Ó.", - "M.Ą.", - "M.Ć.", - "M.Ę.", - "M.Ł.", - "M.Ń.", - "M.Ś.", - "M.Ź.", - "M.Ż.", - "Mat.", - "Mec.", - "Mojż.", - "N.A.", - "N.B.", - "N.C.", - "N.D.", - "N.E.", - "N.F.", - "N.G.", - "N.H.", - "N.I.", - "N.J.", - "N.K.", - "N.L.", - "N.M.", - "N.N.", - "N.O.", - "N.P.", - "N.R.", - "N.S.", - "N.T.", - "N.U.", - "N.W.", - "N.Y.", - "N.Z.", - "N.Ó.", - "N.Ą.", - "N.Ć.", - "N.Ę.", - "N.Ł.", - "N.Ń.", - "N.Ś.", - "N.Ź.", - "N.Ż.", - "Na os.", - "Nadkom.", - "Najśw.", - "Nb.", - "Np.", - "O.A.", - "O.B.", - "O.C.", - "O.D.", - "O.E.", - "O.F.", - "O.G.", - "O.H.", - "O.I.", - "O.J.", - "O.K.", - "O.L.", - "O.M.", - "O.N.", - "O.O.", - "O.P.", - "O.R.", - "O.S.", - "O.T.", - "O.U.", - "O.W.", - "O.Y.", - "O.Z.", - "O.Ó.", - "O.Ą.", - "O.Ć.", - "O.Ę.", - "O.Ł.", - "O.Ń.", - "O.Ś.", - "O.Ź.", - "O.Ż.", - "OO.", - "Oo.", - "P.A.", - "P.B.", - "P.C.", - "P.D.", - "P.E.", - "P.F.", - "P.G.", - "P.H.", - "P.I.", - "P.J.", - "P.K.", - "P.L.", - "P.M.", - "P.N.", - "P.O.", - "P.P.", - "P.R.", - "P.S.", - "P.T.", - "P.U.", - "P.W.", - "P.Y.", - "P.Z.", - "P.Ó.", - "P.Ą.", - "P.Ć.", - "P.Ę.", - "P.Ł.", - "P.Ń.", - "P.Ś.", - "P.Ź.", - "P.Ż.", - "Podkom.", - "Przyp.", - "Ps.", - "Pt.", - "Płk.", - "R.A.", - "R.B.", - "R.C.", - "R.D.", - "R.E.", - "R.F.", - "R.G.", - "R.H.", - "R.I.", - "R.J.", - "R.K.", - "R.L.", - "R.M.", - "R.N.", - "R.O.", - "R.P.", - "R.R.", - "R.S.", - "R.T.", - "R.U.", - "R.W.", - "R.Y.", - "R.Z.", - "R.Ó.", - "R.Ą.", - "R.Ć.", - "R.Ę.", - "R.Ł.", - "R.Ń.", - "R.Ś.", - "R.Ź.", - "R.Ż.", - "Red.", - "Reż.", - "Ryc.", - "Rys.", - "S.A.", - "S.B.", - "S.C.", - "S.D.", - "S.E.", - "S.F.", - "S.G.", - "S.H.", - "S.I.", - "S.J.", - "S.K.", - "S.L.", - "S.M.", - "S.N.", - "S.O.", - "S.P.", - "S.R.", - "S.S.", - "S.T.", - "S.U.", - "S.W.", - "S.Y.", - "S.Z.", - "S.Ó.", - "S.Ą.", - "S.Ć.", - "S.Ę.", - "S.Ł.", - "S.Ń.", - "S.Ś.", - "S.Ź.", - "S.Ż.", - "Sp.", - "Spółdz.", - "Stow.", - "Stoł.", - "Sz.P.", - "Szer.", - "T.A.", - "T.B.", - "T.C.", - "T.D.", - "T.E.", - "T.F.", - "T.G.", - "T.H.", - "T.I.", - "T.J.", - "T.K.", - "T.L.", - "T.M.", - "T.N.", - "T.O.", - "T.P.", - "T.R.", - "T.S.", - "T.T.", - "T.U.", - "T.W.", - "T.Y.", - "T.Z.", - "T.Ó.", - "T.Ą.", - "T.Ć.", - "T.Ę.", - "T.Ł.", - "T.Ń.", - "T.Ś.", - "T.Ź.", - "T.Ż.", - "Tow.", - "Tzw.", - "U.A.", - "U.B.", - "U.C.", - "U.D.", - "U.E.", - "U.F.", - "U.G.", - "U.H.", - "U.I.", - "U.J.", - "U.K.", - "U.L.", - "U.M.", - "U.N.", - "U.O.", - "U.P.", - "U.R.", - "U.S.", - "U.T.", - "U.U.", - "U.W.", - "U.Y.", - "U.Z.", - "U.Ó.", - "U.Ą.", - "U.Ć.", - "U.Ę.", - "U.Ł.", - "U.Ń.", - "U.Ś.", - "U.Ź.", - "U.Ż.", - "W.A.", - "W.B.", - "W.C.", - "W.D.", - "W.E.", - "W.F.", - "W.G.", - "W.H.", - "W.I.", - "W.J.", - "W.K.", - "W.L.", - "W.M.", - "W.N.", - "W.O.", - "W.P.", - "W.R.", - "W.S.", - "W.T.", - "W.U.", - "W.W.", - "W.Y.", - "W.Z.", - "W.Ó.", - "W.Ą.", - "W.Ć.", - "W.Ę.", - "W.Ł.", - "W.Ń.", - "W.Ś.", - "W.Ź.", - "W.Ż.", - "Y.A.", - "Y.B.", - "Y.C.", - "Y.D.", - "Y.E.", - "Y.F.", - "Y.G.", - "Y.H.", - "Y.I.", - "Y.J.", - "Y.K.", - "Y.L.", - "Y.M.", - "Y.N.", - "Y.O.", - "Y.P.", - "Y.R.", - 
"Y.S.", - "Y.T.", - "Y.U.", - "Y.W.", - "Y.Y.", - "Y.Z.", - "Y.Ó.", - "Y.Ą.", - "Y.Ć.", - "Y.Ę.", - "Y.Ł.", - "Y.Ń.", - "Y.Ś.", - "Y.Ź.", - "Y.Ż.", - "Z.A.", - "Z.B.", - "Z.C.", - "Z.D.", - "Z.E.", - "Z.F.", - "Z.G.", - "Z.H.", - "Z.I.", - "Z.J.", - "Z.K.", - "Z.L.", - "Z.M.", - "Z.N.", - "Z.O.", - "Z.P.", - "Z.R.", - "Z.S.", - "Z.T.", - "Z.U.", - "Z.W.", - "Z.Y.", - "Z.Z.", - "Z.Ó.", - "Z.Ą.", - "Z.Ć.", - "Z.Ę.", - "Z.Ł.", - "Z.Ń.", - "Z.Ś.", - "Z.Ź.", - "Z.Ż.", - "Zob.", - "a.", - "ad.", - "adw.", - "afr.", - "ags.", - "akad.", - "al.", - "alb.", - "am.", - "amer.", - "ang.", - "aor.", - "ap.", - "apost.", - "arch.", - "arcyks.", - "art.", - "artyst.", - "asp.", - "astr.", - "aust.", - "austr.", - "austral.", - "b.", - "bałt.", - "bdb.", - "belg.", - "białorus.", - "białost.", - "bm.", - "bot.", - "bp.", - "br.", - "bryg.", - "bryt.", - "bułg.", - "bł.", - "c.b.d.o.", - "c.k.", - "c.o.", - "cbdu.", - "cd.", - "cdn.", - "centr.", - "ces.", - "chem.", - "chir.", - "chiń.", - "chor.", - "chorw.", - "cieśn.", - "cnd.", - "cyg.", - "cyt.", - "cyw.", - "cz.", - "czes.", - "czw.", - "czyt.", - "d.", - "daw.", - "dcn.", - "dekl.", - "demokr.", - "det.", - "dh.", - "diec.", - "dk.", - "dn.", - "doc.", - "doktor h.c.", - "dol.", - "dolnośl.", - "dost.", - "dosł.", - "dot.", - "dr h.c.", - "dr hab.", - "dr.", - "ds.", - "dst.", - "duszp.", - "dypl.", - "dyr.", - "dyw.", - "dł.", - "egz.", - "ekol.", - "ekon.", - "elektr.", - "em.", - "ent.", - "est.", - "europ.", - "ew.", - "fab.", - "farm.", - "fot.", - "fr.", - "franc.", - "g.", - "gastr.", - "gat.", - "gd.", - "gen.", - "geogr.", - "geol.", - "gimn.", - "gm.", - "godz.", - "gorz.", - "gosp.", - "gosp.-polit.", - "gr.", - "gram.", - "grub.", - "górn.", - "głęb.", - "h.c.", - "hab.", - "hist.", - "hiszp.", - "hitl.", - "hm.", - "hot.", - "hr.", - "i in.", - "i s.", - "id.", - "ie.", - "im.", - "in.", - "inż.", - "iron.", - "itd.", - "itp.", - "j.", - "j.a.", - "jez.", - "jn.", - "jw.", - "jwt.", - "k.", - "k.k.", - "k.o.", - "k.p.a.", - "k.p.c.", - "k.r.", - "k.r.o.", - "kard.", - "kark.", - "kasz.", - "kat.", - "katol.", - "kier.", - "kk.", - "kl.", - "kol.", - "kpc.", - "kpt.", - "kr.", - "krak.", - "kryt.", - "ks.", - "książk.", - "kuj.", - "kult.", - "kł.", - "l.", - "laic.", - "lek.", - "lit.", - "lp.", - "lub.", - "m.", - "m.b.", - "m.in.", - "m.p.", - "m.st.", - "mar.", - "maz.", - "małop.", - "mec.", - "med.", - "mgr.", - "min.", - "mn.", - "mn.w.", - "muz.", - "mł.", - "n.", - "n.e.", - "n.p.m.", - "n.p.u.", - "na os.", - "nadkom.", - "najśw.", - "nb.", - "niedz.", - "niem.", - "norw.", - "np.", - "nt.", - "nż.", - "o s.", - "o.", - "oO.", - "ob.", - "odc.", - "odp.", - "ok.", - "oo.", - "op.", - "os.", - "p.", - "p.a.", - "p.f.", - "p.f.v.", - "p.n.e.", - "p.o.", - "p.p.", - "p.p.m.", - "p.r.", - "p.r.v.", - "phm.", - "pie.", - "pl.", - "pn.", - "pocz.", - "pod.", - "podgat.", - "podkarp.", - "podkom.", - "poet.", - "poj.", - "pok.", - "pol.", - "pom.", - "pon.", - "poprz.", - "por.", - "port.", - "posp.", - "pow.", - "poz.", - "poł.", - "pp.", - "ppanc.", - "ppor.", - "ppoż.", - "prawdop.", - "proc.", - "prof.", - "prok.", - "przed Chr.", - "przyp.", - "ps.", - "pseud.", - "pt.", - "pw.", - "półn.", - "płd.", - "płk.", - "płn.", - "r.", - "r.ż.", - "red.", - "reż.", - "ros.", - "rozdz.", - "rtg.", - "rtm.", - "rub.", - "rum.", - "ryc.", - "rys.", - "rz.", - "s.", - "serb.", - "sierż.", - "skr.", - "sob.", - "sp.", - "społ.", - "spółdz.", - "spółgł.", - "st.", - "st.rus.", - "stow.", - "stoł.", - "str.", - "sud.", - "szczec.", - "szer.", 
- "szt.", - "szw.", - "szwajc.", - "słow.", - "t.", - "t.j.", - "tatrz.", - "tel.", - "tj.", - "tow.", - "trl.", - "tryb.", - "ts.", - "tur.", - "tys.", - "tzn.", - "tzw.", - "tłum.", - "u s.", - "ub.", - "ukr.", - "ul.", - "up.", - "ur.", - "v.v.", - "vs.", - "w.", - "warm.", - "wlk.", - "wlkp.", - "woj.", - "wroc.", - "ws.", - "wsch.", - "wt.", - "ww.", - "wyb.", - "wyd.", - "wyj.", - "wym.", - "wyst.", - "wył.", - "wyż.", - "wzgl.", - "wędr.", - "węg.", - "wł.", - "x.", - "xx.", - "zach.", - "zagr.", - "zak.", - "zakł.", - "zal.", - "zam.", - "zast.", - "zaw.", - "zazw.", - "zał.", - "zdr.", - "zew.", - "zewn.", - "ziel.", - "zm.", - "zn.", - "zob.", - "zool.", - "zw.", - "ząbk.", - "Ó.A.", - "Ó.B.", - "Ó.C.", - "Ó.D.", - "Ó.E.", - "Ó.F.", - "Ó.G.", - "Ó.H.", - "Ó.I.", - "Ó.J.", - "Ó.K.", - "Ó.L.", - "Ó.M.", - "Ó.N.", - "Ó.O.", - "Ó.P.", - "Ó.R.", - "Ó.S.", - "Ó.T.", - "Ó.U.", - "Ó.W.", - "Ó.Y.", - "Ó.Z.", - "Ó.Ó.", - "Ó.Ą.", - "Ó.Ć.", - "Ó.Ę.", - "Ó.Ł.", - "Ó.Ń.", - "Ó.Ś.", - "Ó.Ź.", - "Ó.Ż.", - "Ą.A.", - "Ą.B.", - "Ą.C.", - "Ą.D.", - "Ą.E.", - "Ą.F.", - "Ą.G.", - "Ą.H.", - "Ą.I.", - "Ą.J.", - "Ą.K.", - "Ą.L.", - "Ą.M.", - "Ą.N.", - "Ą.O.", - "Ą.P.", - "Ą.R.", - "Ą.S.", - "Ą.T.", - "Ą.U.", - "Ą.W.", - "Ą.Y.", - "Ą.Z.", - "Ą.Ó.", - "Ą.Ą.", - "Ą.Ć.", - "Ą.Ę.", - "Ą.Ł.", - "Ą.Ń.", - "Ą.Ś.", - "Ą.Ź.", - "Ą.Ż.", - "Ć.A.", - "Ć.B.", - "Ć.C.", - "Ć.D.", - "Ć.E.", - "Ć.F.", - "Ć.G.", - "Ć.H.", - "Ć.I.", - "Ć.J.", - "Ć.K.", - "Ć.L.", - "Ć.M.", - "Ć.N.", - "Ć.O.", - "Ć.P.", - "Ć.R.", - "Ć.S.", - "Ć.T.", - "Ć.U.", - "Ć.W.", - "Ć.Y.", - "Ć.Z.", - "Ć.Ó.", - "Ć.Ą.", - "Ć.Ć.", - "Ć.Ę.", - "Ć.Ł.", - "Ć.Ń.", - "Ć.Ś.", - "Ć.Ź.", - "Ć.Ż.", - "ćw.", - "ćwicz.", - "Ę.A.", - "Ę.B.", - "Ę.C.", - "Ę.D.", - "Ę.E.", - "Ę.F.", - "Ę.G.", - "Ę.H.", - "Ę.I.", - "Ę.J.", - "Ę.K.", - "Ę.L.", - "Ę.M.", - "Ę.N.", - "Ę.O.", - "Ę.P.", - "Ę.R.", - "Ę.S.", - "Ę.T.", - "Ę.U.", - "Ę.W.", - "Ę.Y.", - "Ę.Z.", - "Ę.Ó.", - "Ę.Ą.", - "Ę.Ć.", - "Ę.Ę.", - "Ę.Ł.", - "Ę.Ń.", - "Ę.Ś.", - "Ę.Ź.", - "Ę.Ż.", - "Ł.A.", - "Ł.B.", - "Ł.C.", - "Ł.D.", - "Ł.E.", - "Ł.F.", - "Ł.G.", - "Ł.H.", - "Ł.I.", - "Ł.J.", - "Ł.K.", - "Ł.L.", - "Ł.M.", - "Ł.N.", - "Ł.O.", - "Ł.P.", - "Ł.R.", - "Ł.S.", - "Ł.T.", - "Ł.U.", - "Ł.W.", - "Ł.Y.", - "Ł.Z.", - "Ł.Ó.", - "Ł.Ą.", - "Ł.Ć.", - "Ł.Ę.", - "Ł.Ł.", - "Ł.Ń.", - "Ł.Ś.", - "Ł.Ź.", - "Ł.Ż.", - "Łuk.", - "łac.", - "łot.", - "łow.", - "Ń.A.", - "Ń.B.", - "Ń.C.", - "Ń.D.", - "Ń.E.", - "Ń.F.", - "Ń.G.", - "Ń.H.", - "Ń.I.", - "Ń.J.", - "Ń.K.", - "Ń.L.", - "Ń.M.", - "Ń.N.", - "Ń.O.", - "Ń.P.", - "Ń.R.", - "Ń.S.", - "Ń.T.", - "Ń.U.", - "Ń.W.", - "Ń.Y.", - "Ń.Z.", - "Ń.Ó.", - "Ń.Ą.", - "Ń.Ć.", - "Ń.Ę.", - "Ń.Ł.", - "Ń.Ń.", - "Ń.Ś.", - "Ń.Ź.", - "Ń.Ż.", - "Ś.A.", - "Ś.B.", - "Ś.C.", - "Ś.D.", - "Ś.E.", - "Ś.F.", - "Ś.G.", - "Ś.H.", - "Ś.I.", - "Ś.J.", - "Ś.K.", - "Ś.L.", - "Ś.M.", - "Ś.N.", - "Ś.O.", - "Ś.P.", - "Ś.R.", - "Ś.S.", - "Ś.T.", - "Ś.U.", - "Ś.W.", - "Ś.Y.", - "Ś.Z.", - "Ś.Ó.", - "Ś.Ą.", - "Ś.Ć.", - "Ś.Ę.", - "Ś.Ł.", - "Ś.Ń.", - "Ś.Ś.", - "Ś.Ź.", - "Ś.Ż.", - "ŚW.", - "Śp.", - "Św.", - "śW.", - "śl.", - "śp.", - "śr.", - "św.", - "Ź.A.", - "Ź.B.", - "Ź.C.", - "Ź.D.", - "Ź.E.", - "Ź.F.", - "Ź.G.", - "Ź.H.", - "Ź.I.", - "Ź.J.", - "Ź.K.", - "Ź.L.", - "Ź.M.", - "Ź.N.", - "Ź.O.", - "Ź.P.", - "Ź.R.", - "Ź.S.", - "Ź.T.", - "Ź.U.", - "Ź.W.", - "Ź.Y.", - "Ź.Z.", - "Ź.Ó.", - "Ź.Ą.", - "Ź.Ć.", - "Ź.Ę.", - "Ź.Ł.", - "Ź.Ń.", - "Ź.Ś.", - "Ź.Ź.", - "Ź.Ż.", - "Ż.A.", - "Ż.B.", - "Ż.C.", - "Ż.D.", - "Ż.E.", - "Ż.F.", - "Ż.G.", - "Ż.H.", - "Ż.I.", - "Ż.J.", - "Ż.K.", - "Ż.L.", - "Ż.M.", - "Ż.N.", - "Ż.O.", - "Ż.P.", - 
"Ż.R.", - "Ż.S.", - "Ż.T.", - "Ż.U.", - "Ż.W.", - "Ż.Y.", - "Ż.Z.", - "Ż.Ó.", - "Ż.Ą.", - "Ż.Ć.", - "Ż.Ę.", - "Ż.Ł.", - "Ż.Ń.", - "Ż.Ś.", - "Ż.Ź.", - "Ż.Ż.", - "ż.", - "żarg.", - "żart.", - "żyd.", - "żyw.", -] diff --git a/spacy/lang/pl/polish_srx_rules_LICENSE.txt b/spacy/lang/pl/polish_srx_rules_LICENSE.txt deleted file mode 100644 index 995a1b0f7..000000000 --- a/spacy/lang/pl/polish_srx_rules_LICENSE.txt +++ /dev/null @@ -1,23 +0,0 @@ - -Copyright (c) 2019, Marcin Miłkowski -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py index 4e69a3912..aa8adac29 100644 --- a/spacy/lang/pl/punctuation.py +++ b/spacy/lang/pl/punctuation.py @@ -1,22 +1,46 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS +from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS +from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES _quotes = CONCAT_QUOTES.replace("'", "") +_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES + _infixes = ( LIST_ELLIPSES - + [CONCAT_ICONS] + + LIST_ICONS + + LIST_HYPHENS + [ - r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[0-9{al}])\.(?=[0-9{au}])".format(al=ALPHA, au=ALPHA_UPPER), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), - r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), - r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[:<>=\/](?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), - r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes), ] ) +_suffixes = ( + ["''", "’’", r"\.", "…"] + + LIST_PUNCT + + LIST_QUOTES + + LIST_ICONS + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}])\.".format(au=ALPHA_UPPER), + ] +) + + +TOKENIZER_PREFIXES = _prefixes TOKENIZER_INFIXES = 
_infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py deleted file mode 100644 index 9e4814b0f..000000000 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ /dev/null @@ -1,26 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals - -from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS -from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ - - -_exc = {} - -for exc_data in [ - {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV}, - {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN}, - {ORTH: "mgr.", LEMMA: "magister", POS: NOUN}, - {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV}, - {ORTH: "tj.", LEMMA: "to jest", POS: ADV}, - {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}, -]: - _exc[exc_data[ORTH]] = [exc_data] - -for orth in ["w.", "r."]: - _exc[orth] = [{ORTH: orth}] - -for orth in PL_BASE_EXCEPTIONS: - _exc[orth] = [{ORTH: orth}] - -TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/tests/lang/pl/test_tokenizer.py b/spacy/tests/lang/pl/test_tokenizer.py index 9d0034589..9f4f5a38d 100644 --- a/spacy/tests/lang/pl/test_tokenizer.py +++ b/spacy/tests/lang/pl/test_tokenizer.py @@ -4,49 +4,15 @@ from __future__ import unicode_literals import pytest DOT_TESTS = [ - ("tel.", ["tel."]), - ("np.", ["np."]), - ("godz. 21:37", ["godz.", "21:37"]), - ("inż.", ["inż."]), - ("gosp.-polit.", ["gosp.-polit."]), - ("ppoż", ["ppoż"]), - ("płn", ["płn"]), - ("ul.", ["ul."]), - ("jw.", ["jw."]), - ("itd.", ["itd."]), - ("cdn.", ["cdn."]), - ("itp.", ["itp."]), - ("10,- zł", ["10,-", "zł"]), + ("tel.", ["tel", "."]), ("0 zł 99 gr", ["0", "zł", "99", "gr"]), - ("0,99 rub.", ["0,99", "rub."]), - ("dol.", ["dol."]), - ("1000 m n.p.m.", ["1000", "m", "n.p.m."]), - ("m.in.", ["m.in."]), - ("p.n.e.", ["p.n.e."]), - ("Sz.P.", ["Sz.P."]), - ("p.o.", ["p.o."]), - ("k.o.", ["k.o."]), - ("m.st.", ["m.st."]), - ("dra.", ["dra", "."]), - ("pp.", ["pp."]), - ("oo.", ["oo."]), ] HYPHEN_TESTS = [ - ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]), - ("NESS-040C5", ["NESS-040C5"]), - ("JTE-7-31", ["JTE-7-31"]), - ("BAY-59-3074", ["BAY-59-3074"]), - ("BAY-38-7271", ["BAY-38-7271"]), - ("STS-135", ["STS-135"]), - ("5F-PB-22", ["5F-PB-22"]), ("cztero-", ["cztero-"]), ("jedno-", ["jedno-"]), ("dwu-", ["dwu-"]), ("trzy-", ["trzy-"]), - ("b-adoratorzy", ["b-adoratorzy"]), - ("2-3-4 drzewa", ["2-3-4", "drzewa"]), - ("b-drzewa", ["b-drzewa"]), ] From 70da1fd2d6e96256ad863f1e625091c46dac4835 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 19 May 2020 16:01:18 +0200 Subject: [PATCH 104/131] Add warning for misaligned character offset spans (#5007) * Add warning for misaligned character offset spans * Resolve conflict * Filter warnings in example scripts Filter warnings in example scripts to show warnings once, in particular warnings about misaligned entities. 
Co-authored-by: Ines Montani --- examples/training/rehearsal.py | 6 +++++- examples/training/train_ner.py | 9 +++++++-- examples/training/train_new_entity_type.py | 9 +++++++-- spacy/errors.py | 6 +++++- spacy/gold.pyx | 6 ++++++ spacy/tests/test_gold.py | 3 ++- 6 files changed, 32 insertions(+), 7 deletions(-) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index 9ece91427..24b1cea00 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -1,6 +1,7 @@ """Prevent catastrophic forgetting with rehearsal updates.""" import plac import random +import warnings import srsly import spacy from spacy.gold import GoldParse @@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc): pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] sizes = compounding(1.0, 4.0, 1.001) - with nlp.disable_pipes(*other_pipes): + with nlp.disable_pipes(*other_pipes), warnings.catch_warnings(): + # show warnings for misaligned entity spans once + warnings.filterwarnings("once", category=UserWarning, module='spacy') + for itn in range(n_iter): random.shuffle(TRAIN_DATA) random.shuffle(raw_docs) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 01bb6a67b..ff6029567 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -8,12 +8,13 @@ For more details, see the documentation: * NER: https://spacy.io/usage/linguistic-features#named-entities Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 +Last tested with: v2.2.4 """ from __future__ import unicode_literals, print_function import plac import random +import warnings from pathlib import Path import spacy from spacy.util import minibatch, compounding @@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100): # get names of other pipes to disable them during training pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train NER + # only train NER + with nlp.disable_pipes(*other_pipes), warnings.catch_warnings(): + # show warnings for misaligned entity spans once + warnings.filterwarnings("once", category=UserWarning, module='spacy') + # reset and initialize the weights randomly – but only if we're # training a new model if model is None: diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 72d33ad50..e8ff6802a 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -24,12 +24,13 @@ For more details, see the documentation: * NER: https://spacy.io/usage/linguistic-features#named-entities Compatible with: spaCy v2.1.0+ -Last tested with: v2.1.0 +Last tested with: v2.2.4 """ from __future__ import unicode_literals, print_function import plac import random +import warnings from pathlib import Path import spacy from spacy.util import minibatch, compounding @@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): # get names of other pipes to disable them during training pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train NER + # only train NER + with nlp.disable_pipes(*other_pipes), warnings.catch_warnings(): + # show warnings for misaligned entity 
spans once + warnings.filterwarnings("once", category=UserWarning, module='spacy') + sizes = compounding(1.0, 4.0, 1.001) # batch up the examples using spaCy's minibatch for itn in range(n_iter): diff --git a/spacy/errors.py b/spacy/errors.py index d99c96922..1b268d5ab 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -110,7 +110,11 @@ class Warnings(object): "in problems with the vocab further on in the pipeline.") W029 = ("Unable to align tokens with entities from character offsets. " "Discarding entity annotation for the text: {text}.") - + W030 = ("Some entities could not be aligned in the text \"{text}\" with " + "entities \"{entities}\". Use " + "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" + " to check the alignment. Misaligned entities ('-') will be " + "ignored during training.") @add_codes class Errors(object): diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 4b8a4f52d..cf67a2ac7 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -957,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): break else: biluo[token.i] = missing + if "-" in biluo: + ent_str = str(entities) + warnings.warn(Warnings.W030.format( + text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text, + entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str + )) return biluo diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index fc9e624eb..37b877561 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -56,7 +56,8 @@ def test_gold_biluo_misalign(en_vocab): spaces = [True, True, True, True, True, False] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + with pytest.warns(UserWarning): + tags = biluo_tags_from_offsets(doc, entities) assert tags == ["O", "O", "O", "-", "-", "-"] From 40e65d6f6349a55f20f109eb4fbae91489ec54b0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 19 May 2020 16:41:26 +0200 Subject: [PATCH 105/131] Fix most_similar for vectors with unused rows (#5348) * Fix most_similar for vectors with unused rows Address issues related to the unused rows in the vector table and `most_similar`: * Update `most_similar()` to search only through rows that are in use according to `key2row`. * Raise an error when `most_similar(n=n)` is larger than the number of vectors in the table. * Set and restore `_unset` correctly when vectors are added or deserialized so that new vectors are added in the correct row. * Set data and keys to the same length in `Vocab.prune_vectors()` to avoid spurious entries in `key2row`. * Fix regression test using `most_similar` Co-authored-by: Matthew Honnibal --- spacy/errors.py | 2 + spacy/tests/regression/test_issue3001-3500.py | 2 +- spacy/tests/vocab_vectors/test_vectors.py | 45 ++++++++++++++++--- spacy/vectors.pyx | 23 +++++++--- 4 files changed, 59 insertions(+), 13 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 1b268d5ab..f0b8592df 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -564,6 +564,8 @@ class Errors(object): E196 = ("Refusing to write to token.is_sent_end. 
Sentence boundaries can " "only be fixed with token.is_sent_start.") E197 = ("Row out of bounds, unable to add row {row} for key {key}.") + E198 = ("Unable to return {n} most similar vectors for the current vectors " + "table, which contains {n_rows} vectors.") @add_codes diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index d05759c31..effbebb92 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -295,7 +295,7 @@ def test_issue3410(): def test_issue3412(): data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") - vectors = Vectors(data=data) + vectors = Vectors(data=data, keys=["A", "B", "C"]) keys, best_rows, scores = vectors.most_similar( numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f") ) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 16d9801ab..24eb3a1af 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import pytest import numpy -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_equal from spacy._ml import cosine from spacy.vocab import Vocab from spacy.vectors import Vectors @@ -11,7 +11,7 @@ from spacy.tokenizer import Tokenizer from spacy.strings import hash_string from spacy.tokens import Doc -from ..util import add_vecs_to_vocab +from ..util import add_vecs_to_vocab, make_tempdir @pytest.fixture @@ -59,6 +59,11 @@ def most_similar_vectors_data(): ) +@pytest.fixture +def most_similar_vectors_keys(): + return ["a", "b", "c", "d"] + + @pytest.fixture def resize_data(): return numpy.asarray([[0.0, 1.0], [2.0, 3.0]], dtype="f") @@ -146,11 +151,14 @@ def test_set_vector(strings, data): assert list(v[strings[0]]) != list(orig[0]) -def test_vectors_most_similar(most_similar_vectors_data): - v = Vectors(data=most_similar_vectors_data) +def test_vectors_most_similar(most_similar_vectors_data, most_similar_vectors_keys): + v = Vectors(data=most_similar_vectors_data, keys=most_similar_vectors_keys) _, best_rows, _ = v.most_similar(v.data, batch_size=2, n=2, sort=True) assert all(row[0] == i for i, row in enumerate(best_rows)) + with pytest.raises(ValueError): + v.most_similar(v.data, batch_size=2, n=10, sort=True) + def test_vectors_most_similar_identical(): """Test that most similar identical vectors are assigned a score of 1.0.""" @@ -331,6 +339,33 @@ def test_vocab_prune_vectors(): assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) +def test_vectors_serialize(): + data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") + v = Vectors(data=data, keys=["A", "B", "C"]) + b = v.to_bytes() + v_r = Vectors() + v_r.from_bytes(b) + assert_equal(v.data, v_r.data) + assert v.key2row == v_r.key2row + v.resize((5, 4)) + v_r.resize((5, 4)) + row = v.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f")) + row_r = v_r.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f")) + assert row == row_r + assert_equal(v.data, v_r.data) + assert v.is_full == v_r.is_full + with make_tempdir() as d: + v.to_disk(d) + v_r.from_disk(d) + assert_equal(v.data, v_r.data) + assert v.key2row == v_r.key2row + v.resize((5, 4)) + v_r.resize((5, 4)) + row = v.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f")) + row_r = v_r.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f")) + assert row == row_r + assert_equal(v.data, v_r.data) + 
def test_vector_is_oov(): vocab = Vocab(vectors_name="test_vocab_is_oov") data = numpy.ndarray((5, 3), dtype="f") @@ -340,4 +375,4 @@ def test_vector_is_oov(): vocab.set_vector("dog", data[1]) assert vocab["cat"].is_oov is True assert vocab["dog"].is_oov is True - assert vocab["hamster"].is_oov is False + assert vocab["hamster"].is_oov is False \ No newline at end of file diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 2973ddb5b..3da3b01d7 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -212,8 +212,7 @@ cdef class Vectors: copy_shape = (min(shape[0], self.data.shape[0]), min(shape[1], self.data.shape[1])) resized_array[:copy_shape[0], :copy_shape[1]] = self.data[:copy_shape[0], :copy_shape[1]] self.data = resized_array - filled = {row for row in self.key2row.values()} - self._unset = cppset[int]({row for row in range(shape[0]) if row not in filled}) + self._sync_unset() removed_items = [] for key, row in list(self.key2row.items()): if row >= shape[0]: @@ -310,8 +309,8 @@ cdef class Vectors: raise ValueError(Errors.E197.format(row=row, key=key)) if vector is not None: self.data[row] = vector - if self._unset.count(row): - self._unset.erase(self._unset.find(row)) + if self._unset.count(row): + self._unset.erase(self._unset.find(row)) return row def most_similar(self, queries, *, batch_size=1024, n=1, sort=True): @@ -330,11 +329,14 @@ cdef class Vectors: RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)` tuple. """ + filled = sorted(list({row for row in self.key2row.values()})) + if len(filled) < n: + raise ValueError(Errors.E198.format(n=n, n_rows=len(filled))) xp = get_array_module(self.data) - norms = xp.linalg.norm(self.data, axis=1, keepdims=True) + norms = xp.linalg.norm(self.data[filled], axis=1, keepdims=True) norms[norms == 0] = 1 - vectors = self.data / norms + vectors = self.data[filled] / norms best_rows = xp.zeros((queries.shape[0], n), dtype='i') scores = xp.zeros((queries.shape[0], n), dtype='f') @@ -356,7 +358,8 @@ cdef class Vectors: scores[i:i+batch_size] = scores[sorted_index] best_rows[i:i+batch_size] = best_rows[sorted_index] - xp = get_array_module(self.data) + for i, j in numpy.ndindex(best_rows.shape): + best_rows[i, j] = filled[best_rows[i, j]] # Round values really close to 1 or -1 scores = xp.around(scores, decimals=4, out=scores) # Account for numerical error we want to return in range -1, 1 @@ -419,6 +422,7 @@ cdef class Vectors: ("vectors", load_vectors), )) util.from_disk(path, serializers, []) + self._sync_unset() return self def to_bytes(self, **kwargs): @@ -461,4 +465,9 @@ cdef class Vectors: ("vectors", deserialize_weights) )) util.from_bytes(data, deserializers, []) + self._sync_unset() return self + + def _sync_unset(self): + filled = {row for row in self.key2row.values()} + self._unset = cppset[int]({row for row in range(self.data.shape[0]) if row not in filled}) From 0a5b140235bb6a8cfdb35bcd5fdd68d14128733c Mon Sep 17 00:00:00 2001 From: Kevin Lu Date: Tue, 19 May 2020 20:12:21 -0700 Subject: [PATCH 106/131] Update universe.json --- website/meta/universe.json | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index cf587f5f0..724dc3d07 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2172,6 +2172,39 @@ "model_uri = f'runs:/{my_run_id}/model'", "nlp2 = mlflow.spacy.load_model(model_uri=model_uri)" ] + }, + { + "id": "pyate", + "title": "PyATE", + "slogan": "Python Automated Term Extraction", + 
"description": "PyATE is a term extraction library written in Python using Spacy POS tagging with Basic, Combo Basic, C-Value, TermExtractor, and Weirdness.", + "github": "kevinlu1248/pyate", + "pip": "pyate", + "code_example": [ + "from pyate import combo_basic", + "", + "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", + "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", + "", + "print(combo_basic(string).sort_values(ascending=False).head(5))", + "\"\"\"\"\"\"", + "dysfunctional tumor 1.443147", + "tumor suppressors 1.443147", + "genetic changes 1.386294", + "cancer cells 1.386294", + "dysfunctional tumor suppressors 1.298612", + "\"\"\"\"\"\"" + ], + "code_language": "python", + "url": "https://github.com/kevinlu1248/pyate", + "author": "Kevin Lu", + "author_links": { + "twitter": "kevinlu1248", + "github": "kevinlu1248", + "website": "https://github.com/kevinlu1248/pyate" + }, + "category": ["pipeline", "research"], + "tags": ["term_extraction"] } ], From a23b3a5a5042ed99cfd0c9988d1956adb85601c0 Mon Sep 17 00:00:00 2001 From: Kevin Lu Date: Tue, 19 May 2020 20:24:24 -0700 Subject: [PATCH 107/131] Update CONTRIBUTOR_AGREEMENT.md --- .github/CONTRIBUTOR_AGREEMENT.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index da9f244eb..fc974ec95 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [ ] I am signing on behalf of myself as an individual and no other person + * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions. 
@@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | | +| Name | Kevin Lu| | Company name (if applicable) | | -| Title or role (if applicable) | | +| Title or role (if applicable) | Student| | Date | | -| GitHub username | | +| GitHub username | kevinlu1248| | Website (optional) | | From 9a1a5352154a58a83278de3be77aa564af05b40f Mon Sep 17 00:00:00 2001 From: Kevin Lu Date: Tue, 19 May 2020 20:25:45 -0700 Subject: [PATCH 108/131] Create kevinlu1248.md --- .github/contributors/kevinlu1248.md | 106 ++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/kevinlu1248.md diff --git a/.github/contributors/kevinlu1248.md b/.github/contributors/kevinlu1248.md new file mode 100644 index 000000000..fc974ec95 --- /dev/null +++ b/.github/contributors/kevinlu1248.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Kevin Lu| +| Company name (if applicable) | | +| Title or role (if applicable) | Student| +| Date | | +| GitHub username | kevinlu1248| +| Website (optional) | | From 291b9ad7b902edd945cc8430550a6633440c582a Mon Sep 17 00:00:00 2001 From: Kevin Lu Date: Tue, 19 May 2020 20:29:53 -0700 Subject: [PATCH 109/131] Update CONTRIBUTOR_AGREEMENT.md --- .github/CONTRIBUTOR_AGREEMENT.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index fc974ec95..da9f244eb 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [x] I am signing on behalf of myself as an individual and no other person + * [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions. 
@@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Kevin Lu| +| Name | | | Company name (if applicable) | | -| Title or role (if applicable) | Student| +| Title or role (if applicable) | | | Date | | -| GitHub username | kevinlu1248| +| GitHub username | | | Website (optional) | | From 4fa96705379b10b761a7097b1adb12145402cb1f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 20 May 2020 09:56:56 +0200 Subject: [PATCH 110/131] Extend lemmatizer rules for all UPOS tags --- spacy/lemmatizer.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 33908eecf..a070574bb 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -6,6 +6,7 @@ from collections import OrderedDict from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .errors import Errors from .lookups import Lookups +from .parts_of_speech import NAMES as UPOS_NAMES class Lemmatizer(object): @@ -43,17 +44,11 @@ class Lemmatizer(object): lookup_table = self.lookups.get_table("lemma_lookup", {}) if "lemma_rules" not in self.lookups: return [lookup_table.get(string, string)] - if univ_pos in (NOUN, "NOUN", "noun"): - univ_pos = "noun" - elif univ_pos in (VERB, "VERB", "verb"): - univ_pos = "verb" - elif univ_pos in (ADJ, "ADJ", "adj"): - univ_pos = "adj" - elif univ_pos in (PUNCT, "PUNCT", "punct"): - univ_pos = "punct" - elif univ_pos in (PROPN, "PROPN"): - return [string] - else: + if isinstance(univ_pos, int): + univ_pos = UPOS_NAMES.get(univ_pos, "X") + univ_pos = univ_pos.lower() + + if univ_pos in ("", "eol", "space"): return [string.lower()] # See Issue #435 for example of where this logic is requied. if self.is_base_form(univ_pos, morphology): From 78bb9ff5e0e4adc01bd30e227657118d87546f83 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 20 May 2020 14:56:52 +0200 Subject: [PATCH 111/131] doc_or_span -> obj --- spacy/matcher/matcher.pyx | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 4cfab915f..3d99f117a 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -213,28 +213,28 @@ cdef class Matcher: else: yield doc - def __call__(self, object doc_or_span): + def __call__(self, object obj): """Find all token sequences matching the supplied pattern. - doc_or_span (Doc or Span): The document to match over. + obj (Doc / Span): The document to match over. RETURNS (list): A list of `(key, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `label_id` and `key` are both integers. 
""" - if isinstance(doc_or_span, Doc): - doc = doc_or_span + if isinstance(obj, Doc): + doc = obj length = len(doc) - elif isinstance(doc_or_span, Span): - doc = doc_or_span.doc - length = doc_or_span.end - doc_or_span.start + elif isinstance(obj, Span): + doc = obj.doc + length = obj.end - obj.start else: - raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__)) + raise ValueError(Errors.E195.format(good="Doc or Span", got=type(obj).__name__)) if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ and not doc.is_tagged: raise ValueError(Errors.E155.format()) if DEP in self._seen_attrs and not doc.is_parsed: raise ValueError(Errors.E156.format()) - matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length, + matches = find_matches(&self.patterns[0], self.patterns.size(), obj, length, extensions=self._extensions, predicates=self._extra_predicates) for i, (key, start, end) in enumerate(matches): on_match = self._callbacks.get(key, None) @@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()): +cdef find_matches(TokenPatternC** patterns, int n, object obj, int length, extensions=None, predicates=tuple()): """Find matches in a doc, with a compiled array of patterns. Matches are returned as a list of (id, start, end) tuples. @@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt else: nr_extra_attr = 0 extra_attr_values = mem.alloc(length, sizeof(attr_t)) - for i, token in enumerate(doc_or_span): + for i, token in enumerate(obj): for name, index in extensions.items(): value = token._.get(name) if isinstance(value, basestring): @@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt for j in range(n): states.push_back(PatternStateC(patterns[j], i, 0)) transition_states(states, matches, predicate_cache, - doc_or_span[i], extra_attr_values, predicates) + obj[i], extra_attr_values, predicates) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns From 9393253b66b5f9fc6c5e58806cf261da5afd1778 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 20 May 2020 15:18:06 +0200 Subject: [PATCH 112/131] Remove peeking from Parser.begin_training (#5456) Inspect all instances in `Parser.begin_training` rather than only the first 1000. 
--- spacy/syntax/nn_parser.pyx | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index d5c6bf2a8..fafa492c6 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -9,7 +9,6 @@ import numpy cimport cython.parallel import numpy.random cimport numpy as np -from itertools import islice from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from libc.math cimport exp @@ -621,15 +620,15 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: sgd = self.create_optimizer() - doc_sample = [] - gold_sample = [] - for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): + docs = [] + golds = [] + for raw_text, annots_brackets in get_gold_tuples(): for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots - doc_sample.append(Doc(self.vocab, words=words)) - gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags, - heads=heads, deps=deps, entities=ents)) - self.model.begin_training(doc_sample, gold_sample) + docs.append(Doc(self.vocab, words=words)) + golds.append(GoldParse(docs[-1], words=words, tags=tags, + heads=heads, deps=deps, entities=ents)) + self.model.begin_training(docs, golds) if pipeline is not None: self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) From 8cba0e41d8e2797763110e8dd1b3b2ec8a29e719 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 20 May 2020 15:35:08 +0200 Subject: [PATCH 113/131] Return lowercase form as default except for PROPN --- spacy/lemmatizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index a070574bb..1f0f0da3f 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -56,6 +56,11 @@ class Lemmatizer(object): index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) + if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))): + if univ_pos == "propn": + return [string] + else: + return [string.lower()] lemmas = self.lemmatize( string, index_table.get(univ_pos, {}), From daaa7bf45111cd7d033868f875442b494a9dfead Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 20 May 2020 15:51:44 +0200 Subject: [PATCH 114/131] Add option to omit extra lexeme tables in CLI --- spacy/cli/init_model.py | 12 ++++++++++++ spacy/cli/train.py | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 3311a5120..18589a954 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -18,6 +18,7 @@ from wasabi import msg from ..vectors import Vectors from ..errors import Errors, Warnings from ..util import ensure_path, get_lang_class, OOV_RANK +from ..lookups import Lookups try: import ftfy @@ -49,6 +50,7 @@ DEFAULT_OOV_PROB = -20 str, ), model_name=("Optional name for the model meta", "option", "mn", str), + omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), ) def init_model( lang, @@ -61,6 +63,7 @@ def init_model( prune_vectors=-1, vectors_name=None, model_name=None, + omit_extra_lookups=False, ): """ Create a new model from raw data, like word frequencies, Brown clusters @@ -93,6 +96,15 @@ def init_model( with msg.loading("Creating model..."): nlp = create_model(lang, lex_attrs, 
name=model_name) + + # Create empty extra lexeme tables so the data from spacy-lookups-data + # isn't loaded if these features are accessed + if omit_extra_lookups: + nlp.vocab.lookups_extra = Lookups() + nlp.vocab.lookups_extra.add_table("lexeme_cluster") + nlp.vocab.lookups_extra.add_table("lexeme_prob") + nlp.vocab.lookups_extra.add_table("lexeme_settings") + msg.good("Successfully created model") if vectors_loc is not None: add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 7cb2d9745..6ce095c15 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -17,6 +17,7 @@ from .._ml import create_default_optimizer from ..util import use_gpu as set_gpu from ..gold import GoldCorpus from ..compat import path2str +from ..lookups import Lookups from .. import util from .. import about @@ -57,6 +58,7 @@ from .. import about textcat_arch=("Textcat model architecture", "option", "ta", str), textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), + omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), verbose=("Display more information for debug", "flag", "VV", bool), debug=("Run data diagnostics before training", "flag", "D", bool), # fmt: on @@ -96,6 +98,7 @@ def train( textcat_arch="bow", textcat_positive_label=None, tag_map_path=None, + omit_extra_lookups=False, verbose=False, debug=False, ): @@ -247,6 +250,14 @@ def train( # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) + # Create empty extra lexeme tables so the data from spacy-lookups-data + # isn't loaded if these features are accessed + if omit_extra_lookups: + nlp.vocab.lookups_extra = Lookups() + nlp.vocab.lookups_extra.add_table("lexeme_cluster") + nlp.vocab.lookups_extra.add_table("lexeme_prob") + nlp.vocab.lookups_extra.add_table("lexeme_settings") + if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) From c7c4cd5fe13ccae97a4cb9ee211226dfd129a941 Mon Sep 17 00:00:00 2001 From: Kevin Lu Date: Wed, 20 May 2020 09:11:32 -0700 Subject: [PATCH 115/131] Changed pyate code example in universe.json --- website/meta/universe.json | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 724dc3d07..857e26813 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2181,19 +2181,23 @@ "github": "kevinlu1248/pyate", "pip": "pyate", "code_example": [ - "from pyate import combo_basic", - "", - "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", - "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. 
This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", - "", - "print(combo_basic(string).sort_values(ascending=False).head(5))", - "\"\"\"\"\"\"", - "dysfunctional tumor 1.443147", - "tumor suppressors 1.443147", - "genetic changes 1.386294", - "cancer cells 1.386294", - "dysfunctional tumor suppressors 1.298612", - "\"\"\"\"\"\"" + "import spacy", + "from pyate.term_extraction_pipeline import TermExtractionPipeline", + "", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe(TermExtractionPipeline())", + "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", + "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", + "", + "doc = nlp(string)", + "print(doc._.combo_basic.sort_values(ascending=False).head(5))", + "\"\"\"\"\"\"", + "dysfunctional tumor 1.443147", + "tumor suppressors 1.443147", + "genetic changes 1.386294", + "cancer cells 1.386294", + "dysfunctional tumor suppressors 1.298612", + "\"\"\"\"\"\"" ], "code_language": "python", "url": "https://github.com/kevinlu1248/pyate", From 49ef06d793b885c3bd634ac72f38be067246822a Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 20 May 2020 18:49:11 +0200 Subject: [PATCH 116/131] Add option for base model in init-model CLI (#5467) Intended for languages like Chinese with a custom tokenizer. 
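Assuming such a base model is installed, usage might look like the following sketch (the model name and file paths are placeholders, not shipped artifacts):

    from spacy.cli import init_model

    # reuse the base model's custom tokenizer, but rebuild vocab and vectors
    init_model(
        "zh",
        "./zh_vectors_model",         # output directory
        vectors_loc="./vectors.txt",  # placeholder vectors file
        base_model="zh_base_model",   # placeholder base model package
    )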
--- spacy/cli/init_model.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 3311a5120..537afd10f 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -17,7 +17,7 @@ from wasabi import msg from ..vectors import Vectors from ..errors import Errors, Warnings -from ..util import ensure_path, get_lang_class, OOV_RANK +from ..util import ensure_path, get_lang_class, load_model, OOV_RANK try: import ftfy @@ -49,6 +49,7 @@ DEFAULT_OOV_PROB = -20 str, ), model_name=("Optional name for the model meta", "option", "mn", str), + base_model=("Base model (for languages with custom tokenizers)", "option", "b", str), ) def init_model( lang, @@ -61,6 +62,7 @@ def init_model( prune_vectors=-1, vectors_name=None, model_name=None, + base_model=None, ): """ Create a new model from raw data, like word frequencies, Brown clusters @@ -92,7 +94,7 @@ def init_model( lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc) with msg.loading("Creating model..."): - nlp = create_model(lang, lex_attrs, name=model_name) + nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) msg.good("Successfully created model") if vectors_loc is not None: add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name) @@ -152,9 +154,16 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc): return lex_attrs -def create_model(lang, lex_attrs, name=None): - lang_class = get_lang_class(lang) - nlp = lang_class() +def create_model(lang, lex_attrs, name=None, base_model=None): + if base_model: + nlp = load_model(base_model) + # keep the tokenizer but remove any existing pipeline components due to + # potentially conflicting vectors + for pipe in nlp.pipe_names: + nlp.remove_pipe(pipe) + else: + lang_class = get_lang_class(lang) + nlp = lang_class() for lexeme in nlp.vocab: lexeme.rank = OOV_RANK for attrs in lex_attrs: From 36a94c409a50e3d815924197d668e0ae315d4352 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 20 May 2020 23:06:03 +0200 Subject: [PATCH 117/131] failing test to reproduce overlapping spans problem --- spacy/tests/regression/test_issue5458.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 spacy/tests/regression/test_issue5458.py diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py new file mode 100644 index 000000000..33281c858 --- /dev/null +++ b/spacy/tests/regression/test_issue5458.py @@ -0,0 +1,21 @@ +from spacy.lang.en import English +from spacy.lang.en.syntax_iterators import noun_chunks +from spacy.tests.util import get_doc +from spacy.vocab import Vocab + + +def test_issue5458(): + # Test that the noun chunker does not generate overlapping spans + words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] + vocab = Vocab(strings=words) + dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] + pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] + heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10] + + en_doc = get_doc(vocab, words, pos_tags, heads, dependencies) + en_doc.noun_chunks_iterator = noun_chunks + + # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" + nlp = English() + merge_nps = nlp.create_pipe("merge_noun_chunks") + merge_nps(en_doc) From b509a3e7fcadf84c257c1e5168b6dc926b8b2f3d Mon 
Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 20 May 2020 23:06:39 +0200 Subject: [PATCH 118/131] fix: use actual range in 'seen' instead of subtree --- spacy/lang/en/syntax_iterators.py | 4 ++-- spacy/language.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 5ff848124..22f7fcf81 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + if any(j in seen for j in range(word.left_edge.i, word.i + 1)): continue seen.update(j for j in range(word.left_edge.i, word.i + 1)) yield word.left_edge.i, word.i + 1, np_label @@ -46,7 +46,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + if any(j in seen for j in range(word.left_edge.i, word.i + 1)): continue seen.update(j for j in range(word.left_edge.i, word.i + 1)) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/language.py b/spacy/language.py index 703806627..c4eb26bad 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -418,7 +418,7 @@ class Language(object): def __call__(self, text, disable=[], component_cfg=None): """Apply the pipeline to some text. The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string + and can contain arbitrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. From b221bcf1ba3907552d4c3b660d1902b0a1c26b2e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 May 2020 00:17:28 +0200 Subject: [PATCH 119/131] fixing all languages --- spacy/lang/el/syntax_iterators.py | 14 +++++++------- spacy/lang/en/syntax_iterators.py | 10 ++++++---- spacy/lang/fa/syntax_iterators.py | 10 ++++++---- spacy/lang/fr/syntax_iterators.py | 10 ++++++---- spacy/lang/id/syntax_iterators.py | 10 ++++++---- spacy/lang/nb/syntax_iterators.py | 10 ++++++---- spacy/lang/sv/syntax_iterators.py | 10 ++++++---- 7 files changed, 43 insertions(+), 31 deletions(-) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index f02619ac9..5d6398aad 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -31,16 +31,15 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue flag = False if word.pos == NOUN: # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - seen.update( - j for j in range(word.left_edge.i, potential_nmod.i + 1) - ) + w_range = range(word.left_edge.i, potential_nmod.i + 1) + if any(j in seen for j in w_range): + continue + seen.update(j for j in w_range) yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break @@ -54,9 +53,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 
22f7fcf81..0d43ebf37 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -36,9 +36,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(j in seen for j in range(word.left_edge.i, word.i + 1)): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +47,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(j in seen for j in range(word.left_edge.i, word.i + 1)): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 5ff848124..0d43ebf37 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -36,9 +36,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +47,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 9495dcf1e..91b338eb3 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -35,9 +35,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +46,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 9495dcf1e..91b338eb3 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -35,9 +35,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j 
in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +46,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 9495dcf1e..91b338eb3 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -35,9 +35,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +46,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 148884efe..31e3302e9 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -36,9 +36,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +47,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label From 56de520afd2276e80f634ceb01e8c5a51ea64bb5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 14:04:57 +0200 Subject: [PATCH 120/131] Try to fix tests on Travis (2.7) --- spacy/lang/hy/examples.py | 1 + spacy/lang/hy/lex_attrs.py | 1 + spacy/lang/hy/stop_words.py | 3 ++- spacy/lang/zh/__init__.py | 36 ++++++++++++++------------------ spacy/tests/lang/hy/test_text.py | 1 + 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py index b0df31aae..d04204c55 100644 --- a/spacy/lang/hy/examples.py +++ b/spacy/lang/hy/examples.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ 
import unicode_literals diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index 7c1b9592f..910625fb8 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals from ...attrs import LIKE_NUM diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py index c671956a4..3f2f7bb15 100644 --- a/spacy/lang/hy/stop_words.py +++ b/spacy/lang/hy/stop_words.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals @@ -105,6 +106,6 @@ STOP_WORDS = set( յուրաքանչյուր այս մեջ -թ +թ """.split() ) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index ed0b3eb74..508c5a03f 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer): if reset: try: import pkuseg + self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None) except ImportError: if self.use_pkuseg: @@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer): ) raise ImportError(msg) for word in words: - self.pkuseg_seg.preprocesser.insert(word.strip(), '') + self.pkuseg_seg.preprocesser.insert(word.strip(), "") def _get_config(self): config = OrderedDict( @@ -168,21 +169,19 @@ class ChineseTokenizer(DummyTokenizer): return util.to_bytes(serializers, []) def from_bytes(self, data, **kwargs): - pkuseg_features_b = b"" - pkuseg_weights_b = b"" - pkuseg_processors_data = None + data = {"features_b": b"", "weights_b": b"", "processors_data": None} + # pkuseg_features_b = b"" + # pkuseg_weights_b = b"" + # pkuseg_processors_data = None def deserialize_pkuseg_features(b): - nonlocal pkuseg_features_b - pkuseg_features_b = b + data["features_b"] = b def deserialize_pkuseg_weights(b): - nonlocal pkuseg_weights_b - pkuseg_weights_b = b + data["weights_b"] = b def deserialize_pkuseg_processors(b): - nonlocal pkuseg_processors_data - pkuseg_processors_data = srsly.msgpack_loads(b) + data["processors_data"] = srsly.msgpack_loads(b) deserializers = OrderedDict( ( @@ -194,13 +193,13 @@ class ChineseTokenizer(DummyTokenizer): ) util.from_bytes(data, deserializers, []) - if pkuseg_features_b and pkuseg_weights_b: + if data["features_b"] and data["weights_b"]: with tempfile.TemporaryDirectory() as tempdir: tempdir = Path(tempdir) with open(tempdir / "features.pkl", "wb") as fileh: - fileh.write(pkuseg_features_b) + fileh.write(data["features_b"]) with open(tempdir / "weights.npz", "wb") as fileh: - fileh.write(pkuseg_weights_b) + fileh.write(data["weights_b"]) try: import pkuseg except ImportError: @@ -209,13 +208,10 @@ class ChineseTokenizer(DummyTokenizer): + _PKUSEG_INSTALL_MSG ) self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) - if pkuseg_processors_data: - ( - user_dict, - do_process, - common_words, - other_words, - ) = pkuseg_processors_data + if data["processors_data"]: + (user_dict, do_process, common_words, other_words) = data[ + "processors_data" + ] self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py index 6b785bdfc..cbdb77e4e 100644 --- a/spacy/tests/lang/hy/test_text.py +++ b/spacy/tests/lang/hy/test_text.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals import pytest From d8f3190c0a265033ca367097e00cbf085b34615a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 14:14:01 +0200 Subject: [PATCH 
121/131] Tidy up and auto-format --- spacy/cli/debug_data.py | 11 ++++++++--- spacy/cli/init_model.py | 7 ++++++- spacy/errors.py | 3 ++- spacy/lang/da/__init__.py | 1 - spacy/lang/de/stop_words.py | 2 +- spacy/lang/en/tokenizer_exceptions.py | 2 +- spacy/lang/es/punctuation.py | 1 - spacy/lang/fr/tokenizer_exceptions.py | 2 +- spacy/lang/gu/stop_words.py | 14 +++++++------- spacy/lang/hy/__init__.py | 5 +++-- spacy/lang/hy/examples.py | 2 +- spacy/lang/hy/lex_attrs.py | 1 + spacy/lang/hy/stop_words.py | 4 ++-- spacy/lang/hy/tag_map.py | 12 ++++++------ spacy/lang/ml/lex_attrs.py | 2 +- spacy/lang/ml/stop_words.py | 1 - spacy/lang/pl/__init__.py | 2 +- spacy/lang/pl/lemmatizer.py | 1 - spacy/lang/pl/punctuation.py | 4 +++- spacy/lang/sv/lex_attrs.py | 2 +- spacy/lang/ur/tag_map.py | 1 - spacy/lang/zh/__init__.py | 3 ++- spacy/language.py | 8 ++++++-- spacy/tests/conftest.py | 9 +++++++-- spacy/tests/doc/test_creation.py | 12 +++++++++--- spacy/tests/doc/test_token_api.py | 2 ++ spacy/tests/lang/de/test_noun_chunks.py | 4 ++-- spacy/tests/lang/el/test_noun_chunks.py | 4 ++-- spacy/tests/lang/en/test_noun_chunks.py | 4 ++-- spacy/tests/lang/es/test_noun_chunks.py | 4 ++-- spacy/tests/lang/es/test_text.py | 2 +- spacy/tests/lang/fr/test_noun_chunks.py | 4 ++-- spacy/tests/lang/gu/test_text.py | 7 +++---- spacy/tests/lang/id/test_noun_chunks.py | 4 ++-- spacy/tests/lang/ml/test_text.py | 11 ++++++++++- spacy/tests/lang/nb/test_noun_chunks.py | 4 ++-- spacy/tests/lang/sv/test_noun_chunks.py | 4 ++-- spacy/tests/lang/zh/test_serialize.py | 12 +++++++++++- spacy/tests/lang/zh/test_tokenizer.py | 8 ++++++-- spacy/tests/matcher/test_matcher_api.py | 6 +++--- spacy/tests/pipeline/test_sentencizer.py | 4 +++- .../serialize/test_serialize_vocab_strings.py | 14 +++++++++----- spacy/tests/test_gold.py | 4 ++-- spacy/tests/vocab_vectors/test_vectors.py | 3 ++- spacy/util.py | 2 +- 45 files changed, 138 insertions(+), 81 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 279f34f16..7a4a093e2 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -187,12 +187,17 @@ def debug_data( n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values()) msg.warn( "{} words in training data without vectors ({:0.2f}%)".format( - n_missing_vectors, - n_missing_vectors / gold_train_data["n_words"], + n_missing_vectors, n_missing_vectors / gold_train_data["n_words"], ), ) msg.text( - "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose, + "10 most common words without vectors: {}".format( + _format_labels( + gold_train_data["words_missing_vectors"].most_common(10), + counts=True, + ) + ), + show=verbose, ) else: msg.info("No word vectors present in the model") diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 537afd10f..edbd5dff7 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -49,7 +49,12 @@ DEFAULT_OOV_PROB = -20 str, ), model_name=("Optional name for the model meta", "option", "mn", str), - base_model=("Base model (for languages with custom tokenizers)", "option", "b", str), + base_model=( + "Base model (for languages with custom tokenizers)", + "option", + "b", + str, + ), ) def init_model( lang, diff --git a/spacy/errors.py b/spacy/errors.py index f0b8592df..0750ab616 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -8,7 +8,7 @@ def add_codes(err_cls): class ErrorsWithCodes(err_cls): def __getattribute__(self, code): 
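# Illustrative note, not part of the original patch: add_codes wraps the
# Errors/Warnings classes so that an attribute lookup such as Errors.E029
# returns its message prefixed with the code, i.e. "[E029] ...". Dunder
# lookups like __class__ must pass through unchanged, which is what the
# startswith("__") guard below does; the hunk itself only swaps the quote
# style for the auto-formatter.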
msg = super().__getattribute__(code) - if code.startswith('__'): # python system attributes like __class__ + if code.startswith("__"): # python system attributes like __class__ return msg else: return "[{code}] {msg}".format(code=code, msg=msg) @@ -116,6 +116,7 @@ class Warnings(object): " to check the alignment. Misaligned entities ('-') will be " "ignored during training.") + @add_codes class Errors(object): E001 = ("No component '{name}' found in pipeline. Available names: {opts}") diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 92eec44b2..0190656e5 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -9,7 +9,6 @@ from .morph_rules import MORPH_RULES from ..tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ..norm_exceptions import BASE_NORMS from ...language import Language from ...attrs import LANG from ...util import update_exc diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py index 69134124f..0c8b375e0 100644 --- a/spacy/lang/de/stop_words.py +++ b/spacy/lang/de/stop_words.py @@ -47,7 +47,7 @@ kleines kommen kommt können könnt konnte könnte konnten kurz lang lange leicht leider lieber los machen macht machte mag magst man manche manchem manchen mancher manches mehr -mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten +mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten mögen möglich mögt morgen muss muß müssen musst müsst musste mussten na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 62de81912..6a553052b 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -197,7 +197,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: _exc[orth + "d"] = [ {ORTH: orth, LEMMA: word, NORM: word}, - {ORTH: "d", NORM: "'d"} + {ORTH: "d", NORM: "'d"}, ] _exc[orth + "'d've"] = [ diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py index 42335237c..f989221c2 100644 --- a/spacy/lang/es/punctuation.py +++ b/spacy/lang/es/punctuation.py @@ -5,7 +5,6 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA from ..char_classes import merge_chars -from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES _list_units = [u for u in LIST_UNITS if u != "%"] diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index cb1702300..4eb4c1568 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN) TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( - "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp) + "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp) ).match diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py index f641b5720..85d33763d 100644 --- a/spacy/lang/gu/stop_words.py +++ b/spacy/lang/gu/stop_words.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals STOP_WORDS = set( """ -એમ +એમ આ એ રહી @@ -24,7 +24,7 @@ STOP_WORDS = set( તેમને તેમના તેમણે -તેમનું +તેમનું તેમાં અને અહીં @@ -33,12 +33,12 @@ STOP_WORDS = set( થાય જે ને -કે +કે ના ની નો ને -નું +નું શું માં પણ @@ -69,12 +69,12 @@ STOP_WORDS = set( કોઈ કેમ 
કર્યો -કર્યુ +કર્યુ કરે સૌથી -ત્યારબાદ +ત્યારબાદ તથા -દ્વારા +દ્વારા જુઓ જાઓ જ્યારે diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py index 3320edb6c..6aaa965bb 100644 --- a/spacy/lang/hy/__init__.py +++ b/spacy/lang/hy/__init__.py @@ -1,11 +1,12 @@ +# coding: utf8 +from __future__ import unicode_literals + from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP - from ...attrs import LANG from ...language import Language -from ...tokens import Doc class ArmenianDefaults(Language.Defaults): diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py index b0df31aae..323f77b1c 100644 --- a/spacy/lang/hy/examples.py +++ b/spacy/lang/hy/examples.py @@ -1,6 +1,6 @@ +# coding: utf8 from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.hy.examples import sentences diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index 7c1b9592f..910625fb8 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals from ...attrs import LIKE_NUM diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py index c671956a4..d75aad6e2 100644 --- a/spacy/lang/hy/stop_words.py +++ b/spacy/lang/hy/stop_words.py @@ -1,6 +1,6 @@ +# coding: utf8 from __future__ import unicode_literals - STOP_WORDS = set( """ նա @@ -105,6 +105,6 @@ STOP_WORDS = set( յուրաքանչյուր այս մեջ -թ +թ """.split() ) diff --git a/spacy/lang/hy/tag_map.py b/spacy/lang/hy/tag_map.py index 90690c22e..722270110 100644 --- a/spacy/lang/hy/tag_map.py +++ b/spacy/lang/hy/tag_map.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN +from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ TAG_MAP = { @@ -716,7 +716,7 @@ TAG_MAP = { POS: NOUN, "Animacy": "Nhum", "Case": "Dat", - "Number": "Coll", + # "Number": "Coll", "Number": "Sing", "Person": "1", }, @@ -815,7 +815,7 @@ TAG_MAP = { "Animacy": "Nhum", "Case": "Nom", "Definite": "Def", - "Number": "Plur", + # "Number": "Plur", "Number": "Sing", "Poss": "Yes", }, @@ -880,7 +880,7 @@ TAG_MAP = { POS: NOUN, "Animacy": "Nhum", "Case": "Nom", - "Number": "Plur", + # "Number": "Plur", "Number": "Sing", "Person": "2", }, @@ -1223,9 +1223,9 @@ TAG_MAP = { "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": { POS: PRON, "Case": "Nom", - "Number": "Sing", + # "Number": "Sing", "Number": "Plur", - "Person": "3", + # "Person": "3", "Person": "1", "PronType": "Emp", }, diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py index 345da8126..468ad88f8 100644 --- a/spacy/lang/ml/lex_attrs.py +++ b/spacy/lang/ml/lex_attrs.py @@ -55,7 +55,7 @@ _num_words = [ "തൊണ്ണൂറ് ", "നുറ് ", "ആയിരം ", - "പത്തുലക്ഷം" + "പത്തുലക്ഷം", ] diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py index 4012571bc..8bd6a7e02 100644 --- a/spacy/lang/ml/stop_words.py +++ b/spacy/lang/ml/stop_words.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals STOP_WORDS = set( - """ അത് ഇത് diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 61608a3d9..52b662a90 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language 
from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...util import add_lookups from ...lookups import Lookups diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index 2be4b0fb7..cd555b9c2 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from ...lemmatizer import Lemmatizer from ...parts_of_speech import NAMES -from ...errors import Errors class PolishLemmatizer(Lemmatizer): diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py index aa8adac29..c87464b1b 100644 --- a/spacy/lang/pl/punctuation.py +++ b/spacy/lang/pl/punctuation.py @@ -8,7 +8,9 @@ from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES _quotes = CONCAT_QUOTES.replace("'", "") -_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES +_prefixes = _prefixes = [ + r"(długo|krótko|jedno|dwu|trzy|cztero)-" +] + BASE_TOKENIZER_PREFIXES _infixes = ( LIST_ELLIPSES diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py index 4b5278c7b..24d06a97a 100644 --- a/spacy/lang/sv/lex_attrs.py +++ b/spacy/lang/sv/lex_attrs.py @@ -40,7 +40,7 @@ _num_words = [ "miljard", "biljon", "biljard", - "kvadriljon" + "kvadriljon", ] diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py index eebd3a14a..aad548e9b 100644 --- a/spacy/lang/ur/tag_map.py +++ b/spacy/lang/ur/tag_map.py @@ -38,7 +38,6 @@ TAG_MAP = { "NNPC": {POS: PROPN}, "NNC": {POS: NOUN}, "PSP": {POS: ADP}, - ".": {POS: PUNCT}, ",": {POS: PUNCT}, "-LRB-": {POS: PUNCT}, diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index ed0b3eb74..a877169a2 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer): if reset: try: import pkuseg + self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None) except ImportError: if self.use_pkuseg: @@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer): ) raise ImportError(msg) for word in words: - self.pkuseg_seg.preprocesser.insert(word.strip(), '') + self.pkuseg_seg.preprocesser.insert(word.strip(), "") def _get_config(self): config = OrderedDict( diff --git a/spacy/language.py b/spacy/language.py index 703806627..0e5c46459 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -79,7 +79,9 @@ class BaseDefaults(object): lookups=lookups, ) vocab.lex_attr_getters[NORM] = util.add_lookups( - vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm") + vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), + BASE_NORMS, + vocab.lookups.get_table("lexeme_norm"), ) for tag_str, exc in cls.morph_rules.items(): for orth_str, attrs in exc.items(): @@ -974,7 +976,9 @@ class Language(object): serializers = OrderedDict() serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) - serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items()))) + serializers["meta.json"] = lambda: srsly.json_dumps( + OrderedDict(sorted(self.meta.items())) + ) for name, proc in self.pipeline: if name in exclude: continue diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index d26f0ce5c..63bbf2e0a 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -112,6 +112,7 @@ def ga_tokenizer(): def gu_tokenizer(): return get_lang_class("gu").Defaults.create_tokenizer() + @pytest.fixture(scope="session") def 
he_tokenizer(): return get_lang_class("he").Defaults.create_tokenizer() @@ -246,7 +247,9 @@ def yo_tokenizer(): @pytest.fixture(scope="session") def zh_tokenizer_char(): - return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False}) + return get_lang_class("zh").Defaults.create_tokenizer( + config={"use_jieba": False, "use_pkuseg": False} + ) @pytest.fixture(scope="session") @@ -258,7 +261,9 @@ def zh_tokenizer_jieba(): @pytest.fixture(scope="session") def zh_tokenizer_pkuseg(): pytest.importorskip("pkuseg") - return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}) + return get_lang_class("zh").Defaults.create_tokenizer( + config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True} + ) @pytest.fixture(scope="session") diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 8f543e86a..863a7c210 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -50,7 +50,9 @@ def test_create_from_words_and_text(vocab): assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "] assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""] assert doc.text == text - assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()] + assert [t.text for t in doc if not t.text.isspace()] == [ + word for word in words if not word.isspace() + ] # partial whitespace in words words = [" ", "'", "dogs", "'", "\n\n", "run", " "] @@ -60,7 +62,9 @@ def test_create_from_words_and_text(vocab): assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "] assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""] assert doc.text == text - assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()] + assert [t.text for t in doc if not t.text.isspace()] == [ + word for word in words if not word.isspace() + ] # non-standard whitespace tokens words = [" ", " ", "'", "dogs", "'", "\n\n", "run"] @@ -70,7 +74,9 @@ def test_create_from_words_and_text(vocab): assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "] assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""] assert doc.text == text - assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()] + assert [t.text for t in doc if not t.text.isspace()] == [ + word for word in words if not word.isspace() + ] # mismatch between words and text with pytest.raises(ValueError): diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 1c2253dfa..4dcd07ad9 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -181,6 +181,7 @@ def test_is_sent_start(en_tokenizer): doc.is_parsed = True assert len(list(doc.sents)) == 2 + def test_is_sent_end(en_tokenizer): doc = en_tokenizer("This is a sentence. 
This is another.") assert doc[4].is_sent_end is None @@ -213,6 +214,7 @@ def test_token0_has_sent_start_true(): assert doc[1].is_sent_start is None assert not doc.is_sentenced + def test_tokenlast_has_sent_end_true(): doc = Doc(Vocab(), words=["hello", "world"]) assert doc[0].is_sent_end is None diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py index 12ece84b5..8d76ddd79 100644 --- a/spacy/tests/lang/de/test_noun_chunks.py +++ b/spacy/tests/lang/de/test_noun_chunks.py @@ -5,9 +5,9 @@ import pytest def test_noun_chunks_is_parsed_de(de_tokenizer): - """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. + """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' + with a new Vocab here and forcing is_parsed to 'False' to make sure the noun chunks don't run. """ doc = de_tokenizer("Er lag auf seinem") diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py index be14acc81..4f24865d0 100644 --- a/spacy/tests/lang/el/test_noun_chunks.py +++ b/spacy/tests/lang/el/test_noun_chunks.py @@ -5,9 +5,9 @@ import pytest def test_noun_chunks_is_parsed_el(el_tokenizer): - """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. + """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' + with a new Vocab here and forcing is_parsed to 'False' to make sure the noun chunks don't run. """ doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 1109af150..ff67986a5 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -13,9 +13,9 @@ from ...util import get_doc def test_noun_chunks_is_parsed(en_tokenizer): - """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. + """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' + with a new Vocab here and forcing is_parsed to 'False' to make sure the noun chunks don't run. """ doc = en_tokenizer("This is a sentence") diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index 71069d313..66bbd8c3a 100644 --- a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -5,9 +5,9 @@ import pytest def test_noun_chunks_is_parsed_es(es_tokenizer): - """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. + """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' + with a new Vocab here and forcing is_parsed to 'False' to make sure the noun chunks don't run. 
""" doc = es_tokenizer("en Oxford este verano") diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index e237f922d..999e788dd 100644 --- a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -62,4 +62,4 @@ def test_lex_attrs_like_number(es_tokenizer, text, match): @pytest.mark.parametrize("word", ["once"]) def test_es_lex_attrs_capitals(word): assert like_num(word) - assert like_num(word.upper()) \ No newline at end of file + assert like_num(word.upper()) diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 876bc0ea4..ea93a5a35 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -5,9 +5,9 @@ import pytest def test_noun_chunks_is_parsed_fr(fr_tokenizer): - """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. + """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' + with a new Vocab here and forcing is_parsed to 'False' to make sure the noun chunks don't run. """ doc = fr_tokenizer("trouver des travaux antérieurs") diff --git a/spacy/tests/lang/gu/test_text.py b/spacy/tests/lang/gu/test_text.py index 9f3ae45a4..aa8d442a2 100644 --- a/spacy/tests/lang/gu/test_text.py +++ b/spacy/tests/lang/gu/test_text.py @@ -3,17 +3,16 @@ from __future__ import unicode_literals import pytest + def test_gu_tokenizer_handlers_long_text(gu_tokenizer): text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે""" tokens = gu_tokenizer(text) assert len(tokens) == 9 + @pytest.mark.parametrize( "text,length", - [ - ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), - ("ખેતરની ખેડ કરવામાં આવે છે.", 5), - ], + [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)], ) def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length): tokens = gu_tokenizer(text) diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py index 7bac808b3..add76f9b9 100644 --- a/spacy/tests/lang/id/test_noun_chunks.py +++ b/spacy/tests/lang/id/test_noun_chunks.py @@ -5,9 +5,9 @@ import pytest def test_noun_chunks_is_parsed_id(id_tokenizer): - """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. + """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' + with a new Vocab here and forcing is_parsed to 'False' to make sure the noun chunks don't run. 
""" doc = id_tokenizer("sebelas") diff --git a/spacy/tests/lang/ml/test_text.py b/spacy/tests/lang/ml/test_text.py index 92eca6b21..2883cf5bb 100644 --- a/spacy/tests/lang/ml/test_text.py +++ b/spacy/tests/lang/ml/test_text.py @@ -10,7 +10,16 @@ def test_ml_tokenizer_handles_long_text(ml_tokenizer): assert len(tokens) == 5 -@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)]) +@pytest.mark.parametrize( + "text,length", + [ + ( + "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", + 10, + ), + ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5), + ], +) def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length): tokens = ml_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py index 17ec6cfda..653491a64 100644 --- a/spacy/tests/lang/nb/test_noun_chunks.py +++ b/spacy/tests/lang/nb/test_noun_chunks.py @@ -5,9 +5,9 @@ import pytest def test_noun_chunks_is_parsed_nb(nb_tokenizer): - """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. + """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' + with a new Vocab here and forcing is_parsed to 'False' to make sure the noun chunks don't run. """ doc = nb_tokenizer("Smørsausen brukes bl.a. til") diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index 38086c255..a6283b65e 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -7,9 +7,9 @@ from ...util import get_doc def test_noun_chunks_is_parsed_sv(sv_tokenizer): - """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. + """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' + with a new Vocab here and forcing is_parsed to 'False' to make sure the noun chunks don't run. 
""" doc = sv_tokenizer("Studenten läste den bästa boken") diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py index 58133a88e..56f092ed8 100644 --- a/spacy/tests/lang/zh/test_serialize.py +++ b/spacy/tests/lang/zh/test_serialize.py @@ -34,5 +34,15 @@ def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg): @pytest.mark.slow def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): - nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}}) + nlp = Chinese( + meta={ + "tokenizer": { + "config": { + "use_jieba": False, + "use_pkuseg": True, + "pkuseg_model": "medicine", + } + } + } + ) zh_tokenizer_serialize(nlp.tokenizer) diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index 035798aa1..28240b6a9 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -43,12 +43,16 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens): def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg): user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie) zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"]) - updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie) + updated_user_dict = _get_pkuseg_trie_data( + zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie + ) assert len(user_dict) == len(updated_user_dict) - 1 # reset user dict zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True) - reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie) + reset_user_dict = _get_pkuseg_trie_data( + zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie + ) assert len(reset_user_dict) == 0 diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 0295ada82..1112195da 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -265,15 +265,15 @@ def test_matcher_regex_shape(en_vocab): @pytest.mark.parametrize( - "cmp, bad", + "cmp, bad", [ ("==", ["a", "aaa"]), ("!=", ["aa"]), (">=", ["a"]), ("<=", ["aaa"]), (">", ["a", "aa"]), - ("<", ["aa", "aaa"]) - ] + ("<", ["aa", "aaa"]), + ], ) def test_matcher_compare_length(en_vocab, cmp, bad): matcher = Matcher(en_vocab) diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 7e58b3e98..ee9220a29 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -106,7 +106,9 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents): ), ], ) -def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents): +def test_sentencizer_custom_punct( + en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents +): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=punct_chars) doc = sentencizer(doc) diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 63faf44fc..3be0a75b3 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -37,7 +37,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1.strings) == len(strings1) + 1 # 
adds _SP + assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"]) @@ -56,9 +56,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): assert strings1 == [s for s in vocab1_d.strings if s != "_SP"] assert strings2 == [s for s in vocab2_d.strings if s != "_SP"] if strings1 == strings2: - assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"] + assert [s for s in vocab1_d.strings if s != "_SP"] == [ + s for s in vocab2_d.strings if s != "_SP" + ] else: - assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"] + assert [s for s in vocab1_d.strings if s != "_SP"] != [ + s for s in vocab2_d.strings if s != "_SP" + ] @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -76,9 +80,8 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): def test_deserialize_vocab_seen_entries(strings, lex_attr): # Reported in #2153 vocab = Vocab(strings=strings) - length = len(vocab) vocab.from_bytes(vocab.to_bytes()) - assert len(vocab.strings) == len(strings) + 1 # adds _SP + assert len(vocab.strings) == len(strings) + 1 # adds _SP @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -130,6 +133,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): else: assert list(sstore1_d) != list(sstore2_d) + @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_pickle_vocab(strings, lex_attr): vocab = Vocab(strings=strings) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 37b877561..53665d852 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -112,7 +112,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): data = ( "I'll return the ₹54 amount", { - "words": ["I", "'ll", "return", "the", "₹", "54", "amount",], + "words": ["I", "'ll", "return", "the", "₹", "54", "amount"], "entities": [(16, 19, "MONEY")], }, ) @@ -122,7 +122,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): data = ( "I'll return the $54 amount", { - "words": ["I", "'ll", "return", "the", "$", "54", "amount",], + "words": ["I", "'ll", "return", "the", "$", "54", "amount"], "entities": [(16, 19, "MONEY")], }, ) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 24eb3a1af..1821f8abc 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -366,6 +366,7 @@ def test_vectors_serialize(): assert row == row_r assert_equal(v.data, v_r.data) + def test_vector_is_oov(): vocab = Vocab(vectors_name="test_vocab_is_oov") data = numpy.ndarray((5, 3), dtype="f") @@ -375,4 +376,4 @@ def test_vector_is_oov(): vocab.set_vector("dog", data[1]) assert vocab["cat"].is_oov is True assert vocab["dog"].is_oov is True - assert vocab["hamster"].is_oov is False \ No newline at end of file + assert vocab["hamster"].is_oov is False diff --git a/spacy/util.py b/spacy/util.py index d4cdca4e0..419c99bc0 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -774,7 +774,7 @@ def get_words_and_spaces(words, text): except ValueError: raise ValueError(Errors.E194.format(text=text, words=words)) if word_start > 0: - text_words.append(text[text_pos:text_pos+word_start]) + text_words.append(text[text_pos : text_pos + word_start]) text_spaces.append(False) text_pos += word_start text_words.append(word) From 69fb4bedf20384b475779ee58521e7aa94cf4852 Mon Sep 17 
00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 14:14:28 +0200
Subject: [PATCH 122/131] Revert "doc_or_span -> obj"

This reverts commit 78bb9ff5e0e4adc01bd30e227657118d87546f83.

---
 spacy/matcher/matcher.pyx | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 3d99f117a..4cfab915f 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -213,28 +213,28 @@ cdef class Matcher:
         else:
             yield doc
 
-    def __call__(self, object obj):
+    def __call__(self, object doc_or_span):
        """Find all token sequences matching the supplied pattern.
 
-        obj (Doc / Span): The document to match over.
+        doc_or_span (Doc or Span): The document to match over.
         RETURNS (list): A list of `(key, start, end)` tuples,
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `label_id` and `key` are both integers.
         """
-        if isinstance(obj, Doc):
-            doc = obj
+        if isinstance(doc_or_span, Doc):
+            doc = doc_or_span
             length = len(doc)
-        elif isinstance(obj, Span):
-            doc = obj.doc
-            length = obj.end - obj.start
+        elif isinstance(doc_or_span, Span):
+            doc = doc_or_span.doc
+            length = doc_or_span.end - doc_or_span.start
         else:
-            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(obj).__name__))
+            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__))
         if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
                 and not doc.is_tagged:
             raise ValueError(Errors.E155.format())
         if DEP in self._seen_attrs and not doc.is_parsed:
             raise ValueError(Errors.E156.format())
-        matches = find_matches(&self.patterns[0], self.patterns.size(), obj, length,
+        matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length,
                                extensions=self._extensions, predicates=self._extra_predicates)
         for i, (key, start, end) in enumerate(matches):
             on_match = self._callbacks.get(key, None)
@@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
     return matcher
 
 
-cdef find_matches(TokenPatternC** patterns, int n, object obj, int length, extensions=None, predicates=tuple()):
+cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()):
     """Find matches in a doc, with a compiled array of patterns.
     Matches are returned as a list of (id, start, end) tuples.
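An illustrative usage note, not part of the patch series: the `(key, start, end)` tuples described in the docstring above map straight onto spans. A minimal sketch, assuming the v2-era `Matcher.add(key, on_match, *patterns)` signature and a hypothetical pattern name:

    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    matcher = Matcher(nlp.vocab)
    # hypothetical pattern; the on_match callback is left as None
    matcher.add("HelloWorld", None, [{"LOWER": "hello"}, {"LOWER": "world"}])
    doc = nlp("hello world")
    for key, start, end in matcher(doc):
        span = doc[start:end]  # each match tuple describes doc[start:end]
        print(nlp.vocab.strings[key], span.text)  # key is an integer string-store ID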
@@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object obj, int length, exten else: nr_extra_attr = 0 extra_attr_values = mem.alloc(length, sizeof(attr_t)) - for i, token in enumerate(obj): + for i, token in enumerate(doc_or_span): for name, index in extensions.items(): value = token._.get(name) if isinstance(value, basestring): @@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object obj, int length, exten for j in range(n): states.push_back(PatternStateC(patterns[j], i, 0)) transition_states(states, matches, predicate_cache, - obj[i], extra_attr_values, predicates) + doc_or_span[i], extra_attr_values, predicates) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns From b1f45c9da3631d7d18002b8a939cccc6c24dd90b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 14:19:58 +0200 Subject: [PATCH 123/131] obj -> doclike --- spacy/lang/de/syntax_iterators.py | 6 +++--- spacy/lang/el/syntax_iterators.py | 6 +++--- spacy/lang/en/syntax_iterators.py | 6 +++--- spacy/lang/es/syntax_iterators.py | 4 ++-- spacy/lang/fa/syntax_iterators.py | 6 +++--- spacy/lang/fr/syntax_iterators.py | 6 +++--- spacy/lang/id/syntax_iterators.py | 6 +++--- spacy/lang/nb/syntax_iterators.py | 6 +++--- spacy/lang/sv/syntax_iterators.py | 6 +++--- spacy/matcher/matcher.pyx | 24 ++++++++++++------------ 10 files changed, 38 insertions(+), 38 deletions(-) diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index 13bb857ca..73c1b1a6e 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -28,7 +28,7 @@ def noun_chunks(obj): "og", "app", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -38,7 +38,7 @@ def noun_chunks(obj): close_app = doc.vocab.strings.add("nk") rbracket = 0 - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if i < rbracket: continue if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index f02619ac9..4317bdeb4 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases. Works on both Doc and Span. """ @@ -14,7 +14,7 @@ def noun_chunks(obj): # obj tag corrects some DEP tagger mistakes. # Further improvement of the models will eliminate the need for this tag. labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. 
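# Illustrative note, not part of the diff: Span.doc returns the parent Doc,
# so this one assignment lets the same noun_chunks implementation accept
# either type; the enumerate(doclike) loop further down still visits only
# the tokens that the doclike argument itself covers.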
if not doc.is_parsed: raise ValueError(Errors.E029) @@ -24,7 +24,7 @@ def noun_chunks(obj): nmod = doc.vocab.strings.add("nmod") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 5ff848124..6d366ec90 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -20,7 +20,7 @@ def noun_chunks(obj): "attr", "ROOT", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -29,7 +29,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 0badddca1..d403183ff 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -5,8 +5,8 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX from ...errors import Errors -def noun_chunks(obj): - doc = obj.doc +def noun_chunks(doclike): + doc = doclike.doc if not doc.is_parsed: raise ValueError(Errors.E029) diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 5ff848124..6d366ec90 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -20,7 +20,7 @@ def noun_chunks(obj): "attr", "ROOT", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -29,7 +29,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 9495dcf1e..2ed2c1b35 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -19,7 +19,7 @@ def noun_chunks(obj): "nmod", "nmod:poss", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. 
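# Illustrative note, not part of the diff: the `seen` set a few lines below
# records token indices already emitted as part of a chunk, so a conjunct or
# nested head whose left edge falls inside an earlier noun phrase is skipped
# rather than yielded as an overlapping chunk.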
if not doc.is_parsed: raise ValueError(Errors.E029) @@ -28,7 +28,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 9495dcf1e..2ed2c1b35 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -19,7 +19,7 @@ def noun_chunks(obj): "nmod", "nmod:poss", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -28,7 +28,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 9495dcf1e..2ed2c1b35 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -19,7 +19,7 @@ def noun_chunks(obj): "nmod", "nmod:poss", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -28,7 +28,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 148884efe..84493ae79 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -20,7 +20,7 @@ def noun_chunks(obj): "nmod", "nmod:poss", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -29,7 +29,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 4cfab915f..0c1a56187 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -213,28 +213,28 @@ cdef class Matcher: else: yield doc - def __call__(self, object doc_or_span): + def __call__(self, object doclike): """Find all token sequences matching the supplied pattern. 
- doc_or_span (Doc or Span): The document to match over. + doclike (Doc or Span): The document to match over. RETURNS (list): A list of `(key, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `label_id` and `key` are both integers. """ - if isinstance(doc_or_span, Doc): - doc = doc_or_span + if isinstance(doclike, Doc): + doc = doclike length = len(doc) - elif isinstance(doc_or_span, Span): - doc = doc_or_span.doc - length = doc_or_span.end - doc_or_span.start + elif isinstance(doclike, Span): + doc = doclike.doc + length = doclike.end - doclike.start else: - raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__)) + raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ and not doc.is_tagged: raise ValueError(Errors.E155.format()) if DEP in self._seen_attrs and not doc.is_parsed: raise ValueError(Errors.E156.format()) - matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length, + matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates) for i, (key, start, end) in enumerate(matches): on_match = self._callbacks.get(key, None) @@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple()): """Find matches in a doc, with a compiled array of patterns. Matches are returned as a list of (id, start, end) tuples. @@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt else: nr_extra_attr = 0 extra_attr_values = mem.alloc(length, sizeof(attr_t)) - for i, token in enumerate(doc_or_span): + for i, token in enumerate(doclike): for name, index in extensions.items(): value = token._.get(name) if isinstance(value, basestring): @@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt for j in range(n): states.push_back(PatternStateC(patterns[j], i, 0)) transition_states(states, matches, predicate_cache, - doc_or_span[i], extra_attr_values, predicates) + doclike[i], extra_attr_values, predicates) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns From e2fe83e35d21afed9e12e9810921228b551e628a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 14:20:29 +0200 Subject: [PATCH 124/131] Refer to correct object --- spacy/lang/es/syntax_iterators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index d403183ff..5fda35211 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -21,7 +21,7 @@ def noun_chunks(doclike): np_right_deps = [doc.vocab.strings.add(label) for label in right_labels] stop_deps = [doc.vocab.strings.add(label) for label in stop_labels] token = doc[0] - while token and token.i < len(doc): + while token and token.i < len(doclike): if token.pos in [PROPN, NOUN, PRON]: left, right = noun_bounds( doc, token, np_left_deps, np_right_deps, stop_deps From bea863acd255407887806d1089c1f63896cdf084 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 14:24:38 +0200 Subject: 
[PATCH 125/131] Fix naming conflict and formatting --- spacy/lang/zh/__init__.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 508c5a03f..9d1cb71a7 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -169,19 +169,16 @@ class ChineseTokenizer(DummyTokenizer): return util.to_bytes(serializers, []) def from_bytes(self, data, **kwargs): - data = {"features_b": b"", "weights_b": b"", "processors_data": None} - # pkuseg_features_b = b"" - # pkuseg_weights_b = b"" - # pkuseg_processors_data = None + pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None} def deserialize_pkuseg_features(b): - data["features_b"] = b + pkuseg_data["features_b"] = b def deserialize_pkuseg_weights(b): - data["weights_b"] = b + pkuseg_data["weights_b"] = b def deserialize_pkuseg_processors(b): - data["processors_data"] = srsly.msgpack_loads(b) + pkuseg_data["processors_data"] = srsly.msgpack_loads(b) deserializers = OrderedDict( ( @@ -193,13 +190,13 @@ class ChineseTokenizer(DummyTokenizer): ) util.from_bytes(data, deserializers, []) - if data["features_b"] and data["weights_b"]: + if pkuseg_data["features_b"] and pkuseg_data["weights_b"]: with tempfile.TemporaryDirectory() as tempdir: tempdir = Path(tempdir) with open(tempdir / "features.pkl", "wb") as fileh: - fileh.write(data["features_b"]) + fileh.write(pkuseg_data["features_b"]) with open(tempdir / "weights.npz", "wb") as fileh: - fileh.write(data["weights_b"]) + fileh.write(pkuseg_data["weights_b"]) try: import pkuseg except ImportError: @@ -208,10 +205,9 @@ class ChineseTokenizer(DummyTokenizer): + _PKUSEG_INSTALL_MSG ) self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) - if data["processors_data"]: - (user_dict, do_process, common_words, other_words) = data[ - "processors_data" - ] + if pkuseg_data["processors_data"]: + processors_data = pkuseg_data["processors_data"] + (user_dict, do_process, common_words, other_words) = processors_data self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) From a9cb2882cb98674614e72232c4bc5133b92fa501 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 15:17:39 +0200 Subject: [PATCH 126/131] Rename argument: doc_or_span/obj -> doclike (#5463) * doc_or_span -> obj * Revert "doc_or_span -> obj" This reverts commit 78bb9ff5e0e4adc01bd30e227657118d87546f83. * obj -> doclike * Refer to correct object --- spacy/lang/de/syntax_iterators.py | 6 +++--- spacy/lang/el/syntax_iterators.py | 6 +++--- spacy/lang/en/syntax_iterators.py | 6 +++--- spacy/lang/es/syntax_iterators.py | 6 +++--- spacy/lang/fa/syntax_iterators.py | 6 +++--- spacy/lang/fr/syntax_iterators.py | 6 +++--- spacy/lang/id/syntax_iterators.py | 6 +++--- spacy/lang/nb/syntax_iterators.py | 6 +++--- spacy/lang/sv/syntax_iterators.py | 6 +++--- spacy/matcher/matcher.pyx | 24 ++++++++++++------------ 10 files changed, 39 insertions(+), 39 deletions(-) diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index 13bb857ca..73c1b1a6e 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. 
""" @@ -28,7 +28,7 @@ def noun_chunks(obj): "og", "app", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -38,7 +38,7 @@ def noun_chunks(obj): close_app = doc.vocab.strings.add("nk") rbracket = 0 - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if i < rbracket: continue if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index f02619ac9..4317bdeb4 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases. Works on both Doc and Span. """ @@ -14,7 +14,7 @@ def noun_chunks(obj): # obj tag corrects some DEP tagger mistakes. # Further improvement of the models will eliminate the need for this tag. labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -24,7 +24,7 @@ def noun_chunks(obj): nmod = doc.vocab.strings.add("nmod") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 5ff848124..6d366ec90 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -20,7 +20,7 @@ def noun_chunks(obj): "attr", "ROOT", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. 
if not doc.is_parsed: raise ValueError(Errors.E029) @@ -29,7 +29,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 0badddca1..5fda35211 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -5,8 +5,8 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX from ...errors import Errors -def noun_chunks(obj): - doc = obj.doc +def noun_chunks(doclike): + doc = doclike.doc if not doc.is_parsed: raise ValueError(Errors.E029) @@ -21,7 +21,7 @@ def noun_chunks(obj): np_right_deps = [doc.vocab.strings.add(label) for label in right_labels] stop_deps = [doc.vocab.strings.add(label) for label in stop_labels] token = doc[0] - while token and token.i < len(doc): + while token and token.i < len(doclike): if token.pos in [PROPN, NOUN, PRON]: left, right = noun_bounds( doc, token, np_left_deps, np_right_deps, stop_deps diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 5ff848124..6d366ec90 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -20,7 +20,7 @@ def noun_chunks(obj): "attr", "ROOT", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -29,7 +29,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 9495dcf1e..2ed2c1b35 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -19,7 +19,7 @@ def noun_chunks(obj): "nmod", "nmod:poss", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -28,7 +28,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 9495dcf1e..2ed2c1b35 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. 
""" @@ -19,7 +19,7 @@ def noun_chunks(obj): "nmod", "nmod:poss", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -28,7 +28,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 9495dcf1e..2ed2c1b35 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -19,7 +19,7 @@ def noun_chunks(obj): "nmod", "nmod:poss", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -28,7 +28,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 148884efe..84493ae79 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -def noun_chunks(obj): +def noun_chunks(doclike): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ @@ -20,7 +20,7 @@ def noun_chunks(obj): "nmod", "nmod:poss", ] - doc = obj.doc # Ensure works on both Doc and Span. + doc = doclike.doc # Ensure works on both Doc and Span. if not doc.is_parsed: raise ValueError(Errors.E029) @@ -29,7 +29,7 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() - for i, word in enumerate(obj): + for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 4cfab915f..0c1a56187 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -213,28 +213,28 @@ cdef class Matcher: else: yield doc - def __call__(self, object doc_or_span): + def __call__(self, object doclike): """Find all token sequences matching the supplied pattern. - doc_or_span (Doc or Span): The document to match over. + doclike (Doc or Span): The document to match over. RETURNS (list): A list of `(key, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `label_id` and `key` are both integers. 
""" - if isinstance(doc_or_span, Doc): - doc = doc_or_span + if isinstance(doclike, Doc): + doc = doclike length = len(doc) - elif isinstance(doc_or_span, Span): - doc = doc_or_span.doc - length = doc_or_span.end - doc_or_span.start + elif isinstance(doclike, Span): + doc = doclike.doc + length = doclike.end - doclike.start else: - raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__)) + raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ and not doc.is_tagged: raise ValueError(Errors.E155.format()) if DEP in self._seen_attrs and not doc.is_parsed: raise ValueError(Errors.E156.format()) - matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length, + matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates) for i, (key, start, end) in enumerate(matches): on_match = self._callbacks.get(key, None) @@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple()): """Find matches in a doc, with a compiled array of patterns. Matches are returned as a list of (id, start, end) tuples. @@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt else: nr_extra_attr = 0 extra_attr_values = mem.alloc(length, sizeof(attr_t)) - for i, token in enumerate(doc_or_span): + for i, token in enumerate(doclike): for name, index in extensions.items(): value = token._.get(name) if isinstance(value, basestring): @@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt for j in range(n): states.push_back(PatternStateC(patterns[j], i, 0)) transition_states(states, matches, predicate_cache, - doc_or_span[i], extra_attr_values, predicates) + doclike[i], extra_attr_values, predicates) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns From c6ec19c844ac8325b40a5e6be9a058882b617915 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 17:30:05 +0200 Subject: [PATCH 127/131] Add missing declaration --- spacy/tests/lang/hy/test_tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/lang/hy/test_tokenizer.py b/spacy/tests/lang/hy/test_tokenizer.py index 424fb886f..3eeb8b54e 100644 --- a/spacy/tests/lang/hy/test_tokenizer.py +++ b/spacy/tests/lang/hy/test_tokenizer.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals import pytest From f7d10da555c089a2015fd0101b6198db395d82fc Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 May 2020 19:15:57 +0200 Subject: [PATCH 128/131] avoid unnecessary loop to check overlapping noun chunks --- spacy/lang/el/syntax_iterators.py | 16 +++++----------- spacy/lang/en/syntax_iterators.py | 14 ++++---------- spacy/lang/fa/syntax_iterators.py | 14 ++++---------- spacy/lang/fr/syntax_iterators.py | 14 ++++---------- spacy/lang/id/syntax_iterators.py | 14 ++++---------- spacy/lang/nb/syntax_iterators.py | 14 ++++---------- spacy/lang/sv/syntax_iterators.py | 14 ++++---------- 7 files changed, 29 insertions(+), 71 deletions(-) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py 
index 5d6398aad..b5811c337 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -23,12 +23,12 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") nmod = doc.vocab.strings.add("nmod") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: flag = False @@ -36,15 +36,12 @@ def noun_chunks(obj): # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - w_range = range(word.left_edge.i, potential_nmod.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = potential_nmod.i + 1 yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break if flag is False: - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: # covers the case: έχει όμορφα και έξυπνα παιδιά @@ -53,10 +50,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 0d43ebf37..dbb2d6c9f 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -28,18 +28,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -47,10 +44,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 0d43ebf37..dbb2d6c9f 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -28,18 +28,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = 
word.head @@ -47,10 +44,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 91b338eb3..b38be57fc 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -27,18 +27,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,10 +43,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 91b338eb3..b38be57fc 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -27,18 +27,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,10 +43,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 91b338eb3..b38be57fc 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -27,18 +27,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, 
word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,10 +43,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 31e3302e9..12d351148 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -28,18 +28,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -47,10 +44,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label From 51715b9f720e115fe91f4684c589c3e5666cec5b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 May 2020 19:56:56 +0200 Subject: [PATCH 129/131] span / noun chunk has +1 because end is exclusive --- spacy/lang/el/syntax_iterators.py | 6 +++--- spacy/lang/en/syntax_iterators.py | 4 ++-- spacy/lang/fa/syntax_iterators.py | 4 ++-- spacy/lang/fr/syntax_iterators.py | 4 ++-- spacy/lang/id/syntax_iterators.py | 4 ++-- spacy/lang/nb/syntax_iterators.py | 4 ++-- spacy/lang/sv/syntax_iterators.py | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 10fa94f8c..4a40e28c2 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -36,12 +36,12 @@ def noun_chunks(doclike): # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - prev_end = potential_nmod.i + 1 + prev_end = potential_nmod.i yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break if flag is False: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: # covers the case: έχει όμορφα και έξυπνα παιδιά @@ -50,7 +50,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 91152bd50..0f2b28b58 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ 
-36,7 +36,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -44,7 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 91152bd50..0f2b28b58 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -44,7 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 3523e2f02..d6c12e69f 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -35,7 +35,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -43,7 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 3523e2f02..d6c12e69f 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -35,7 +35,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -43,7 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 3523e2f02..d6c12e69f 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -35,7 +35,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -43,7 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 99621e6a9..84d295f96 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ 
b/spacy/lang/sv/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -44,7 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label From 0f1beb5ff27bf19e14ddc3a8b80e2521a782c03c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 20:05:03 +0200 Subject: [PATCH 130/131] Tidy up and avoid absolute spacy imports in core --- spacy/cli/evaluate.py | 3 +-- spacy/kb.pxd | 5 ++--- spacy/kb.pyx | 17 ++++++----------- spacy/language.py | 5 +---- 4 files changed, 10 insertions(+), 20 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 8a84684e5..be994de73 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals, division, print_function import plac -import spacy from timeit import default_timer as timer from wasabi import msg @@ -45,7 +44,7 @@ def evaluate( msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = GoldCorpus(data_path, data_path) if model.startswith("blank:"): - nlp = spacy.blank(model.replace("blank:", "")) + nlp = util.get_lang_class(model.replace("blank:", ""))() else: nlp = util.load_model(model) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index d5aa382b1..518ce0f4e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t from libc.stdio cimport FILE -from spacy.vocab cimport Vocab +from .vocab cimport Vocab from .typedefs cimport hash_t from .structs cimport KBEntryC, AliasC @@ -113,7 +113,7 @@ cdef class KnowledgeBase: return new_index cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil: - """ + """ Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. 
https://github.com/explosion/preshed/issues/17 @@ -169,4 +169,3 @@ cdef class Reader: cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 cdef int _read(self, void* value, size_t size) except -1 - diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 36a6dbd93..076f25267 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,23 +1,20 @@ # cython: infer_types=True # cython: profile=True # coding: utf8 -import warnings - -from spacy.errors import Errors, Warnings - -from pathlib import Path from cymem.cymem cimport Pool from preshed.maps cimport PreshMap - from cpython.exc cimport PyErr_SetFromErrno - from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t +from libcpp.vector cimport vector + +import warnings +from os import path +from pathlib import Path from .typedefs cimport hash_t -from os import path -from libcpp.vector cimport vector +from .errors import Errors, Warnings cdef class Candidate: @@ -586,5 +583,3 @@ cdef class Reader: cdef int _read(self, void* value, size_t size) except -1: status = fread(value, size, 1, self._fp) return status - - diff --git a/spacy/language.py b/spacy/language.py index 0e5c46459..dae7d96a2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -4,10 +4,7 @@ from __future__ import absolute_import, unicode_literals import random import itertools import warnings - from thinc.extra import load_nlp - -from spacy.util import minibatch import weakref import functools from collections import OrderedDict @@ -852,7 +849,7 @@ class Language(object): *[mp.Pipe(False) for _ in range(n_process)] ) - batch_texts = minibatch(texts, batch_size) + batch_texts = util.minibatch(texts, batch_size) # Sender sends texts to the workers. # This is necessary to properly handle infinite length of texts. # (In this case, all data cannot be sent to the workers at once) From 53da6bd6724d5ab26da597faa275816fa3e1093e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 20:45:33 +0200 Subject: [PATCH 131/131] Add course to landing [ci skip] --- website/src/styles/landing.module.sass | 1 + website/src/widgets/landing.js | 47 ++++++++++++++------------ 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/website/src/styles/landing.module.sass b/website/src/styles/landing.module.sass index e36e36c0a..c29c0fffb 100644 --- a/website/src/styles/landing.module.sass +++ b/website/src/styles/landing.module.sass @@ -86,6 +86,7 @@ .banner-content-small display: block + margin-bottom: 0 !important .banner-title display: block diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 9aeec0cdc..c96905733 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -9,7 +9,6 @@ import { LandingGrid, LandingCard, LandingCol, - LandingButton, LandingDemo, LandingBannerGrid, LandingBanner, @@ -19,7 +18,8 @@ import { H2 } from '../components/typography' import { Ul, Li } from '../components/list' import Button from '../components/button' import Link from '../components/link' -import irlBackground from '../images/spacy-irl.jpg' + +import courseImage from '../../docs/images/course.jpg' import BenchmarksChoi from 'usage/_benchmarks-choi.md' @@ -148,13 +148,35 @@ const Landing = ({ data }) => { + + + Advanced NLP with spaCy: A free online course + +
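One detail from the tidy-up patch above: in `evaluate.py`, `spacy.blank(lang)` is replaced by `util.get_lang_class(lang)()`, which builds the same empty pipeline without importing the top-level `spacy` package from inside the library. A minimal equivalence sketch, assuming spaCy 2.x:

    import spacy
    from spacy import util

    nlp_a = util.get_lang_class("en")()  # what the patched evaluate.py now does
    nlp_b = spacy.blank("en")            # the top-level helper it replaces
    assert type(nlp_a) is type(nlp_b)    # both are blank English pipelines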
+
+ In this free and interactive online course you’ll learn how to + use spaCy to build advanced natural language understanding systems, using both + rule-based and machine learning approaches. It includes{' '} + 55 exercises featuring videos, slide decks, multiple-choice + questions and interactive coding practice in the browser. +
+ Prodigy is an annotation tool so efficient that data scientists @@ -165,25 +187,6 @@ const Landing = ({ data }) => { update your model in real-time and chain models together to build more complex systems. - - - We were pleased to invite the spaCy community and other folks working on Natural - Language Processing to Berlin this summer for a small and intimate event{' '} - July 6, 2019. We booked a beautiful venue, hand-picked an - awesome lineup of speakers and scheduled plenty of social time to get to know - each other and exchange ideas. The YouTube playlist includes 12 talks about NLP - research, development and applications, with keynotes by Sebastian Ruder - (DeepMind) and Yoav Goldberg (Allen AI). -