From 9f740a9891d6c118eeb154dd819dba58d93db8ac Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 26 Feb 2020 14:59:03 +0100 Subject: [PATCH 001/105] Add a few more Danish tokenizer exceptions --- spacy/lang/da/tokenizer_exceptions.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index d669fb981..89b083186 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -70,6 +70,7 @@ for orth in [ "A/S", "B.C.", "BK.", + "B.T.", "Dr.", "Boul.", "Chr.", @@ -79,6 +80,7 @@ for orth in [ "Hf.", "i/s", "I/S", + "Inc.", "Kprs.", "L.A.", "Ll.", @@ -149,6 +151,7 @@ for orth in [ "bygn.", "c/o", "ca.", + "cm.", "cand.", "d.d.", "d.m.", @@ -172,10 +175,12 @@ for orth in [ "dl.", "do.", "dobb.", + "dr.", "dr.h.c", "dr.phil.", "ds.", "dvs.", + "d.v.s.", "e.b.", "e.l.", "e.o.", @@ -297,10 +302,14 @@ for orth in [ "kap.", "kbh.", "kem.", + "kg.", + "kgs.", "kgl.", "kl.", "kld.", + "km.", "km/t", + "km/t.", "knsp.", "komm.", "kons.", @@ -311,6 +320,7 @@ for orth in [ "kt.", "ktr.", "kv.", + "kvm.", "kvt.", "l.c.", "lab.", @@ -357,6 +367,7 @@ for orth in [ "nto.", "nuv.", "o/m", + "o/m.", "o.a.", "o.fl.", "o.h.", @@ -526,6 +537,7 @@ for orth in [ "vejl.", "vh.", "vha.", + "vind.", "vs.", "vsa.", "vær.", From 5f680042647ef7d0c71a5041f33558bf81e656d8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Mar 2020 11:05:00 +0100 Subject: [PATCH 002/105] Port over gitignore changes from develop Prevents stale files when switching branches --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 828258603..edcbba4d5 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ corpora/ keys/ *.json.gz +# Tests +spacy/tests/package/setup.cfg +spacy/tests/package/pyproject.toml +spacy/tests/package/requirements.txt + # Website website/.cache/ website/public/ From 1d6aec805d5c03ad8a039466e98ed3a619e650c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Mar 2020 11:17:20 +0100 Subject: [PATCH 003/105] Fix formatting and update docs for v2.2.4 --- spacy/cli/debug_data.py | 25 ++++++++++++++++--------- website/docs/api/cli.md | 30 ++++++++++++++++++++---------- website/docs/api/doc.md | 22 ++++++++++++---------- website/docs/api/span.md | 30 ++++++++++++++++++++++++++---- website/docs/api/top-level.md | 32 ++++++++++++++++---------------- website/meta/languages.json | 2 ++ 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 0e12a594c..c5e1ff6cf 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000 @plac.annotations( + # fmt: off lang=("model language", "positional", None, str), train_path=("location of JSON-formatted training data", "positional", None, Path), dev_path=("location of JSON-formatted development data", "positional", None, Path), tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), base_model=("name of model to update (optional)", "option", "b", str), - pipeline=( - "Comma-separated names of pipeline components to train", - "option", - "p", - str, - ), + pipeline=("Comma-separated names of pipeline components to train", "option", "p", str), ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), verbose=("Print additional information and explanations", "flag", "V", bool), no_format=("Don't pretty-print the results", "flag", "NF", bool), + # fmt: on 
) def debug_data( lang, @@ -235,13 +232,17 @@ def debug_data( if gold_train_data["ws_ents"]: msg.fail( - "{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"]) + "{} invalid whitespace entity span(s)".format( + gold_train_data["ws_ents"] + ) ) has_ws_ents_error = True if gold_train_data["punct_ents"]: msg.warn( - "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"]) + "{} entity span(s) with punctuation".format( + gold_train_data["punct_ents"] + ) ) has_punct_ents_warning = True @@ -592,7 +593,13 @@ def _compile_gold(train_docs, pipeline): if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: # "Illegal" whitespace entity data["ws_ents"] += 1 - if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]: + if label.startswith(("B-", "U-", "L-")) and doc[i].text in [ + ".", + "'", + "!", + "?", + ",", + ]: # punctuation entity: could be replaced by whitespace when training with noise, # so add a warning to alert the user to this unexpected side effect. data["punct_ents"] += 1 diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 2f7346491..e47695efb 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -184,16 +184,17 @@ low data labels and more. $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] ``` -| Argument | Type | Description | -| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | -| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | -| `--verbose`, `-V` | flag | Print additional information and explanations. | -| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | +| Argument | Type | Description | +| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | +| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | +| `--tag-map-path`, `-tm` 2.2.3 | option | Location of JSON-formatted tag map. | +| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | +| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | +| `--verbose`, `-V` | flag | Print additional information and explanations. | +| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. 
| @@ -368,6 +369,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | | `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. | | `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--replace-components`, `-R` | flag | Replace components from the base model. | | `--vectors`, `-v` | option | Model to load vectors from. | | `--n-iter`, `-n` | option | Number of iterations (default: `30`). | | `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. | @@ -378,6 +380,13 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | | `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` | | `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | +| `--width`, `-cw` 2.2.4 | option | Width of CNN layers of `Tok2Vec` component. | +| `--conv-depth`, `-cd` 2.2.4 | option | Depth of CNN layers of `Tok2Vec` component. | +| `--cnn-window`, `-cW` 2.2.4 | option | Window size for CNN layers of `Tok2Vec` component. | +| `--cnn-pieces`, `-cP` 2.2.4 | option | Maxout size for CNN layers of `Tok2Vec` component. | +| `--use-chars`, `-chr` 2.2.4 | flag | Whether to use character-based embedding of `Tok2Vec` component. | +| `--bilstm-depth`, `-lstm` 2.2.4 | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). | +| `--embed-rows`, `-er` 2.2.4 | option | Number of embedding rows of `Tok2Vec` component. | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | | `--orth-variant-level`, `-ovl` 2.2 | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. | @@ -385,6 +394,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--textcat-multilabel`, `-TML` 2.2 | flag | Text classification classes aren't mutually exclusive (multilabel). | | `--textcat-arch`, `-ta` 2.2 | option | Text classification model architecture. Defaults to `"bow"`. | | `--textcat-positive-label`, `-tpl` 2.2 | option | Text classification positive label for binary classes with two labels. | +| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | | `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 4f948e425..87b854a8c 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -7,9 +7,10 @@ source: spacy/tokens/doc.pyx A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to -compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs. 
-The Python-level `Token` and [`Span`](/api/span) objects are views of this -array, i.e. they don't own the data themselves. +compressed binary strings. The `Doc` object holds an array of +[`TokenC`](/api/cython-structs#tokenc) structs. The Python-level `Token` and +[`Span`](/api/span) objects are views of this array, i.e. they don't own the +data themselves. ## Doc.\_\_init\_\_ {#init tag="method"} @@ -197,13 +198,14 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ------------------------------------------------------- | -| `start` | int | The index of the first character of the span. | -| `end` | int | The index of the last character after the span. | -| `label` | uint64 / unicode | A label to attach to the Span, e.g. for named entities. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object or `None`. | +| Name | Type | Description | +| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | ## Doc.similarity {#similarity tag="method" model="vectors"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 64b77b89d..3833bbca9 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -172,6 +172,28 @@ Remove a previously registered extension. | `name` | unicode | Name of the extension. | | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +## Span.char_span {#char_span tag="method" new="2.2.4"} + +Create a `Span` object from the slice `span.text[start:end]`. Returns `None` if +the character indices don't map to a valid span. + +> #### Example +> +> ```python +> doc = nlp("I like New York") +> span = doc[1:4].char_span(5, 13, label="GPE") +> assert span.text == "New York" +> ``` + +| Name | Type | Description | +| ----------- | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | + ## Span.similarity {#similarity tag="method" model="vectors"} Make a semantic similarity estimate. The default estimate is cosine similarity @@ -293,10 +315,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. 
> assert doc2.text == "New York" > ``` -| Name | Type | Description | -| ----------------- | ----- | ---------------------------------------------------- | -| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | -| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | +| Name | Type | Description | +| ---------------- | ----- | ---------------------------------------------------- | +| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | +| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | ## Span.root {#root tag="property" model="parser"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 266df87f0..217c51794 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Type | Description | Default | -| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | -| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | -| `add_lemma` | bool | Print the lemma's in a separate row below the token texts in the `dep` visualisation. | `False` | -| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | -| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | -| `font` | unicode | Font name or font family for all text. | `'Arial'` | -| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | -| `arrow_stroke` | int | Width of arrow path in px. | `2` | -| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | -| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | -| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | -| `distance` | int | Distance between words in px. | `175` / `150` (compact) | +| Name | Type | Description | Default | +| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | +| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | +| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | +| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | +| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | +| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | +| `font` | unicode | Font name or font family for all text. 
| `'Arial'` | +| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | +| `arrow_stroke` | int | Width of arrow path in px. | `2` | +| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | +| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | +| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | +| `distance` | int | Distance between words in px. | `175` / `150` (compact) | #### Named Entity Visualizer options {#displacy_options-ent} diff --git a/website/meta/languages.json b/website/meta/languages.json index c22ddad69..8834aaddc 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -95,6 +95,8 @@ "has_examples": true }, { "code": "hr", "name": "Croatian", "has_examples": true }, + { "code": "eu", "name": "Basque", "has_examples": true }, + { "code": "yo", "name": "Yoruba", "has_examples": true }, { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true }, From eccf6b16866defc66db9869603e9597a4ecb82b5 Mon Sep 17 00:00:00 2001 From: Renaud Richardet Date: Mon, 9 Mar 2020 14:49:11 +0100 Subject: [PATCH 004/105] small typo in code sample --- website/docs/usage/rule-based-matching.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index f8866aec1..0ab74034e 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1119,7 +1119,7 @@ entityruler = EntityRuler(nlp) patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)] other_pipes = [p for p in nlp.pipe_names if p != "tagger"] -with nlp.disable_pipes(*disable_pipes): +with nlp.disable_pipes(*other_pipes): entityruler.add_patterns(patterns) ``` From 1724a4f75b3a1ee5ceec39bbaf14b82051c11e90 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 9 Mar 2020 18:08:18 +0100 Subject: [PATCH 005/105] additional information if doc is empty --- spacy/tests/matcher/test_matcher_api.py | 9 ++++++++- spacy/tokens/doc.pyx | 2 +- website/docs/api/doc.md | 8 ++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index a826a0a0e..74d4b8b00 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -5,7 +5,7 @@ import pytest import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher -from spacy.tokens import Doc, Token +from spacy.tokens import Doc, Token, Span from ..doc.test_underscore import clean_underscore @@ -458,3 +458,10 @@ def test_matcher_callback(en_vocab): doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) matches = matcher(doc) mock.assert_called_once_with(matcher, doc, 0, matches) + +def test_matcher_span(matcher): + text = "JavaScript is good but Java is better" + doc = Doc(matcher.vocab, words=text.split()) + span = Span(doc, 0, 3) + matches = matcher(span.as_doc()) + assert len(matches) == 1 \ No newline at end of file diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0c90929c3..ec0cd66b8 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -260,7 +260,7 @@ cdef class Doc: def is_nered(self): """Check if the document has named entities set. 
Will return True if
        *any* of the tokens has a named entity tag set (even if the others are
-        unknown values).
+        unknown values), or if the document is empty.
        """
        if len(self) == 0:
            return True

diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 87b854a8c..ab85c1deb 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -657,10 +657,10 @@ The L2 norm of the document's vector representation.
| `user_data` | - | A generic storage area, for user custom data. |
| `lang` 2.1 | int | Language of the document's vocabulary. |
| `lang_` 2.1 | unicode | Language of the document's vocabulary. |
-| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
-| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
-| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
-| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. |
+| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
+| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
+| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
+| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
| `sentiment` | float | The document's positivity/negativity score, if available. |
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. 
| From c4d030dbf68990e7af6b6a87d6add829906806bf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 9 Mar 2020 18:10:54 +0100 Subject: [PATCH 006/105] remove accidental commit --- spacy/tests/matcher/test_matcher_api.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 74d4b8b00..a826a0a0e 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -5,7 +5,7 @@ import pytest import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher -from spacy.tokens import Doc, Token, Span +from spacy.tokens import Doc, Token from ..doc.test_underscore import clean_underscore @@ -458,10 +458,3 @@ def test_matcher_callback(en_vocab): doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) matches = matcher(doc) mock.assert_called_once_with(matcher, doc, 0, matches) - -def test_matcher_span(matcher): - text = "JavaScript is good but Java is better" - doc = Doc(matcher.vocab, words=text.split()) - span = Span(doc, 0, 3) - matches = matcher(span.as_doc()) - assert len(matches) == 1 \ No newline at end of file From ba47d5a5cb29297c653af1543f9dff9039dab449 Mon Sep 17 00:00:00 2001 From: Himanshu Garg <35988194+merrcury@users.noreply.github.com> Date: Tue, 10 Mar 2020 15:03:29 +0530 Subject: [PATCH 007/105] Update LICENSE Year --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 11221f687..87b814ce4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (C) 2016-2019 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal +Copyright (C) 2016-2020 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 27d1300bdb23afc407500f30ae5071889b2cf6de Mon Sep 17 00:00:00 2001 From: Himanshu Garg <35988194+merrcury@users.noreply.github.com> Date: Tue, 10 Mar 2020 15:11:07 +0530 Subject: [PATCH 008/105] Create merrcury.md --- .github/contributors/merrcury.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/merrcury.md diff --git a/.github/contributors/merrcury.md b/.github/contributors/merrcury.md new file mode 100644 index 000000000..056a790eb --- /dev/null +++ b/.github/contributors/merrcury.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. 
+ +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT
+mark both statements:
+
+ * [X] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Himanshu Garg |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2020-03-10 |
+| GitHub username | merrcury |
+| Website (optional) | |

From 26a90f011b8c21dfc06940579479aaff8006ff74 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 12 Mar 2020 11:30:41 +0100
Subject: [PATCH 009/105] Set version to v2.2.4

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 365c2adbb..84dc86aa8 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.2.4.dev0"
+__version__ = "2.2.4"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 423849f94a09cb5979e5bb7953c576d6e50b1b3c Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 13 Mar 2020 09:25:23 +0100
Subject: [PATCH 010/105] Fix sents comparison in test util

Due to changes to `Span` (#5005), spans from different documents are now never
equal. Check `Token.is_sent_start` values instead.
---
 spacy/tests/util.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 52768dd41..a0d6273a9 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -116,8 +116,7 @@ def assert_docs_equal(doc1, doc2):
     assert [t.head.i for t in doc1] == [t.head.i for t in doc2]
     assert [t.dep for t in doc1] == [t.dep for t in doc2]

-    if doc1.is_parsed and doc2.is_parsed:
-        assert [s for s in doc1.sents] == [s for s in doc2.sents]
+    assert [t.is_sent_start for t in doc1] == [t.is_sent_start for t in doc2]

     assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2]
     assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2]

From a0ffa346c0371c6f2fd7c5ae7e9f5a26e36bfc76 Mon Sep 17 00:00:00 2001
From: Mark Abraham
Date: Fri, 13 Mar 2020 14:07:26 +0100
Subject: [PATCH 011/105] Fix broken link in docs

---
 website/docs/usage/saving-loading.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index 70983198f..8e2c30d82 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -94,7 +94,7 @@ docs = list(doc_bin.get_docs(nlp.vocab))

 If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized
 as well, which includes the values of
-[extension attributes](/processing-pipelines#custom-components-attributes) (if
+[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if
From 9cde7eb08c6f06683f0a0085835f8909ce2c56fe Mon Sep 17 00:00:00 2001 From: nihil <666@nabovarme.dk> Date: Fri, 13 Mar 2020 17:58:29 +0100 Subject: [PATCH 012/105] add spacy_syllables to universe + sign contributor agreement --- .github/contributors/sloev.md | 106 ++++++++++++++++++++++++++++++++++ website/meta/universe.json | 35 +++++++++++ 2 files changed, 141 insertions(+) create mode 100644 .github/contributors/sloev.md diff --git a/.github/contributors/sloev.md b/.github/contributors/sloev.md new file mode 100644 index 000000000..d151d4606 --- /dev/null +++ b/.github/contributors/sloev.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Johannes Valbjørn | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-13 | +| GitHub username | sloev | +| Website (optional) | https://sloev.github.io | diff --git a/website/meta/universe.json b/website/meta/universe.json index 0ff622521..91361e234 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1965,6 +1965,41 @@ }, "category": ["pipeline"], "tags": ["phrase extraction", "ner", "summarization", "graph algorithms", "textrank"] + }, + { + "id": "spacy_syllables", + "title": "Spacy Syllables", + "slogan": "Multilingual syllable annotations", + "description": "Spacy Syllables is a pipeline component that adds multilingual syllable annotations to Tokens. 
It uses Pyphen under the hood and has support for a long list of languages.", + "github": "sloev/spacy-syllables", + "pip": "spacy_syllables", + "code_example": [ + "import spacy", + "from spacy_syllables import SpacySyllables", + "", + "nlp = spacy.load('en_core_web_sm')", + "syllables = SpacySyllables(nlp)", + "nlp.add_pipe(syllables, after='tagger')", + "", + "doc = nlp('terribly long')", + "", + "data = [", + " (token.text, token._.syllables, token._.syllables_count)", + " for token in doc", + "]", + "", + "assert data == [", + " ('terribly', ['ter', 'ri', 'bly'], 3),", + " ('long', ['long'], 1)", + "]" + ], + "thumb": "https://raw.githubusercontent.com/sloev/spacy-syllables/master/logo.png", + "author": "Johannes Valbjørn", + "author_links": { + "github": "sloev" + }, + "category": ["pipeline"], + "tags": ["syllables", "multilingual"] } ], From 36e35324759482744e97265e8b89768f4311cb1a Mon Sep 17 00:00:00 2001 From: Alan Chan Date: Sun, 15 Mar 2020 02:06:32 +0800 Subject: [PATCH 013/105] Remove unfinished sentence --- website/docs/usage/adding-languages.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 4b12c6be1..70411ec0b 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -622,13 +622,13 @@ categorizer is to use the [`spacy train`](/api/cli#train) command-line utility. In order to use this, you'll need training and evaluation data in the [JSON format](/api/annotation#json-input) spaCy expects for training. -You can now train the model using a corpus for your language annotated with If -your data is in one of the supported formats, the easiest solution might be to -use the [`spacy convert`](/api/cli#convert) command-line utility. This supports -several popular formats, including the IOB format for named entity recognition, -the JSONL format produced by our annotation tool [Prodigy](https://prodi.gy), -and the [CoNLL-U](http://universaldependencies.org/docs/format.html) format used -by the [Universal Dependencies](http://universaldependencies.org/) corpus. +If your data is in one of the supported formats, the easiest solution might be +to use the [`spacy convert`](/api/cli#convert) command-line utility. This +supports several popular formats, including the IOB format for named entity +recognition, the JSONL format produced by our annotation tool +[Prodigy](https://prodi.gy), and the +[CoNLL-U](http://universaldependencies.org/docs/format.html) format used by the +[Universal Dependencies](http://universaldependencies.org/) corpus. One thing to keep in mind is that spaCy expects to train its models from **whole documents**, not just single sentences. If your corpus only contains single From 7c3a4ce933edfe4084005a65e07373e47a9d48cb Mon Sep 17 00:00:00 2001 From: Alan Chan Date: Sun, 15 Mar 2020 03:11:17 +0800 Subject: [PATCH 014/105] Missing word in api/cli doc --- website/docs/api/cli.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index e47695efb..28dc332ba 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -109,9 +109,9 @@ links) and check whether they are compatible with the currently installed version of spaCy. Should be run after upgrading spaCy via `pip install -U spacy` to ensure that all installed models are can be used with the new version. 
The command is also useful to detect out-of-sync model links resulting
-from links created in different virtual environments. It will a list of models,
-the installed versions, the latest compatible version (if out of date) and the
-commands for updating.
+from links created in different virtual environments. It will show a list of
+models and their installed versions. If any model is out of date, the latest
+compatible versions and command for updating are shown.

> #### Automated validation
>

From 2124be100da49b828ce315aa802c79448536fa2b Mon Sep 17 00:00:00 2001
From: Alan Chan
Date: Sun, 15 Mar 2020 03:14:51 +0800
Subject: [PATCH 015/105] Tweak run-on sentence

---
 website/docs/api/cli.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 28dc332ba..f067ba5a7 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -176,7 +176,7 @@ All output files generated by this command are compatible with

 ## Debug data {#debug-data new="2.2"}

-Analyze, debug and validate your training and development data, get useful
+Analyze, debug, and validate your training and development data. Get useful
 stats, and find problems like invalid entity annotations, cyclic dependencies,
 low data labels and more.

From 1ae01684cfa3d0530e687c8d4bcf3cbd44926030 Mon Sep 17 00:00:00 2001
From: Alan Chan
Date: Sun, 15 Mar 2020 03:24:51 +0800
Subject: [PATCH 016/105] Fill in contributor agreement

---
 .github/contributors/pinealan.md | 106 +++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 .github/contributors/pinealan.md

diff --git a/.github/contributors/pinealan.md b/.github/contributors/pinealan.md
new file mode 100644
index 000000000..699b405e2
--- /dev/null
+++ b/.github/contributors/pinealan.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+  * you hereby assign to us joint ownership, and to the extent that such
+  assignment is or becomes invalid, ineffective or unenforceable, you hereby
+  grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+  royalty-free, unrestricted license to exercise all rights under those
+  copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alan Chan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-15 | +| GitHub username | pinealan | +| Website (optional) | http://pinealan.xyz | From d2ffb406adf5ddcf68fdd6290c1a556517857392 Mon Sep 17 00:00:00 2001 From: Peter B <5107405+pmbaumgartner@users.noreply.github.com> Date: Tue, 17 Mar 2020 08:30:29 -0400 Subject: [PATCH 017/105] =?UTF-8?q?add=20gobbli=20to=20spacy-universe=20?= =?UTF-8?q?=F0=9F=A5=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- website/meta/universe.json | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 91361e234..9138f8819 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2000,6 +2000,44 @@ }, "category": ["pipeline"], "tags": ["syllables", "multilingual"] + }, + { + "id": "gobbli", + "title": "gobbli", + "slogan": "Deep learning for text classification doesn't have to be scary", + "description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models.", + "url": "https://github.com/rtiinternational/gobbli", + "github": "rtiinternational/gobbli", + "pip": "gobbli", + "thumb": "https://i.postimg.cc/NGpzhrdr/gobbli-lg.png", + "code_example": [ + "from gobbli.io import PredictInput, TrainInput", + "from gobbli.model.bert import BERT", + "", + "train_input = TrainInput(", + " X_train=['This is a training document.', 'This is another training document.'],", + " y_train=['0', '1'],", + " X_valid=['This is a validation sentence.', 'This is another validation sentence.'],", + " y_valid=['1', '0'],", + ")", + "", + "clf = BERT()", + "", + "# Set up classifier resources -- Docker image, etc.", + "clf.build()", + "", + "# Train model", + "train_output = clf.train(train_input)", + "", + "predict_input = PredictInput(", + " X=['Which class is this document?'],", + " labels=train_output.labels,", + " checkpoint=train_output.checkpoint,", + ")", + "", + "predict_output = clf.predict(predict_input)" + ], + "category": ["standalone"] } ], From b2b01a5c8bfd90a78f4c15e75c5cd60122389bb0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Mar 2020 19:53:31 +0100 Subject: [PATCH 018/105] Update universe.json [ci skip] --- website/meta/universe.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 91361e234..56f4f31a3 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,32 @@ { "resources": [ + { + "id": "spacy-stanza", + "title": "spacy-stanza", + "slogan": "Use the latest Stanza (StanfordNLP) research models directly in spaCy", + "description": "This package wraps the Stanza (formerly StanfordNLP) library, so you can use Stanford's models as a spaCy pipeline. 
Using this wrapper, you'll be able to use the following annotations, computed by your pretrained `stanza` model:\n\n- Statistical tokenization (reflected in the `Doc` and its tokens)\n - Lemmatization (`token.lemma` and `token.lemma_`)\n - Part-of-speech tagging (`token.tag`, `token.tag_`, `token.pos`, `token.pos_`)\n - Dependency parsing (`token.dep`, `token.dep_`, `token.head`)\n - Named entity recognition (`doc.ents`, `token.ent_type`, `token.ent_type_`, `token.ent_iob`, `token.ent_iob_`)\n - Sentence segmentation (`doc.sents`)",
+      "github": "explosion/spacy-stanza",
+      "thumb": "https://i.imgur.com/myhLjMJ.png",
+      "code_example": [
+        "import stanza",
+        "from spacy_stanza import StanzaLanguage",
+        "",
+        "snlp = stanza.Pipeline(lang=\"en\")",
+        "nlp = StanzaLanguage(snlp)",
+        "",
+        "doc = nlp(\"Barack Obama was born in Hawaii. He was elected president in 2008.\")",
+        "for token in doc:",
+        "    print(token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)",
+        "print(doc.ents)"
+      ],
+      "category": ["pipeline", "standalone", "models", "research"],
+      "author": "Explosion",
+      "author_links": {
+        "twitter": "explosion_ai",
+        "github": "explosion",
+        "website": "https://explosion.ai"
+      }
+    },
    {
      "id": "spacy-server",
      "title": "spaCy Server",

From b04057c204882da80b7475a2fe78fa0f62b929a0 Mon Sep 17 00:00:00 2001
From: Peter B <5107405+pmbaumgartner@users.noreply.github.com>
Date: Tue, 17 Mar 2020 15:03:43 -0400
Subject: [PATCH 019/105] add mentions of spaCy use

---
 website/meta/universe.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 9138f8819..c27f1b468 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2005,7 +2005,7 @@
 "id": "gobbli",
 "title": "gobbli",
 "slogan": "Deep learning for text classification doesn't have to be scary",
-"description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models.",
+"description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models. SpaCy's base text classification models, as well as models integrated from `spacy-transformers`, are available in the collection of classification models available. 
In addition, spaCy is used for data augmentation and document embeddings.",
 "url": "https://github.com/rtiinternational/gobbli",
 "github": "rtiinternational/gobbli",
 "pip": "gobbli",

From eda6eff8b10d9800199b160350e6d6f9d40521ca Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 17 Mar 2020 22:19:29 +0100
Subject: [PATCH 020/105] Update universe.json [ci skip]

---
 website/meta/universe.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index d7c458c36..a1ae388a2 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2032,7 +2032,7 @@
 "id": "gobbli",
 "title": "gobbli",
 "slogan": "Deep learning for text classification doesn't have to be scary",
-"description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models. SpaCy's base text classification models, as well as models integrated from `spacy-transformers`, are available in the collection of classification models available. In addition, spaCy is used for data augmentation and document embeddings.",
+"description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models. spaCy's base text classification models, as well as models integrated from `spacy-transformers`, are available in the collection of classification models. In addition, spaCy is used for data augmentation and document embeddings.",
 "url": "https://github.com/rtiinternational/gobbli",
 "github": "rtiinternational/gobbli",
 "pip": "gobbli",

From 80e7e1347eb59b739503184fa4d69814f6f07954 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 17 Mar 2020 22:21:34 +0100
Subject: [PATCH 021/105] Update universe.json [ci skip]

---
 website/meta/universe.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index a1ae388a2..23d052bb9 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -6,6 +6,7 @@
 "slogan": "Use the latest Stanza (StanfordNLP) research models directly in spaCy",
 "description": "This package wraps the Stanza (formerly StanfordNLP) library, so you can use Stanford's models as a spaCy pipeline. 
Using this wrapper, you'll be able to use the following annotations, computed by your pretrained `stanza` model:\n\n- Statistical tokenization (reflected in the `Doc` and its tokens)\n - Lemmatization (`token.lemma` and `token.lemma_`)\n - Part-of-speech tagging (`token.tag`, `token.tag_`, `token.pos`, `token.pos_`)\n - Dependency parsing (`token.dep`, `token.dep_`, `token.head`)\n - Named entity recognition (`doc.ents`, `token.ent_type`, `token.ent_type_`, `token.ent_iob`, `token.ent_iob_`)\n - Sentence segmentation (`doc.sents`)", "github": "explosion/spacy-stanza", + "pip": "spacy-stanza", "thumb": "https://i.imgur.com/myhLjMJ.png", "code_example": [ "import stanza", From 3b53617a69287c45284d0aedc4c7fefcaa631662 Mon Sep 17 00:00:00 2001 From: Baciccin Date: Thu, 19 Mar 2020 21:20:17 -0700 Subject: [PATCH 022/105] Add Ligurian language --- .github/contributors/Baciccin.md | 106 +++++++++++++++++++++++++ spacy/lang/lij/__init__.py | 31 ++++++++ spacy/lang/lij/examples.py | 18 +++++ spacy/lang/lij/punctuation.py | 15 ++++ spacy/lang/lij/stop_words.py | 43 ++++++++++ spacy/lang/lij/tokenizer_exceptions.py | 52 ++++++++++++ website/meta/languages.json | 6 ++ 7 files changed, 271 insertions(+) create mode 100644 .github/contributors/Baciccin.md create mode 100644 spacy/lang/lij/__init__.py create mode 100644 spacy/lang/lij/examples.py create mode 100644 spacy/lang/lij/punctuation.py create mode 100644 spacy/lang/lij/stop_words.py create mode 100644 spacy/lang/lij/tokenizer_exceptions.py diff --git a/.github/contributors/Baciccin.md b/.github/contributors/Baciccin.md new file mode 100644 index 000000000..c7a940cb5 --- /dev/null +++ b/.github/contributors/Baciccin.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Giovanni Battista Parodi | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-19 | +| GitHub username | Baciccin | +| Website (optional) | | diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py new file mode 100644 index 000000000..9b4b29798 --- /dev/null +++ b/spacy/lang/lij/__init__.py @@ -0,0 +1,31 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .punctuation import TOKENIZER_INFIXES + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class LigurianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "lij" + lex_attr_getters[NORM] = add_lookups( + Language.Defaults.lex_attr_getters[NORM], BASE_NORMS + ) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = STOP_WORDS + infixes = TOKENIZER_INFIXES + + +class Ligurian(Language): + lang = "lij" + Defaults = LigurianDefaults + + +__all__ = ["Ligurian"] diff --git a/spacy/lang/lij/examples.py b/spacy/lang/lij/examples.py new file mode 100644 index 000000000..c4034ae7e --- /dev/null +++ b/spacy/lang/lij/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.lij.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Sciusciâ e sciorbî no se peu.", + "Graçie di çetroin, che me son arrivæ.", + "Vegnime apreuvo, che ve fasso pescâ di òmmi.", + "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.", +] diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py new file mode 100644 index 000000000..4439376c8 --- /dev/null +++ b/spacy/lang/lij/punctuation.py @@ -0,0 +1,15 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..punctuation import TOKENIZER_INFIXES +from ..char_classes import ALPHA + + +ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") + + +_infixes = TOKENIZER_INFIXES + [ + r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) +] + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py new file mode 100644 index 000000000..7ab34adf1 --- /dev/null +++ b/spacy/lang/lij/stop_words.py @@ -0,0 +1,43 @@ +# coding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set( + """ +a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei + +bella belle belli bello ben + +ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse + +d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo + +é e ê ea ean emmo en ëse + +fin fiña + +gh' ghe guæei + +i î in insemme int' inta inte inti into + +l' lê lì lô + +m' ma manco me megio meno mezo mi + +na n' ne ni ninte nisciun nisciuña no + +o ò ô oua + +parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio + +quæ quand' quande quarche quella quelle quelli quello + +s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto + +tanta tante tanti tanto te ti torna tra 
tròppo tutta tutte tutti tutto + +un uña unn' unna + +za zu +""".split() +) diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py new file mode 100644 index 000000000..2aa6f8304 --- /dev/null +++ b/spacy/lang/lij/tokenizer_exceptions.py @@ -0,0 +1,52 @@ +# coding: utf8 +from __future__ import unicode_literals +from ...symbols import ORTH, LEMMA + +_exc = {} + +for raw, lemma in [ + ("a-a", "a-o"), + ("a-e", "a-o"), + ("a-o", "a-o"), + ("a-i", "a-o"), + ("co-a", "co-o"), + ("co-e", "co-o"), + ("co-i", "co-o"), + ("co-o", "co-o"), + ("da-a", "da-o"), + ("da-e", "da-o"), + ("da-i", "da-o"), + ("da-o", "da-o"), + ("pe-a", "pe-o"), + ("pe-e", "pe-o"), + ("pe-i", "pe-o"), + ("pe-o", "pe-o"), +]: + for orth in [raw, raw.capitalize()]: + _exc[orth] = [{ORTH: orth, LEMMA: lemma}] + +# Prefix + prepositions with à (e.g. "sott'a-o") + +for prep, prep_lemma in [ + ("a-a", "a-o"), + ("a-e", "a-o"), + ("a-o", "a-o"), + ("a-i", "a-o"), +]: + for prefix, prefix_lemma in [ + ("sott'", "sotta"), + ("sott’", "sotta"), + ("contr'", "contra"), + ("contr’", "contra"), + ("ch'", "che"), + ("ch’", "che"), + ("s'", "se"), + ("s’", "se"), + ]: + for prefix_orth in [prefix, prefix.capitalize()]: + _exc[prefix_orth+prep] = [ + {ORTH: prefix_orth, LEMMA: prefix_lemma}, + {ORTH: prep, LEMMA: prep_lemma}, + ] + +TOKENIZER_EXCEPTIONS = _exc diff --git a/website/meta/languages.json b/website/meta/languages.json index 8834aaddc..41c1bce7f 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -181,6 +181,12 @@ "name": "Vietnamese", "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] }, + { + "code": "lij", + "name": "Ligurian", + "example": "Sta chì a l'é unna fraxe.", + "has_examples": true + }, { "code": "xx", "name": "Multi-language", From b52e1ab677c61c20f7b6985461a78193f4c7a8bb Mon Sep 17 00:00:00 2001 From: nlptechbook <60931109+nlptechbook@users.noreply.github.com> Date: Sat, 21 Mar 2020 11:39:15 -0400 Subject: [PATCH 023/105] Update universe.json A bot powered by Clarifai Predict API and spaCy. Can be found in Telegram messenger at @pic2phrase_bot --- website/meta/universe.json | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 23d052bb9..8f8bcfecd 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1619,6 +1619,19 @@ }, "category": ["standalone", "research"] }, + { + "id": "pic2phrase_bot", + "title": "pic2phrase_bot: Photo Description Generator", + "slogan": "A bot that generates descriptions to submitted photos, in a human-like manner.", + "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy." 
+ "thumb": "https://drive.google.com/open?id=1GTrpPzc8j4mAmYCJZibYrADAp0GWcVHd", + "image": "https://drive.google.com/open?id=1t7URKJ-4uOJmZb_GbNvw-F5LLtvEoBRy", + "author": "Yuli Vasiliev", + "author_links": { + "twitter": "VasilievYuli", + }, + "category": ["standalone", "research"] + }, { "id": "gracyql", "title": "gracyql", From 2897a73559ca1663d0e258604686e0134b9095d0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 23 Mar 2020 19:23:47 +0100 Subject: [PATCH 024/105] Improve German tokenizer settings style --- spacy/lang/de/punctuation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index c376ce597..da6ab1d40 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER -from ..punctuation import _prefixes, _suffixes +from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES -_prefixes = ["``",] + list(_prefixes) +_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES _suffixes = ( ["''", "/"] From 30d862d4d891f0314cf5732aa798019f4b112369 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 23 Mar 2020 19:52:57 +0100 Subject: [PATCH 025/105] Update from macOS-10.13 to macOS-10.14 --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 054365336..147d2e903 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,7 +48,7 @@ jobs: imageName: 'vs2017-win2016' python.version: '3.6' Python36Mac: - imageName: 'macos-10.13' + imageName: 'macos-10.14' python.version: '3.6' # Don't test on 3.7 for now to speed up builds # Python37Linux: @@ -67,7 +67,7 @@ jobs: imageName: 'vs2017-win2016' python.version: '3.8' Python38Mac: - imageName: 'macos-10.13' + imageName: 'macos-10.14' python.version: '3.8' maxParallel: 4 pool: From f8b4407a29df5cbf85f5b4179c8b4c1cdd847ea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Gilli=C3=9Fen?= Date: Tue, 24 Mar 2020 10:22:12 +0100 Subject: [PATCH 026/105] Remove max_length parameter The parameter max_length is deprecated in PhraseMatcher, as stated here: https://spacy.io/api/phrasematcher#init --- examples/information_extraction/phrase_matcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py index cc6f46055..f3622bfdd 100644 --- a/examples/information_extraction/phrase_matcher.py +++ b/examples/information_extraction/phrase_matcher.py @@ -88,8 +88,8 @@ def read_text(bz2_loc, n=10000): break -def get_matches(tokenizer, phrases, texts, max_length=6): - matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length) +def get_matches(tokenizer, phrases, texts): + matcher = PhraseMatcher(tokenizer.vocab) matcher.add("Phrase", None, *phrases) for text in texts: doc = tokenizer(text) From 5d067bcc5e480ed6b446e0e80ddf97b6a42cc80e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Gilli=C3=9Fen?= Date: Tue, 24 Mar 2020 10:42:10 +0100 Subject: [PATCH 027/105] Add SCA for guerda --- .github/contributors/guerda.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/guerda.md diff --git 
a/.github/contributors/guerda.md b/.github/contributors/guerda.md new file mode 100644 index 000000000..86eedd528 --- /dev/null +++ b/.github/contributors/guerda.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. 
The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Philip Gillißen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-24 | +| GitHub username | guerda | +| Website (optional) | | From 128acb9ee143ee6888e05ec00aa78e7e44f97f09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Gilli=C3=9Fen?= Date: Tue, 24 Mar 2020 10:42:30 +0100 Subject: [PATCH 028/105] Update guerda.md --- .github/contributors/guerda.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/contributors/guerda.md b/.github/contributors/guerda.md index 86eedd528..6ac418e2e 100644 --- a/.github/contributors/guerda.md +++ b/.github/contributors/guerda.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [ ] I am signing on behalf of myself as an individual and no other person + * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions. 
From 79737adb90f286ca5b9be6e1020ea5b1855eed58 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 2 Dec 2019 13:48:27 +0100 Subject: [PATCH 029/105] Improved tokenization for UD_Norwegian-Bokmaal --- spacy/lang/nb/__init__.py | 5 ++ spacy/lang/nb/punctuation.py | 50 ++++++++++++--- spacy/lang/nb/tokenizer_exceptions.py | 92 +++++++++++++++++++-------- 3 files changed, 109 insertions(+), 38 deletions(-) diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 086761f82..e6c58b7de 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES from .syntax_iterators import SYNTAX_ITERATORS @@ -21,6 +23,9 @@ class NorwegianDefaults(Language.Defaults): Language.Defaults.lex_attr_getters[NORM], BASE_NORMS ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + prefixes = TOKENIZER_PREFIXES + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS morph_rules = MORPH_RULES tag_map = TAG_MAP diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index b49aa9838..7672809ec 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -1,16 +1,33 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER -from ..punctuation import TOKENIZER_SUFFIXES +from ..char_classes import CURRENCY, PUNCT, UNITS, LIST_CURRENCY -# Punctuation stolen from Danish + +# Punctuation adapted from Danish _quotes = CONCAT_QUOTES.replace("'", "") +_list_punct = [x for x in LIST_PUNCT if x != "#"] +_list_icons = [x for x in LIST_ICONS if x != "°"] +_list_icons = [x.replace("\\u00B0", "") for x in _list_icons] +_list_quotes = [x for x in LIST_QUOTES if x != "\\'"] + + +_prefixes = ( + ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + + _list_punct + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY + + LIST_ICONS +) + + _infixes = ( LIST_ELLIPSES - + LIST_ICONS + + _list_icons + [ r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), @@ -21,13 +38,26 @@ _infixes = ( ] ) -_suffixes = [ - suffix - for suffix in TOKENIZER_SUFFIXES - if suffix not in ["'s", "'S", "’s", "’S", r"\'"] -] -_suffixes += [r"(?<=[^sSxXzZ])\'"] +_suffixes = ( + LIST_PUNCT + + LIST_ELLIPSES + + _list_quotes + + _list_icons + + ["—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] + + [r"(?<=[^sSxXzZ])'"] +) +TOKENIZER_PREFIXES = _prefixes TOKENIZER_INFIXES = _infixes TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index 92ac09841..3f4aa79f6 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -24,57 +24,80 @@ for exc_data in [ for orth in [ - "adm.dir.", - "a.m.", - "andelsnr", + "Ap.", "Aq.", + "Ca.", + "Chr.", + "Co.", + "Co.", + "Dr.", + "F.eks.", + 
"Fr.p.", + "Frp.", + "Grl.", + "Kr.", + "Kr.F.", + "Kr.F.s", + "Mr.", + "Mrs.", + "Pb.", + "Pr.", + "Sp.", + "Sp.", + "St.", + "a.m.", + "ad.", + "adm.dir.", + "andelsnr", "b.c.", "bl.a.", "bla.", "bm.", "bnr.", "bto.", + "c.c.", "ca.", "cand.mag.", - "c.c.", "co.", "d.d.", - "dept.", "d.m.", - "dr.philos.", - "dvs.", "d.y.", - "E. coli", + "dept.", + "dr.", + "dr.med.", + "dr.philos.", + "dr.psychol.", + "dvs.", + "e.Kr.", + "e.l.", "eg.", "ekskl.", - "e.Kr.", "el.", - "e.l.", "et.", "etc.", "etg.", "ev.", "evt.", "f.", + "f.Kr.", "f.eks.", + "f.o.m.", "fhv.", "fk.", - "f.Kr.", - "f.o.m.", "foreg.", "fork.", "fv.", "fvt.", "g.", - "gt.", "gl.", "gno.", "gnr.", "grl.", + "gt.", + "h.r.adv.", "hhv.", "hoh.", "hr.", - "h.r.adv.", "ifb.", "ifm.", "iht.", @@ -83,39 +106,45 @@ for orth in [ "jf.", "jr.", "jun.", + "juris.", "kfr.", + "kgl.", "kgl.res.", "kl.", "komm.", "kr.", "kst.", + "lat.", "lø.", + "m.a.o.", + "m.fl.", + "m.m.", + "m.v.", "ma.", "mag.art.", - "m.a.o.", "md.", "mfl.", + "mht.", "mill.", "min.", - "m.m.", "mnd.", "moh.", - "Mr.", + "mrd.", "muh.", "mv.", "mva.", + "n.å.", "ndf.", "no.", "nov.", "nr.", "nto.", "nyno.", - "n.å.", "o.a.", + "o.l.", "off.", "ofl.", "okt.", - "o.l.", "on.", "op.", "org.", @@ -123,14 +152,15 @@ for orth in [ "ovf.", "p.", "p.a.", - "Pb.", + "p.g.a.", + "p.m.", + "p.t.", "pga.", "ph.d.", "pkt.", - "p.m.", "pr.", "pst.", - "p.t.", + "pt.", "red.anm.", "ref.", "res.", @@ -139,6 +169,10 @@ for orth in [ "rv.", "s.", "s.d.", + "s.k.", + "s.k.", + "s.u.", + "s.å.", "sen.", "sep.", "siviling.", @@ -148,16 +182,17 @@ for orth in [ "sr.", "sst.", "st.", - "stip.", - "stk.", "st.meld.", "st.prp.", + "stip.", + "stk.", "stud.", - "s.u.", "sv.", - "sø.", - "s.å.", "såk.", + "sø.", + "t.h.", + "t.o.m.", + "t.v.", "temp.", "ti.", "tils.", @@ -165,7 +200,6 @@ for orth in [ "tl;dr", "tlf.", "to.", - "t.o.m.", "ult.", "utg.", "v.", @@ -179,8 +213,10 @@ for orth in [ "vol.", "vs.", "vsa.", + "©NTB", "årg.", "årh.", + "§§", ]: _exc[orth] = [{ORTH: orth}] From cba2d1d972239bae86fcd5a0b3bd5e8ede04af9c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 25 Mar 2020 09:39:26 +0100 Subject: [PATCH 030/105] Disable failing abbreviation test UD_Danish-DDT has (as far as I can tell) hallucinated periods after abbreviations, so the changes are an artifact of the corpus and not due to anything meaningful about Danish tokenization. --- spacy/tests/lang/da/test_exceptions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index a522ab5e8..f98030621 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -58,7 +58,8 @@ def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm): ("Kristiansen c/o Madsen", 3), ("Sprogteknologi a/s", 2), ("De boede i A/B Bellevue", 5), - ("Rotorhastigheden er 3400 o/m.", 5), + # note: skipping due to weirdness in UD_Danish-DDT + #("Rotorhastigheden er 3400 o/m.", 5), ("Jeg købte billet t/r.", 5), ("Murerarbejdsmand m/k søges", 3), ("Netværket kører over TCP/IP", 4), From 4117a5c7056a65aafb29db137b4f52b264d915fc Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:27:42 +0100 Subject: [PATCH 031/105] Improve French tokenization (#5202) Improve French tokenization for UD_French-Sequoia. 
--- spacy/lang/fr/__init__.py | 4 ++- spacy/lang/fr/punctuation.py | 19 +++++++--- spacy/lang/fr/tokenizer_exceptions.py | 50 +++++++++++++++++++++++---- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index f56c8688a..7727aff0e 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -2,7 +2,8 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH -from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_SUFFIXES from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -27,6 +28,7 @@ class FrenchDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS + prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES token_match = TOKEN_MATCH diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index 1422b4194..e03e91361 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -1,15 +1,24 @@ # coding: utf8 from __future__ import unicode_literals -from ..punctuation import TOKENIZER_INFIXES +from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..char_classes import merge_chars -ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") -HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "") +ELISION = "' ’".replace(" ", "") +HYPHENS = r"- – — ‐ ‑".replace(" ", "") +_prefixes_elision = "d l n" +_prefixes_elision += " " + _prefixes_elision.upper() +_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous" +_hyphen_suffixes += " " + _hyphen_suffixes.upper() +_prefixes = TOKENIZER_PREFIXES + [ + r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)) +] + _suffixes = ( LIST_PUNCT + LIST_ELLIPSES @@ -17,7 +26,6 @@ _suffixes = ( + [ r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", # °C. 
-> ["°C", "."] - r"(?<=[0-9])°[FfCcKk]", # 4°C -> ["4", "°C"] r"(?<=[0-9])%", # 4% -> ["4", "%"] r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{u})".format(u=UNITS), @@ -25,14 +33,15 @@ _suffixes = ( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)), ] ) - _infixes = TOKENIZER_INFIXES + [ r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) ] +TOKENIZER_PREFIXES = _prefixes TOKENIZER_SUFFIXES = _suffixes TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 4b3b2c908..56c5544a5 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -9,7 +9,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA from ...symbols import ORTH, LEMMA, TAG # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer -# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS +#from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"] @@ -56,7 +56,28 @@ for exc_data in [ _exc[exc_data[ORTH]] = [exc_data] -for orth in ["etc."]: +for orth in [ + "après-midi", + "au-delà", + "au-dessus", + "celle-ci", + "celles-ci", + "celui-ci", + "cf.", + "ci-dessous", + "elle-même", + "en-dessous", + "etc.", + "jusque-là", + "lui-même", + "MM.", + "No.", + "peut-être", + "pp.", + "quelques-uns", + "rendez-vous", + "Vol.", +]: _exc[orth] = [{ORTH: orth}] @@ -72,7 +93,7 @@ for verb, verb_lemma in [ for pronoun in ["elle", "il", "on"]: token = "{}-t-{}".format(orth, pronoun) _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, + {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"}, {LEMMA: "t", ORTH: "-t"}, {LEMMA: pronoun, ORTH: "-" + pronoun}, ] @@ -81,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]: for orth in [verb, verb.title()]: token = "{}-ce".format(orth) _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, + {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"}, {LEMMA: "ce", ORTH: "-ce"}, ] @@ -89,12 +110,29 @@ for verb, verb_lemma in [("est", "être")]: for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]: for orth in [pre, pre.title()]: _exc["%sest-ce" % orth] = [ - {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"}, - {LEMMA: "être", ORTH: "est", TAG: "VERB"}, + {LEMMA: pre_lemma, ORTH: orth}, + {LEMMA: "être", ORTH: "est"}, {LEMMA: "ce", ORTH: "-ce"}, ] +for verb, pronoun in [("est", "il"), ("EST", "IL")]: + token = "{}-{}".format(verb, pronoun) + _exc[token] = [ + {LEMMA: "être", ORTH: verb}, + {LEMMA: pronoun, ORTH: "-" + pronoun}, + ] + + +for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]: + token = "{}'{}-{}".format(s, verb, pronoun) + _exc[token] = [ + {LEMMA: "se", ORTH: s + "'"}, + {LEMMA: "être", ORTH: verb}, + {LEMMA: pronoun, ORTH: "-" + pronoun}, + ] + + _infixes_exc = [] orig_elision = "'" orig_hyphen = "-" From 923a453449d7bc236e72ba23286845aba5ab3fe3 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:27:53 +0100 Subject: [PATCH 032/105] Modifications/updates to Portuguese tokenization (#5203) Modifications to Portuguese tokenization for UD_Portuguese-Bosque. Instead of splitting contactions as exceptions, they are kept as merged tokens. 
--- spacy/lang/pt/tokenizer_exceptions.py | 60 +++++++++------------------ 1 file changed, 19 insertions(+), 41 deletions(-) diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py index 5169780e6..c36af6771 100644 --- a/spacy/lang/pt/tokenizer_exceptions.py +++ b/spacy/lang/pt/tokenizer_exceptions.py @@ -4,69 +4,47 @@ from __future__ import unicode_literals from ...symbols import ORTH, NORM -_exc = { - "às": [{ORTH: "à", NORM: "a"}, {ORTH: "s", NORM: "as"}], - "ao": [{ORTH: "a"}, {ORTH: "o"}], - "aos": [{ORTH: "a"}, {ORTH: "os"}], - "àquele": [{ORTH: "à", NORM: "a"}, {ORTH: "quele", NORM: "aquele"}], - "àquela": [{ORTH: "à", NORM: "a"}, {ORTH: "quela", NORM: "aquela"}], - "àqueles": [{ORTH: "à", NORM: "a"}, {ORTH: "queles", NORM: "aqueles"}], - "àquelas": [{ORTH: "à", NORM: "a"}, {ORTH: "quelas", NORM: "aquelas"}], - "àquilo": [{ORTH: "à", NORM: "a"}, {ORTH: "quilo", NORM: "aquilo"}], - "aonde": [{ORTH: "a"}, {ORTH: "onde"}], -} - - -# Contractions -_per_pron = ["ele", "ela", "eles", "elas"] -_dem_pron = [ - "este", - "esta", - "estes", - "estas", - "isto", - "esse", - "essa", - "esses", - "essas", - "isso", - "aquele", - "aquela", - "aqueles", - "aquelas", - "aquilo", -] -_und_pron = ["outro", "outra", "outros", "outras"] -_adv = ["aqui", "aí", "ali", "além"] - - -for orth in _per_pron + _dem_pron + _und_pron + _adv: - _exc["d" + orth] = [{ORTH: "d", NORM: "de"}, {ORTH: orth}] - -for orth in _per_pron + _dem_pron + _und_pron: - _exc["n" + orth] = [{ORTH: "n", NORM: "em"}, {ORTH: orth}] +_exc = {} for orth in [ "Adm.", + "Art.", + "art.", + "Av.", + "av.", + "Cia.", + "dom.", "Dr.", + "dr.", "e.g.", "E.g.", "E.G.", + "e/ou", + "ed.", + "eng.", + "etc.", + "Fund.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", + "Inc.", "Jr.", + "km/h", "Ltd.", + "Mr.", "p.m.", "Ph.D.", "Rep.", "Rev.", + "S/A", "Sen.", "Sr.", + "sr.", "Sra.", + "sra.", "vs.", "tel.", "pág.", From 1a944e5976b260f8ee42a52fb016808f427ef77f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:28:02 +0100 Subject: [PATCH 033/105] Improve Italian tokenization (#5204) Improve Italian tokenization for UD_Italian-ISDT. 
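
A rough usage sketch of the intended effect (spacy.blank and the example
sentence are illustrative additions; the expected split is indicative
only):

    import spacy

    nlp = spacy.blank("it")
    # "L'art." is covered by the new tokenizer exceptions, and the new
    # prefix rule '[0-9][0-9] splits off elided two-digit years.
    print([t.text for t in nlp("L'art. 21 è del '48.")])
    # roughly: ["L'", "art.", "21", "è", "del", "'48", "."]

The widened elision infix also fires before digits and quotes, so forms
like "dell'80" now split after the apostrophe.
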
--- spacy/lang/it/__init__.py | 3 +- spacy/lang/it/punctuation.py | 36 +++++++++++++++---- spacy/lang/it/tokenizer_exceptions.py | 52 ++++++++++++++++++++++++++- 3 files changed, 83 insertions(+), 8 deletions(-) diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 90763eda5..06d146748 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -22,6 +22,7 @@ class ItalianDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP + prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py index 4fa931fde..f2c1fd84a 100644 --- a/spacy/lang/it/punctuation.py +++ b/spacy/lang/it/punctuation.py @@ -1,15 +1,39 @@ # coding: utf8 from __future__ import unicode_literals -from ..punctuation import TOKENIZER_INFIXES -from ..char_classes import ALPHA +from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES +from ..char_classes import ALPHA_LOWER, ALPHA_UPPER -ELISION = " ' ’ ".strip().replace(" ", "") +ELISION = "'’" -_infixes = TOKENIZER_INFIXES + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) -] +_prefixes = ( + [ + r"'[0-9][0-9]", + r"[0-9]+°", + ] + + TOKENIZER_PREFIXES +) + + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER), + r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION) + ] +) + +TOKENIZER_PREFIXES = _prefixes TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 62f568c5c..70dfe92bd 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -2,6 +2,56 @@ from __future__ import unicode_literals from ...symbols import ORTH, LEMMA -_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]} +_exc = { + "all'art.": [{ORTH: "all'"}, {ORTH: "art."}], + "dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}], + "dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}], + "L'art.": [{ORTH: "L'"}, {ORTH: "art."}], + "l'art.": [{ORTH: "l'"}, {ORTH: "art."}], + "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}], + "po'": [{ORTH: "po'", LEMMA: "poco"}], + "sett..": [{ORTH: "sett."}, {ORTH: "."}] +} + +for orth in [ + "..", + "....", + "al.", + "all-path", + "art.", + "Art.", + "artt.", + "att.", + "by-pass", + "c.d.", + "centro-sinistra", + "check-up", + "Civ.", + "cm.", + "Cod.", + "col.", + "Cost.", + "d.C.", + 'de"' + "distr.", + "E'", + "ecc.", + "e-mail", + "e/o", + "etc.", + "Jr.", + "n°", + "nord-est", + "pag.", + "Proc.", + "prof.", + "sett.", + "s.p.a.", + "ss.", + "St.", + "tel.", + "week-end", +]: + _exc[orth] = [{ORTH: orth}] TOKENIZER_EXCEPTIONS = _exc From 86c43e55fa3a9557e838998bc288bb4833c2d0ec Mon Sep 17 
00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:28:12 +0100 Subject: [PATCH 034/105] Improve Lithuanian tokenization (#5205) * Improve Lithuanian tokenization Modify Lithuanian tokenization to improve performance for UD_Lithuanian-ALKSNIS. * Update Lithuanian tokenizer tests --- spacy/lang/lt/__init__.py | 7 +- spacy/lang/lt/punctuation.py | 29 ++ spacy/lang/lt/tokenizer_exceptions.py | 514 +++++++++++++------------- spacy/tests/lang/lt/test_text.py | 6 +- 4 files changed, 296 insertions(+), 260 deletions(-) create mode 100644 spacy/lang/lt/punctuation.py diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index 7919a4858..1dfe932ee 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -26,7 +27,11 @@ class LithuanianDefaults(Language.Defaults): ) lex_attr_getters.update(LEX_ATTRS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES + mod_base_exceptions = {exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")} + del mod_base_exceptions["8)"] + tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP morph_rules = MORPH_RULES diff --git a/spacy/lang/lt/punctuation.py b/spacy/lang/lt/punctuation.py new file mode 100644 index 000000000..5eedc8116 --- /dev/null +++ b/spacy/lang/lt/punctuation.py @@ -0,0 +1,29 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import LIST_ICONS, LIST_ELLIPSES +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA +from ..char_classes import HYPHENS +from ..punctuation import TOKENIZER_SUFFIXES + + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + + +_suffixes = ["\."] + list(TOKENIZER_SUFFIXES) + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py index fcf807278..f8e11156d 100644 --- a/spacy/lang/lt/tokenizer_exceptions.py +++ b/spacy/lang/lt/tokenizer_exceptions.py @@ -6,262 +6,264 @@ from ...symbols import ORTH _exc = {} for orth in [ - "G.", - "J. E.", - "J. 
Em.", - "J.E.", - "J.Em.", - "K.", - "N.", - "V.", - "Vt.", - "a.", - "a.k.", - "a.s.", - "adv.", - "akad.", - "aklg.", - "akt.", - "al.", - "ang.", - "angl.", - "aps.", - "apskr.", - "apyg.", - "arbat.", - "asist.", - "asm.", - "asm.k.", - "asmv.", - "atk.", - "atsak.", - "atsisk.", - "atsisk.sąsk.", - "atv.", - "aut.", - "avd.", - "b.k.", - "baud.", - "biol.", - "bkl.", - "bot.", - "bt.", - "buv.", - "ch.", - "chem.", - "corp.", - "d.", - "dab.", - "dail.", - "dek.", - "deš.", - "dir.", - "dirig.", - "doc.", - "dol.", - "dr.", - "drp.", - "dvit.", - "dėst.", - "dš.", - "dž.", - "e.b.", - "e.bankas", - "e.p.", - "e.parašas", - "e.paštas", - "e.v.", - "e.valdžia", - "egz.", - "eil.", - "ekon.", - "el.", - "el.bankas", - "el.p.", - "el.parašas", - "el.paštas", - "el.valdžia", - "etc.", - "ež.", - "fak.", - "faks.", - "feat.", - "filol.", - "filos.", - "g.", - "gen.", - "geol.", - "gerb.", - "gim.", - "gr.", - "gv.", - "gyd.", - "gyv.", - "habil.", - "inc.", - "insp.", - "inž.", - "ir pan.", - "ir t. t.", - "isp.", - "istor.", - "it.", - "just.", - "k.", - "k. a.", - "k.a.", - "kab.", - "kand.", - "kart.", - "kat.", - "ketv.", - "kh.", - "kl.", - "kln.", - "km.", - "kn.", - "koresp.", - "kpt.", - "kr.", - "kt.", - "kub.", - "kun.", - "kv.", - "kyš.", - "l. e. p.", - "l.e.p.", - "lenk.", - "liet.", - "lot.", - "lt.", - "ltd.", - "ltn.", - "m.", - "m.e..", - "m.m.", - "mat.", - "med.", - "mgnt.", - "mgr.", - "min.", - "mjr.", - "ml.", - "mln.", - "mlrd.", - "mob.", - "mok.", - "moksl.", - "mokyt.", - "mot.", - "mr.", - "mst.", - "mstl.", - "mėn.", - "nkt.", - "no.", - "nr.", - "ntk.", - "nuotr.", - "op.", - "org.", - "orig.", - "p.", - "p.d.", - "p.m.e.", - "p.s.", - "pab.", - "pan.", - "past.", - "pav.", - "pavad.", - "per.", - "perd.", - "pirm.", - "pl.", - "plg.", - "plk.", - "pr.", - "pr.Kr.", - "pranc.", - "proc.", - "prof.", - "prom.", - "prot.", - "psl.", - "pss.", - "pvz.", - "pšt.", - "r.", - "raj.", - "red.", - "rez.", - "rež.", - "rus.", - "rš.", - "s.", - "sav.", - "saviv.", - "sek.", - "sekr.", - "sen.", - "sh.", - "sk.", - "skg.", - "skv.", - "skyr.", - "sp.", - "spec.", - "sr.", - "st.", - "str.", - "stud.", - "sąs.", - "t.", - "t. p.", - "t. y.", - "t.p.", - "t.t.", - "t.y.", - "techn.", - "tel.", - "teol.", - "th.", - "tir.", - "trit.", - "trln.", - "tšk.", - "tūks.", - "tūkst.", - "up.", - "upl.", - "v.s.", - "vad.", - "val.", - "valg.", - "ved.", - "vert.", - "vet.", - "vid.", - "virš.", - "vlsč.", - "vnt.", - "vok.", - "vs.", - "vtv.", - "vv.", - "vyr.", - "vyresn.", - "zool.", - "Įn", - "įl.", - "š.m.", - "šnek.", - "šv.", - "švč.", - "ž.ū.", - "žin.", - "žml.", - "žr.", + "n-tosios", + "?!", +# "G.", +# "J. E.", +# "J. 
Em.", +# "J.E.", +# "J.Em.", +# "K.", +# "N.", +# "V.", +# "Vt.", +# "a.", +# "a.k.", +# "a.s.", +# "adv.", +# "akad.", +# "aklg.", +# "akt.", +# "al.", +# "ang.", +# "angl.", +# "aps.", +# "apskr.", +# "apyg.", +# "arbat.", +# "asist.", +# "asm.", +# "asm.k.", +# "asmv.", +# "atk.", +# "atsak.", +# "atsisk.", +# "atsisk.sąsk.", +# "atv.", +# "aut.", +# "avd.", +# "b.k.", +# "baud.", +# "biol.", +# "bkl.", +# "bot.", +# "bt.", +# "buv.", +# "ch.", +# "chem.", +# "corp.", +# "d.", +# "dab.", +# "dail.", +# "dek.", +# "deš.", +# "dir.", +# "dirig.", +# "doc.", +# "dol.", +# "dr.", +# "drp.", +# "dvit.", +# "dėst.", +# "dš.", +# "dž.", +# "e.b.", +# "e.bankas", +# "e.p.", +# "e.parašas", +# "e.paštas", +# "e.v.", +# "e.valdžia", +# "egz.", +# "eil.", +# "ekon.", +# "el.", +# "el.bankas", +# "el.p.", +# "el.parašas", +# "el.paštas", +# "el.valdžia", +# "etc.", +# "ež.", +# "fak.", +# "faks.", +# "feat.", +# "filol.", +# "filos.", +# "g.", +# "gen.", +# "geol.", +# "gerb.", +# "gim.", +# "gr.", +# "gv.", +# "gyd.", +# "gyv.", +# "habil.", +# "inc.", +# "insp.", +# "inž.", +# "ir pan.", +# "ir t. t.", +# "isp.", +# "istor.", +# "it.", +# "just.", +# "k.", +# "k. a.", +# "k.a.", +# "kab.", +# "kand.", +# "kart.", +# "kat.", +# "ketv.", +# "kh.", +# "kl.", +# "kln.", +# "km.", +# "kn.", +# "koresp.", +# "kpt.", +# "kr.", +# "kt.", +# "kub.", +# "kun.", +# "kv.", +# "kyš.", +# "l. e. p.", +# "l.e.p.", +# "lenk.", +# "liet.", +# "lot.", +# "lt.", +# "ltd.", +# "ltn.", +# "m.", +# "m.e..", +# "m.m.", +# "mat.", +# "med.", +# "mgnt.", +# "mgr.", +# "min.", +# "mjr.", +# "ml.", +# "mln.", +# "mlrd.", +# "mob.", +# "mok.", +# "moksl.", +# "mokyt.", +# "mot.", +# "mr.", +# "mst.", +# "mstl.", +# "mėn.", +# "nkt.", +# "no.", +# "nr.", +# "ntk.", +# "nuotr.", +# "op.", +# "org.", +# "orig.", +# "p.", +# "p.d.", +# "p.m.e.", +# "p.s.", +# "pab.", +# "pan.", +# "past.", +# "pav.", +# "pavad.", +# "per.", +# "perd.", +# "pirm.", +# "pl.", +# "plg.", +# "plk.", +# "pr.", +# "pr.Kr.", +# "pranc.", +# "proc.", +# "prof.", +# "prom.", +# "prot.", +# "psl.", +# "pss.", +# "pvz.", +# "pšt.", +# "r.", +# "raj.", +# "red.", +# "rez.", +# "rež.", +# "rus.", +# "rš.", +# "s.", +# "sav.", +# "saviv.", +# "sek.", +# "sekr.", +# "sen.", +# "sh.", +# "sk.", +# "skg.", +# "skv.", +# "skyr.", +# "sp.", +# "spec.", +# "sr.", +# "st.", +# "str.", +# "stud.", +# "sąs.", +# "t.", +# "t. p.", +# "t. y.", +# "t.p.", +# "t.t.", +# "t.y.", +# "techn.", +# "tel.", +# "teol.", +# "th.", +# "tir.", +# "trit.", +# "trln.", +# "tšk.", +# "tūks.", +# "tūkst.", +# "up.", +# "upl.", +# "v.s.", +# "vad.", +# "val.", +# "valg.", +# "ved.", +# "vert.", +# "vet.", +# "vid.", +# "virš.", +# "vlsč.", +# "vnt.", +# "vok.", +# "vs.", +# "vtv.", +# "vv.", +# "vyr.", +# "vyresn.", +# "zool.", +# "Įn", +# "įl.", +# "š.m.", +# "šnek.", +# "šv.", +# "švč.", +# "ž.ū.", +# "žin.", +# "žml.", +# "žr.", ]: _exc[orth] = [{ORTH: orth}] diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py index cac32aa4d..bb9c75383 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -15,11 +15,11 @@ def test_lt_tokenizer_handles_long_text(lt_tokenizer): [ ( "177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", - 15, + 17, ), ( "ISM universiteto doc. dr. 
Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", - 16, + 18, ), ], ) @@ -31,7 +31,7 @@ def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length): @pytest.mark.parametrize("text", ["km.", "pvz.", "biol."]) def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text): tokens = lt_tokenizer(text) - assert len(tokens) == 1 + assert len(tokens) == 2 @pytest.mark.parametrize( From b71dd44dbcfb6f4aa78034b4419c793972c77e62 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 25 Mar 2020 11:28:19 +0100 Subject: [PATCH 035/105] Improved Romanian tokenization for UD RRT (#5206) Modifications to Romanian tokenization to improve tokenization for UD_Romanian-RRT. From 828acffc12d6e57f48c345196e79ffa1fb917419 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Mar 2020 12:28:12 +0100 Subject: [PATCH 036/105] Tidy up and auto-format --- spacy/cli/train.py | 28 +- spacy/displacy/__init__.py | 11 +- spacy/displacy/render.py | 21 +- spacy/errors.py | 1 - spacy/lang/de/punctuation.py | 2 +- spacy/lang/eu/examples.py | 2 +- spacy/lang/eu/lex_attrs.py | 1 - spacy/lang/eu/stop_words.py | 2 +- spacy/lang/fr/punctuation.py | 8 +- spacy/lang/fr/tokenizer_exceptions.py | 8 +- spacy/lang/it/punctuation.py | 13 +- spacy/lang/it/tokenizer_exceptions.py | 4 +- spacy/lang/lij/stop_words.py | 2 +- spacy/lang/lij/tokenizer_exceptions.py | 2 +- spacy/lang/lt/__init__.py | 4 +- spacy/lang/lt/tokenizer_exceptions.py | 512 +-- spacy/lang/nb/punctuation.py | 1 - spacy/lang/pt/tokenizer_exceptions.py | 2 +- spacy/lang/sk/tag_map.py | 2918 ++++++++--------- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/language.py | 11 +- spacy/pipeline/entityruler.py | 6 +- spacy/tests/doc/test_array.py | 2 - spacy/tests/lang/da/test_exceptions.py | 2 +- spacy/tests/lang/eu/test_text.py | 8 +- spacy/tests/lang/hu/test_tokenizer.py | 16 +- spacy/tests/matcher/test_matcher_api.py | 3 +- spacy/tests/pipeline/test_entity_ruler.py | 7 +- spacy/tests/regression/test_issue4725.py | 1 - spacy/tests/regression/test_issue4849.py | 13 +- .../serialize/test_serialize_tokenizer.py | 2 +- spacy/tests/util.py | 6 +- 32 files changed, 1828 insertions(+), 1793 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 59b0f2225..6408a6024 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -225,7 +225,9 @@ def train( exits=1, ) msg.text("Extending component from base model '{}'".format(pipe)) - disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline]) + disabled_pipes = nlp.disable_pipes( + [p for p in nlp.pipe_names if p not in pipeline] + ) else: msg.text("Starting with blank model '{}'".format(lang)) lang_cls = util.get_lang_class(lang) @@ -415,10 +417,10 @@ def train( losses=losses, ) except ValueError as e: - msg.warn("Error during training") + err = "Error during training" if init_tok2vec: - msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?") - msg.fail("Original error message: {}".format(e), exits=1) + err += " Did you provide the same parameters during 'train' as during 'pretrain'?" + msg.fail(err, "Original error message: {}".format(e), exits=1) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. @@ -546,7 +548,10 @@ def train( ) break except Exception as e: - msg.warn("Aborting and saving the final best model. Encountered exception: {}".format(e)) + msg.warn( + "Aborting and saving the final best model. 
" + "Encountered exception: {}".format(e) + ) finally: best_pipes = nlp.pipe_names if disabled_pipes: @@ -563,13 +568,20 @@ def train( final_meta["speed"].setdefault("gpu", None) # combine cpu and gpu speeds with the base model speeds if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: - speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]]) + speed = _get_total_speed( + [final_meta["speed"]["cpu"], meta["speed"]["cpu"]] + ) final_meta["speed"]["cpu"] = speed if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: - speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]]) + speed = _get_total_speed( + [final_meta["speed"]["gpu"], meta["speed"]["gpu"]] + ) final_meta["speed"]["gpu"] = speed # if there were no speeds to update, overwrite with meta - if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None: + if ( + final_meta["speed"]["cpu"] is None + and final_meta["speed"]["gpu"] is None + ): final_meta["speed"].update(meta["speed"]) # note: beam speeds are not combined with the base model if has_beam_widths: diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index e13b0403b..922d80e57 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -146,9 +146,14 @@ def parse_deps(orig_doc, options={}): retokenizer.merge(span, attrs=attrs) fine_grained = options.get("fine_grained") add_lemma = options.get("add_lemma") - words = [{"text": w.text, - "tag": w.tag_ if fine_grained else w.pos_, - "lemma": w.lemma_ if add_lemma else None} for w in doc] + words = [ + { + "text": w.text, + "tag": w.tag_ if fine_grained else w.pos_, + "lemma": w.lemma_ if add_lemma else None, + } + for w in doc + ] arcs = [] for word in doc: diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 68df324d6..57d67c96b 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -3,7 +3,13 @@ from __future__ import unicode_literals import uuid -from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS +from .templates import ( + TPL_DEP_SVG, + TPL_DEP_WORDS, + TPL_DEP_WORDS_LEMMA, + TPL_DEP_ARCS, + TPL_ENTS, +) from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE from ..util import minify_html, escape_html, registry from ..errors import Errors @@ -83,7 +89,10 @@ class DependencyRenderer(object): self.width = self.offset_x + len(words) * self.distance self.height = self.offset_y + 3 * self.word_spacing self.id = render_id - words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)] + words = [ + self.render_word(w["text"], w["tag"], w.get("lemma", None), i) + for i, w in enumerate(words) + ] arcs = [ self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i) for i, a in enumerate(arcs) @@ -101,7 +110,9 @@ class DependencyRenderer(object): lang=self.lang, ) - def render_word(self, text, tag, lemma, i,): + def render_word( + self, text, tag, lemma, i, + ): """Render individual word. text (unicode): Word text. 
@@ -115,7 +126,9 @@ class DependencyRenderer(object): x = self.width - x html_text = escape_html(text) if lemma is not None: - return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y) + return TPL_DEP_WORDS_LEMMA.format( + text=html_text, tag=tag, lemma=lemma, x=x, y=y + ) return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y) def render_arrow(self, label, start, end, direction, i): diff --git a/spacy/errors.py b/spacy/errors.py index b43b8487f..c751ad65a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -112,7 +112,6 @@ class Warnings(object): "in problems with the vocab further on in the pipeline.") - @add_codes class Errors(object): E001 = ("No component '{name}' found in pipeline. Available names: {opts}") diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index da6ab1d40..93454ffff 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES -from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT +from ..char_classes import CURRENCY, UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py index f2d325d78..463494abd 100644 --- a/spacy/lang/eu/examples.py +++ b/spacy/lang/eu/examples.py @@ -10,5 +10,5 @@ Example sentences to test spaCy and its language models. sentences = [ "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du", - "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira" + "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira", ] diff --git a/spacy/lang/eu/lex_attrs.py b/spacy/lang/eu/lex_attrs.py index c11e913db..19b75c111 100644 --- a/spacy/lang/eu/lex_attrs.py +++ b/spacy/lang/eu/lex_attrs.py @@ -59,7 +59,6 @@ behin """.split() - def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py index 208238961..dda11a7fd 100644 --- a/spacy/lang/eu/stop_words.py +++ b/spacy/lang/eu/stop_words.py @@ -5,7 +5,7 @@ from __future__ import unicode_literals # https://www.ranks.nl/stopwords/basque # https://www.mustgo.com/worldlanguages/basque/ STOP_WORDS = set( -""" + """ al anitz arabera diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index e03e91361..7d50c4a9e 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -16,7 +16,9 @@ _hyphen_suffixes += " " + _hyphen_suffixes.upper() _prefixes = TOKENIZER_PREFIXES + [ - r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)) + r"(?:({pe})[{el}])(?=[{a}])".format( + a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision) + ) ] _suffixes = ( @@ -33,7 +35,9 @@ _suffixes = ( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), - r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)), + r"(?<=[{a}])[{h}]({hs})".format( + a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes) + ), ] ) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 56c5544a5..dfcb2756e 100644 
--- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -6,10 +6,10 @@ import re from .punctuation import ELISION, HYPHENS from ..tokenizer_exceptions import URL_PATTERN from ..char_classes import ALPHA_LOWER, ALPHA -from ...symbols import ORTH, LEMMA, TAG +from ...symbols import ORTH, LEMMA # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer -#from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS +# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"] @@ -93,7 +93,7 @@ for verb, verb_lemma in [ for pronoun in ["elle", "il", "on"]: token = "{}-t-{}".format(orth, pronoun) _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"}, + {LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"}, {LEMMA: "t", ORTH: "-t"}, {LEMMA: pronoun, ORTH: "-" + pronoun}, ] @@ -102,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]: for orth in [verb, verb.title()]: token = "{}-ce".format(orth) _exc[token] = [ - {LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"}, + {LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"}, {LEMMA: "ce", ORTH: "-ce"}, ] diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py index f2c1fd84a..1d641f144 100644 --- a/spacy/lang/it/punctuation.py +++ b/spacy/lang/it/punctuation.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES from ..char_classes import ALPHA_LOWER, ALPHA_UPPER @@ -10,14 +10,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA_UPPER ELISION = "'’" -_prefixes = ( - [ - r"'[0-9][0-9]", - r"[0-9]+°", - - ] - + TOKENIZER_PREFIXES -) +_prefixes = [r"'[0-9][0-9]", r"[0-9]+°"] + BASE_TOKENIZER_PREFIXES _infixes = ( @@ -31,7 +24,7 @@ _infixes = ( r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER), r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA), - r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION) + r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION), ] ) diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 70dfe92bd..70519ba6a 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -10,7 +10,7 @@ _exc = { "l'art.": [{ORTH: "l'"}, {ORTH: "art."}], "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}], "po'": [{ORTH: "po'", LEMMA: "poco"}], - "sett..": [{ORTH: "sett."}, {ORTH: "."}] + "sett..": [{ORTH: "sett."}, {ORTH: "."}], } for orth in [ @@ -32,7 +32,7 @@ for orth in [ "col.", "Cost.", "d.C.", - 'de"' + 'de"', "distr.", "E'", "ecc.", diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py index 7ab34adf1..ffd53370d 100644 --- a/spacy/lang/lij/stop_words.py +++ b/spacy/lang/lij/stop_words.py @@ -8,7 +8,7 @@ a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri a bella belle belli bello ben -ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse +ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py index 2aa6f8304..2109add62 100644 --- 
a/spacy/lang/lij/tokenizer_exceptions.py +++ b/spacy/lang/lij/tokenizer_exceptions.py @@ -44,7 +44,7 @@ for prep, prep_lemma in [ ("s’", "se"), ]: for prefix_orth in [prefix, prefix.capitalize()]: - _exc[prefix_orth+prep] = [ + _exc[prefix_orth + prep] = [ {ORTH: prefix_orth, LEMMA: prefix_lemma}, {ORTH: prep, LEMMA: prep_lemma}, ] diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index 1dfe932ee..ce2c8d6a4 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -29,7 +29,9 @@ class LithuanianDefaults(Language.Defaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - mod_base_exceptions = {exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")} + mod_base_exceptions = { + exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") + } del mod_base_exceptions["8)"] tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py index f8e11156d..4287b26dd 100644 --- a/spacy/lang/lt/tokenizer_exceptions.py +++ b/spacy/lang/lt/tokenizer_exceptions.py @@ -8,262 +8,262 @@ _exc = {} for orth in [ "n-tosios", "?!", -# "G.", -# "J. E.", -# "J. Em.", -# "J.E.", -# "J.Em.", -# "K.", -# "N.", -# "V.", -# "Vt.", -# "a.", -# "a.k.", -# "a.s.", -# "adv.", -# "akad.", -# "aklg.", -# "akt.", -# "al.", -# "ang.", -# "angl.", -# "aps.", -# "apskr.", -# "apyg.", -# "arbat.", -# "asist.", -# "asm.", -# "asm.k.", -# "asmv.", -# "atk.", -# "atsak.", -# "atsisk.", -# "atsisk.sąsk.", -# "atv.", -# "aut.", -# "avd.", -# "b.k.", -# "baud.", -# "biol.", -# "bkl.", -# "bot.", -# "bt.", -# "buv.", -# "ch.", -# "chem.", -# "corp.", -# "d.", -# "dab.", -# "dail.", -# "dek.", -# "deš.", -# "dir.", -# "dirig.", -# "doc.", -# "dol.", -# "dr.", -# "drp.", -# "dvit.", -# "dėst.", -# "dš.", -# "dž.", -# "e.b.", -# "e.bankas", -# "e.p.", -# "e.parašas", -# "e.paštas", -# "e.v.", -# "e.valdžia", -# "egz.", -# "eil.", -# "ekon.", -# "el.", -# "el.bankas", -# "el.p.", -# "el.parašas", -# "el.paštas", -# "el.valdžia", -# "etc.", -# "ež.", -# "fak.", -# "faks.", -# "feat.", -# "filol.", -# "filos.", -# "g.", -# "gen.", -# "geol.", -# "gerb.", -# "gim.", -# "gr.", -# "gv.", -# "gyd.", -# "gyv.", -# "habil.", -# "inc.", -# "insp.", -# "inž.", -# "ir pan.", -# "ir t. t.", -# "isp.", -# "istor.", -# "it.", -# "just.", -# "k.", -# "k. a.", -# "k.a.", -# "kab.", -# "kand.", -# "kart.", -# "kat.", -# "ketv.", -# "kh.", -# "kl.", -# "kln.", -# "km.", -# "kn.", -# "koresp.", -# "kpt.", -# "kr.", -# "kt.", -# "kub.", -# "kun.", -# "kv.", -# "kyš.", -# "l. e. 
p.", -# "l.e.p.", -# "lenk.", -# "liet.", -# "lot.", -# "lt.", -# "ltd.", -# "ltn.", -# "m.", -# "m.e..", -# "m.m.", -# "mat.", -# "med.", -# "mgnt.", -# "mgr.", -# "min.", -# "mjr.", -# "ml.", -# "mln.", -# "mlrd.", -# "mob.", -# "mok.", -# "moksl.", -# "mokyt.", -# "mot.", -# "mr.", -# "mst.", -# "mstl.", -# "mėn.", -# "nkt.", -# "no.", -# "nr.", -# "ntk.", -# "nuotr.", -# "op.", -# "org.", -# "orig.", -# "p.", -# "p.d.", -# "p.m.e.", -# "p.s.", -# "pab.", -# "pan.", -# "past.", -# "pav.", -# "pavad.", -# "per.", -# "perd.", -# "pirm.", -# "pl.", -# "plg.", -# "plk.", -# "pr.", -# "pr.Kr.", -# "pranc.", -# "proc.", -# "prof.", -# "prom.", -# "prot.", -# "psl.", -# "pss.", -# "pvz.", -# "pšt.", -# "r.", -# "raj.", -# "red.", -# "rez.", -# "rež.", -# "rus.", -# "rš.", -# "s.", -# "sav.", -# "saviv.", -# "sek.", -# "sekr.", -# "sen.", -# "sh.", -# "sk.", -# "skg.", -# "skv.", -# "skyr.", -# "sp.", -# "spec.", -# "sr.", -# "st.", -# "str.", -# "stud.", -# "sąs.", -# "t.", -# "t. p.", -# "t. y.", -# "t.p.", -# "t.t.", -# "t.y.", -# "techn.", -# "tel.", -# "teol.", -# "th.", -# "tir.", -# "trit.", -# "trln.", -# "tšk.", -# "tūks.", -# "tūkst.", -# "up.", -# "upl.", -# "v.s.", -# "vad.", -# "val.", -# "valg.", -# "ved.", -# "vert.", -# "vet.", -# "vid.", -# "virš.", -# "vlsč.", -# "vnt.", -# "vok.", -# "vs.", -# "vtv.", -# "vv.", -# "vyr.", -# "vyresn.", -# "zool.", -# "Įn", -# "įl.", -# "š.m.", -# "šnek.", -# "šv.", -# "švč.", -# "ž.ū.", -# "žin.", -# "žml.", -# "žr.", + # "G.", + # "J. E.", + # "J. Em.", + # "J.E.", + # "J.Em.", + # "K.", + # "N.", + # "V.", + # "Vt.", + # "a.", + # "a.k.", + # "a.s.", + # "adv.", + # "akad.", + # "aklg.", + # "akt.", + # "al.", + # "ang.", + # "angl.", + # "aps.", + # "apskr.", + # "apyg.", + # "arbat.", + # "asist.", + # "asm.", + # "asm.k.", + # "asmv.", + # "atk.", + # "atsak.", + # "atsisk.", + # "atsisk.sąsk.", + # "atv.", + # "aut.", + # "avd.", + # "b.k.", + # "baud.", + # "biol.", + # "bkl.", + # "bot.", + # "bt.", + # "buv.", + # "ch.", + # "chem.", + # "corp.", + # "d.", + # "dab.", + # "dail.", + # "dek.", + # "deš.", + # "dir.", + # "dirig.", + # "doc.", + # "dol.", + # "dr.", + # "drp.", + # "dvit.", + # "dėst.", + # "dš.", + # "dž.", + # "e.b.", + # "e.bankas", + # "e.p.", + # "e.parašas", + # "e.paštas", + # "e.v.", + # "e.valdžia", + # "egz.", + # "eil.", + # "ekon.", + # "el.", + # "el.bankas", + # "el.p.", + # "el.parašas", + # "el.paštas", + # "el.valdžia", + # "etc.", + # "ež.", + # "fak.", + # "faks.", + # "feat.", + # "filol.", + # "filos.", + # "g.", + # "gen.", + # "geol.", + # "gerb.", + # "gim.", + # "gr.", + # "gv.", + # "gyd.", + # "gyv.", + # "habil.", + # "inc.", + # "insp.", + # "inž.", + # "ir pan.", + # "ir t. t.", + # "isp.", + # "istor.", + # "it.", + # "just.", + # "k.", + # "k. a.", + # "k.a.", + # "kab.", + # "kand.", + # "kart.", + # "kat.", + # "ketv.", + # "kh.", + # "kl.", + # "kln.", + # "km.", + # "kn.", + # "koresp.", + # "kpt.", + # "kr.", + # "kt.", + # "kub.", + # "kun.", + # "kv.", + # "kyš.", + # "l. e. 
p.", + # "l.e.p.", + # "lenk.", + # "liet.", + # "lot.", + # "lt.", + # "ltd.", + # "ltn.", + # "m.", + # "m.e..", + # "m.m.", + # "mat.", + # "med.", + # "mgnt.", + # "mgr.", + # "min.", + # "mjr.", + # "ml.", + # "mln.", + # "mlrd.", + # "mob.", + # "mok.", + # "moksl.", + # "mokyt.", + # "mot.", + # "mr.", + # "mst.", + # "mstl.", + # "mėn.", + # "nkt.", + # "no.", + # "nr.", + # "ntk.", + # "nuotr.", + # "op.", + # "org.", + # "orig.", + # "p.", + # "p.d.", + # "p.m.e.", + # "p.s.", + # "pab.", + # "pan.", + # "past.", + # "pav.", + # "pavad.", + # "per.", + # "perd.", + # "pirm.", + # "pl.", + # "plg.", + # "plk.", + # "pr.", + # "pr.Kr.", + # "pranc.", + # "proc.", + # "prof.", + # "prom.", + # "prot.", + # "psl.", + # "pss.", + # "pvz.", + # "pšt.", + # "r.", + # "raj.", + # "red.", + # "rez.", + # "rež.", + # "rus.", + # "rš.", + # "s.", + # "sav.", + # "saviv.", + # "sek.", + # "sekr.", + # "sen.", + # "sh.", + # "sk.", + # "skg.", + # "skv.", + # "skyr.", + # "sp.", + # "spec.", + # "sr.", + # "st.", + # "str.", + # "stud.", + # "sąs.", + # "t.", + # "t. p.", + # "t. y.", + # "t.p.", + # "t.t.", + # "t.y.", + # "techn.", + # "tel.", + # "teol.", + # "th.", + # "tir.", + # "trit.", + # "trln.", + # "tšk.", + # "tūks.", + # "tūkst.", + # "up.", + # "upl.", + # "v.s.", + # "vad.", + # "val.", + # "valg.", + # "ved.", + # "vert.", + # "vet.", + # "vid.", + # "virš.", + # "vlsč.", + # "vnt.", + # "vok.", + # "vs.", + # "vtv.", + # "vv.", + # "vyr.", + # "vyresn.", + # "zool.", + # "Įn", + # "įl.", + # "š.m.", + # "šnek.", + # "šv.", + # "švč.", + # "ž.ū.", + # "žin.", + # "žml.", + # "žr.", ]: _exc[orth] = [{ORTH: orth}] diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index 7672809ec..4c10b5a68 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -24,7 +24,6 @@ _prefixes = ( ) - _infixes = ( LIST_ELLIPSES + _list_icons diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py index c36af6771..981c0624b 100644 --- a/spacy/lang/pt/tokenizer_exceptions.py +++ b/spacy/lang/pt/tokenizer_exceptions.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import ORTH, NORM +from ...symbols import ORTH _exc = {} diff --git a/spacy/lang/sk/tag_map.py b/spacy/lang/sk/tag_map.py index 015c8cba3..28b36d3c1 100644 --- a/spacy/lang/sk/tag_map.py +++ b/spacy/lang/sk/tag_map.py @@ -1,1467 +1,1467 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import POS, AUX, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON +from ...symbols import POS, AUX, ADJ, CCONJ, NUM, ADV, ADP, X, VERB +from ...symbols import NOUN, PART, INTJ, PRON # Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html # fmt: off TAG_MAP = { - "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3y": {POS: ADJ, "morph": 
"Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3x": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAns1x": {POS: ADJ, "morph": 
"Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip4x": {POS: ADJ, 
"morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns3x": {POS: ADJ, "morph": 
"Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AUfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis3x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns2x": {POS: ADJ, "morph": 
"Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "Dx": {POS: ADV, "morph": "Degree=Pos"}, - "Dy": {POS: ADV, "morph": "Degree=Cmp"}, - "Dz": {POS: ADV, "morph": "Degree=Sup"}, - "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"}, - "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"}, - "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"}, - "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"}, - "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"}, - "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"}, - "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"}, - "Ev3": {POS: ADP, "morph": "AdpType=Voc|Case=Dat"}, - "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"}, - "Ev6": {POS: ADP, "morph": "AdpType=Voc|Case=Loc"}, - "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"}, - "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs1y": 
{POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkmp1x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7z": {POS: VERB, "morph": 
"Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5x": {POS: 
VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5x": {POS: 
VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - 
"Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4x": {POS: VERB, "morph": 
"Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "J": {POS: INTJ, "morph": "_"}, - "NAfp1": {POS: NUM, "morph": 
"Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms6": {POS: NUM, 
"morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "ND": {POS: NUM, "morph": "MorphPos=Adv"}, - "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis5": {POS: NUM, "morph": 
"Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NNfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip4": 
{POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NSfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis2": {POS: NUM, "morph": 
"Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp6": {POS: NUM, "morph": 
"Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, - "O": {POS: CCONJ, "morph": "_"}, - "OY": {POS: CCONJ, "morph": "Mood=Cnd"}, - "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis7": {POS: PRON, "morph": 
"Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PD": {POS: PRON, "morph": "MorphPos=Adv|PronType=Prs"}, - "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp7": 
{POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms2g": {POS: PRON, "morph": 
"AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PSfp1": {POS: PRON, "morph": 
"Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip3": {POS: PRON, "morph": 
"Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns2": {POS: 
PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "Q": {POS: X, "morph": "Hyph=Yes"}, - "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"}, - "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp5": {POS: NOUN, "morph": 
"Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp5": {POS: NOUN, 
"morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSnp1": {POS: NOUN, "morph": 
"Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis7": {POS: NOUN, "morph": 
"Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, - "T": {POS: PART, "morph": "_"}, - "TY": {POS: PART, "morph": "Mood=Cnd"}, - "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesb-": {POS: VERB, 
"morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"}, - "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"}, - "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"}, - "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"}, - "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"}, - "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"}, - "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"}, - "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"}, - "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"}, - "VIe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Inf"}, - "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"}, - "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"}, - "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdpc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsa-": {POS: VERB, "morph": 
"Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsc+": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpah+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbn+": {POS: VERB, "morph": 
"Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepbh-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesaf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesan+": {POS: VERB, "morph": 
"Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcn+": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMdpb-": {POS: VERB, "morph": 
"Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "W": {POS: X, "morph": "Abbr=Yes"}, - "Y": {POS: AUX, "morph": "Mood=Cnd"}, + "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7y": {POS: ADJ, "morph": 
"Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5y": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2y": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6y": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4z": {POS: ADJ, "morph": 
"Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis6x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AUfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp5x": {POS: ADJ, "morph": 
"Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp4x": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "Dx": {POS: ADV, "morph": "Degree=Pos"}, + "Dy": {POS: ADV, "morph": "Degree=Cmp"}, + "Dz": {POS: ADV, "morph": "Degree=Sup"}, + "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"}, + "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"}, + "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"}, + "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"}, + "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"}, + "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"}, + "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"}, + "Ev3": {POS: ADP, 
"morph": "AdpType=Voc|Case=Dat"}, + "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"}, + "Ev6": {POS: ADP, "morph": "AdpType=Voc|Case=Loc"}, + "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"}, + "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4z": {POS: VERB, 
"morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkis1x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4x": {POS: VERB, "morph": 
"Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs1y": {POS: 
VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtmp1x": {POS: 
VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + 
"Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + 
"Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "J": {POS: INTJ, "morph": "_"}, + "NAfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAip1": {POS: NUM, "morph": 
"Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns5": {POS: NUM, "morph": 
"Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "ND": {POS: NUM, "morph": "MorphPos=Adv"}, + "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms3": 
{POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NNfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"}, + 
"NNnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NSfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUip1": {POS: NUM, "morph": 
"Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, + "O": {POS: CCONJ, "morph": "_"}, + "OY": {POS: CCONJ, "morph": "Mood=Cnd"}, + "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp5": {POS: PRON, 
"morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms4": {POS: PRON, "morph": 
"Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PD": {POS: PRON, "morph": "MorphPos=Adv|PronType=Prs"}, + "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip5": {POS: 
PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, 
+ "PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PSfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs7": {POS: PRON, "morph": 
"Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis7": {POS: PRON, "morph": 
"Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "Q": {POS: X, "morph": "Hyph=Yes"}, + "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"}, + "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp7": {POS: NOUN, "morph": 
"Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp4": {POS: NOUN, "morph": 
"Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip5": {POS: 
NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp2": {POS: NOUN, 
"morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms6": {POS: NOUN, "morph": 
"Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, + "T": {POS: PART, "morph": "_"}, + "TY": {POS: PART, "morph": "Mood=Cnd"}, + "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsa-": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"}, + "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"}, + "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"}, + "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"}, + "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"}, + "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"}, + "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"}, + "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"}, + "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"}, + "VIe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Inf"}, + "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"}, + "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"}, + "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdpc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKepc-": {POS: VERB, "morph": 
"Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpah+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcn-": 
{POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepbh-": {POS: VERB, "morph": 
"Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesaf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesan+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesci-": {POS: VERB, "morph": 
"Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbi-": 
{POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMjsb-": {POS: VERB, 
"morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "W": {POS: X, "morph": "Abbr=Yes"}, + "Y": {POS: AUX, "morph": "Mood=Cnd"}, } diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 2c0fc9cf7..385afb8bd 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -37,7 +37,7 @@ URL_PATTERN = ( r"|" # host & domain names # mods: match is case-sensitive, so include [A-Z] - "(?:" + "(?:" # noqa "(?:" "[A-Za-z0-9\u00a1-\uffff]" "[A-Za-z0-9\u00a1-\uffff_-]{0,62}" diff --git a/spacy/language.py b/spacy/language.py index f0928b1f9..56619080d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -612,7 +612,7 @@ class Language(object): link_vectors_to_models(self.vocab) if self.vocab.vectors.data.shape[1]: cfg["pretrained_vectors"] = self.vocab.vectors.name - cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] + cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1] if sgd is None: sgd = create_default_optimizer(Model.ops) self._optimizer = sgd @@ -857,7 +857,14 @@ class Language(object): procs = [ mp.Process( target=_apply_pipes, - args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS), + args=( + self.make_doc, + pipes, + rch, + sch, + Underscore.get_state(), + load_nlp.VECTORS, + ), ) for rch, sch in zip(texts_q, bytedocs_send_ch) ] diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index c3ef429e9..1786dda87 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -222,11 +222,9 @@ class EntityRuler(object): for label, pattern, ent_id in zip( phrase_pattern_labels, self.nlp.pipe(phrase_pattern_texts), - phrase_pattern_ids + phrase_pattern_ids, ): - phrase_pattern = { - "label": label, "pattern": pattern, "id": ent_id - } + phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id} if ent_id: phrase_pattern["id"] = ent_id phrase_patterns.append(phrase_pattern) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 1c0c79f6e..09a6f9c4b 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -71,9 +71,7 @@ def test_doc_array_to_from_string_attrs(en_vocab, attrs): def test_doc_array_idx(en_vocab): """Test that Doc.to_array can retrieve token start indices""" words = ["An", "example", "sentence"] - doc = Doc(en_vocab, words=words) offsets = Doc(en_vocab, words=words).to_array("IDX") - assert offsets[0] == 0 assert offsets[1] == 3 assert offsets[2] == 11 diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index f98030621..837ceb323 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -59,7 +59,7 @@ def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm): ("Sprogteknologi a/s", 2), ("De boede i A/B Bellevue", 5), # note: skipping due to weirdness in UD_Danish-DDT - #("Rotorhastigheden er 3400 o/m.", 5), + # ("Rotorhastigheden er 3400 o/m.", 5), ("Jeg købte billet t/r.", 5), ("Murerarbejdsmand m/k søges", 3), ("Netværket kører over TCP/IP", 4), diff --git a/spacy/tests/lang/eu/test_text.py b/spacy/tests/lang/eu/test_text.py index e73917ffa..f448a7859 100644 --- a/spacy/tests/lang/eu/test_text.py +++ b/spacy/tests/lang/eu/test_text.py @@ -10,7 +10,13 @@ def test_eu_tokenizer_handles_long_text(eu_tokenizer): assert len(tokens) == 5 
-@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)]) +@pytest.mark.parametrize( + "text,length", + [ + ("milesker ederra joan zen hitzaldia plazer hutsa", 7), + ("astelehen guztia sofan pasau biot", 5), + ], +) def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length): tokens = eu_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py index 2fceece49..1ac6bfc76 100644 --- a/spacy/tests/lang/hu/test_tokenizer.py +++ b/spacy/tests/lang/hu/test_tokenizer.py @@ -297,12 +297,7 @@ WIKI_TESTS = [ ] EXTRA_TESTS = ( - DOT_TESTS - + QUOTE_TESTS - + NUMBER_TESTS - + HYPHEN_TESTS - + WIKI_TESTS - + TYPO_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS ) # normal: default tests + 10% of extra tests @@ -311,7 +306,14 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0]) # slow: remaining 90% of extra tests SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0] -TESTS.extend([pytest.param(x[0], x[1], marks=pytest.mark.slow()) if not isinstance(x[0], tuple) else x for x in SLOW_TESTS]) +TESTS.extend( + [ + pytest.param(x[0], x[1], marks=pytest.mark.slow()) + if not isinstance(x[0], tuple) + else x + for x in SLOW_TESTS + ] +) @pytest.mark.parametrize("text,expected_tokens", TESTS) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index a826a0a0e..c0314f3c3 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,7 +6,8 @@ import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token -from ..doc.test_underscore import clean_underscore + +from ..doc.test_underscore import clean_underscore # noqa: F401 @pytest.fixture diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 3b46baa9b..b6e3c40c9 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -152,10 +152,5 @@ def test_entity_ruler_validate(nlp): def test_entity_ruler_properties(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - assert sorted(ruler.labels) == sorted([ - "HELLO", - "BYE", - "COMPLEX", - "TECH_ORG" - ]) + assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"]) assert sorted(ruler.ent_ids) == ["a1", "a2"] diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py index f80f19852..57675a202 100644 --- a/spacy/tests/regression/test_issue4725.py +++ b/spacy/tests/regression/test_issue4725.py @@ -23,4 +23,3 @@ def test_issue4725(): docs = ["Kurt is in London."] * 10 for _ in nlp.pipe(docs, batch_size=2, n_process=2): pass - diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py index 834219773..5c7ffc999 100644 --- a/spacy/tests/regression/test_issue4849.py +++ b/spacy/tests/regression/test_issue4849.py @@ -9,11 +9,12 @@ def test_issue4849(): nlp = English() ruler = EntityRuler( - nlp, patterns=[ - {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'}, - {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'}, + nlp, + patterns=[ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, ], - phrase_matcher_attr="LOWER" 
+ phrase_matcher_attr="LOWER", ) nlp.add_pipe(ruler) @@ -27,10 +28,10 @@ def test_issue4849(): count_ents = 0 for doc in nlp.pipe([text], n_process=1): count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert(count_ents == 2) + assert count_ents == 2 # USING 2 PROCESSES count_ents = 0 for doc in nlp.pipe([text], n_process=2): count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert (count_ents == 2) + assert count_ents == 2 diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 0e0816a55..cbe119225 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -22,7 +22,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): tokenizer_bytes = tokenizer.to_bytes() Tokenizer(en_vocab).from_bytes(tokenizer_bytes) - tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC", "ORTH": "."}]}) + tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}) tokenizer.rules = {} tokenizer_bytes = tokenizer.to_bytes() tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index a0d6273a9..4e1c50398 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -28,7 +28,9 @@ def make_tempdir(): shutil.rmtree(path2str(d)) -def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None): +def get_doc( + vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None +): """Create Doc object from given vocab, words and annotations.""" if deps and not heads: heads = [0] * len(deps) @@ -60,7 +62,7 @@ def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=No if attrs.ndim == 1: attrs[i] = heads[i] else: - attrs[i,j] = heads[i] + attrs[i, j] = heads[i] else: for i in range(len(words)): if attrs.ndim == 1: From d88a377bed122018dd54b4228f48b73bee6881b1 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 26 Mar 2020 10:45:47 +0100 Subject: [PATCH 037/105] Remove Vectors.from_glove (#5209) --- spacy/vectors.pyx | 38 ------------------------ website/docs/api/vectors.md | 19 ------------ website/docs/usage/vectors-similarity.md | 31 ------------------- 3 files changed, 88 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index c6526b89d..f8643640a 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -355,44 +355,6 @@ cdef class Vectors: for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) - def from_glove(self, path): - """Load GloVe vectors from a directory. Assumes binary format, - that the vocab is in a vocab.txt, and that vectors are named - vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32 - vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc. - By default GloVe outputs 64-bit vectors. - - path (unicode / Path): The path to load the GloVe vectors from. - RETURNS: A `StringStore` object, holding the key-to-string mapping. 
- - DOCS: https://spacy.io/api/vectors#from_glove - """ - path = util.ensure_path(path) - width = None - for name in path.iterdir(): - if name.parts[-1].startswith("vectors"): - _, dims, dtype, _2 = name.parts[-1].split('.') - width = int(dims) - break - else: - raise IOError(Errors.E061.format(filename=path)) - bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype) - xp = get_array_module(self.data) - self.data = None - with bin_loc.open("rb") as file_: - self.data = xp.fromfile(file_, dtype=dtype) - if dtype != "float32": - self.data = xp.ascontiguousarray(self.data, dtype="float32") - if self.data.ndim == 1: - self.data = self.data.reshape((self.data.size//width, width)) - n = 0 - strings = StringStore() - with (path / "vocab.txt").open("r") as file_: - for i, line in enumerate(file_): - key = strings.add(line.strip()) - self.add(key, row=i) - return strings - def to_disk(self, path, **kwargs): """Save the current state to a directory. diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 3588672db..93e747c1e 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -326,25 +326,6 @@ performed in chunks, to avoid consuming too much memory. You can set the | `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. | | **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. | -## Vectors.from_glove {#from_glove tag="method"} - -Load [GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory. -Assumes binary format, that the vocab is in a `vocab.txt`, and that vectors are -named `vectors.{size}.[fd.bin]`, e.g. `vectors.128.f.bin` for 128d float32 -vectors, `vectors.300.d.bin` for 300d float64 (double) vectors, etc. By default -GloVe outputs 64-bit vectors. - -> #### Example -> -> ```python -> vectors = Vectors() -> vectors.from_glove("/path/to/glove_vectors") -> ``` - -| Name | Type | Description | -| ------ | ---------------- | ---------------------------------------- | -| `path` | unicode / `Path` | The path to load the GloVe vectors from. | - ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory. diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-similarity.md index 93ba67704..9b65bb80a 100644 --- a/website/docs/usage/vectors-similarity.md +++ b/website/docs/usage/vectors-similarity.md @@ -177,37 +177,6 @@ for word, vector in vector_data.items(): vocab.set_vector(word, vector) ``` -### Loading GloVe vectors {#custom-loading-glove new="2"} - -spaCy comes with built-in support for loading -[GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory. The -[`Vectors.from_glove`](/api/vectors#from_glove) method assumes a binary format, -the vocab provided in a `vocab.txt`, and the naming scheme of -`vectors.{size}.[fd`.bin]. For example: - -```yaml -### Directory structure -└── vectors - ├── vectors.128.f.bin # vectors file - └── vocab.txt # vocabulary -``` - -| File name | Dimensions | Data type | -| ------------------- | ---------- | ---------------- | -| `vectors.128.f.bin` | 128 | float32 | -| `vectors.300.d.bin` | 300 | float64 (double) | - -```python -nlp = spacy.load("en_core_web_sm") -nlp.vocab.vectors.from_glove("/path/to/vectors") -``` - -If your instance of `Language` already contains vectors, they will be -overwritten. 
To create your own GloVe vectors model package like spaCy's -[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call -[`nlp.to_disk`](/api/language#to_disk), and then package the model using the -[`package`](/api/cli#package) command. - ### Using custom similarity methods {#custom-similarity} By default, [`Token.vector`](/api/token#vector) returns the vector for its From a04f8020993568e5677cdbce96e93c82cf6e012f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 26 Mar 2020 10:46:23 +0100 Subject: [PATCH 038/105] Fix GoldParse init when token count differs (#5191) Fix the `GoldParse` initialization when the number of tokens has changed (due to merging subtokens with the parser). --- spacy/scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 7b05b11fd..25c660240 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -231,7 +231,7 @@ class Scorer(object): """ if len(doc) != len(gold): gold = GoldParse.from_annot_tuples( - doc, tuple(zip(*gold.orig_annot)) + (gold.cats,) + doc, zip(*gold.orig_annot), cats=gold.cats, ) gold_deps = set() gold_deps_per_dep = {} From 8d3563f1c463852758a8fb323e8ddc7aa73b81bc Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 26 Mar 2020 10:46:50 +0100 Subject: [PATCH 039/105] Minor bugfixes for train CLI (#5186) * Omit per_type scores from model-best calculations The addition of per_type scores to the included metrics (#4911) causes errors when they're compared while determining the best model, so omit them for this `max()` comparison. * Add default speed data for interrupted train CLI Add better speed meta defaults so that an interrupted iteration still produces a best model. Co-authored-by: Ines Montani --- spacy/cli/train.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6408a6024..c94c26b62 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -566,6 +566,9 @@ def train( final_meta.setdefault("speed", {}) final_meta["speed"].setdefault("cpu", None) final_meta["speed"].setdefault("gpu", None) + meta.setdefault("speed", {}) + meta["speed"].setdefault("cpu", None) + meta["speed"].setdefault("gpu", None) # combine cpu and gpu speeds with the base model speeds if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: speed = _get_total_speed( @@ -673,6 +676,8 @@ def _find_best(experiment_dir, component): if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": accs = srsly.read_json(epoch_model / "accuracy.json") scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)] + # remove per_type dicts from score list for max() comparison + scores = [score for score in scores if isinstance(score, float)] accuracies.append((scores, epoch_model)) if accuracies: return max(accuracies)[1] From e53232533b788bb303108f07443b37529051ef14 Mon Sep 17 00:00:00 2001 From: Tiljander <35637838+Tiljander@users.noreply.github.com> Date: Thu, 26 Mar 2020 13:13:22 +0100 Subject: [PATCH 040/105] Describing priority rules for overlapping matches (#5197) * Describing priority rules for overlapping matches * Create Tiljander.md * Describing priority rules for overlapping matches * Update website/docs/api/entityruler.md Co-Authored-By: Ines Montani Co-authored-by: Ines Montani --- .github/contributors/Tiljander.md | 106 ++++++++++++++++++++++ website/docs/api/entityruler.md | 3 +- website/docs/usage/rule-based-matching.md | 5 +- 3 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 
.github/contributors/Tiljander.md diff --git a/.github/contributors/Tiljander.md b/.github/contributors/Tiljander.md new file mode 100644 index 000000000..89e70efa5 --- /dev/null +++ b/.github/contributors/Tiljander.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Henrik Tiljander | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 24/3/2020 | +| GitHub username | Tiljander | +| Website (optional) | | diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index af3db0dcb..0fd24897d 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -83,7 +83,8 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this happens automatically after the component has been added to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized with `overwrite_ents=True`, existing entities will be replaced if they overlap -with the matches. +with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer +patterns over shorter ones, and if they are equally long, the match occurring first in the Doc is chosen. > #### Example > diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 0ab74034e..1db2405d1 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -968,7 +968,10 @@ pattern. The entity ruler accepts two types of patterns: The [`EntityRuler`](/api/entityruler) is a pipeline component that's typically added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is called on a text, it will find matches in the `doc` and add them as entities to -the `doc.ents`, using the specified pattern label as the entity label. +the `doc.ents`, using the specified pattern label as the entity label. If any +matches overlap, the pattern matching the most tokens takes priority. If +they also happen to be equally long, then the match occurring first in the Doc is +chosen.
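The priority rules documented in this patch can be illustrated with a minimal sketch against the v2 `EntityRuler` API (not part of the patch itself; the labels, patterns and example text are invented for illustration):

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp)
ruler.add_patterns(
    [
        # a one-token phrase pattern
        {"label": "ORG", "pattern": "Apple"},
        # a two-token token pattern that overlaps the one above
        {"label": "PRODUCT", "pattern": [{"LOWER": "apple"}, {"LOWER": "watch"}]},
    ]
)
nlp.add_pipe(ruler)

doc = nlp("I bought an Apple Watch.")
# Both patterns match, but the longer PRODUCT match takes priority over the
# overlapping one-token ORG match.
print([(ent.text, ent.label_) for ent in doc.ents])
# expected: [('Apple Watch', 'PRODUCT')]
```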
```python ### {executable="true"} From d1ddfa1cb736f4a52d8073e99289d009bf7d5ad9 Mon Sep 17 00:00:00 2001 From: Nikhil Saldanha Date: Sat, 28 Mar 2020 18:13:02 +0100 Subject: [PATCH 041/105] Update docs for EntityRecognizer.predict The return type was wrongly documented as a tuple; changed to a list of syntax.StateClass objects. --- website/docs/api/entityrecognizer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 9a2766c07..9345ee249 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -105,7 +105,7 @@ Apply the pipeline's model to a batch of docs, without modifying them. | Name | Type | Description | | ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `docs` | iterable | The documents to predict. | -| **RETURNS** | tuple | A `(scores, tensors)` tuple where `scores` is the model's prediction for each document and `tensors` is the token representations used to predict the scores. Each tensor is an array with one row for each token in the document. | +| **RETURNS** | list | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). | ## EntityRecognizer.set_annotations {#set_annotations tag="method"} From be6d10517fd7059765c73ec30b5dc96382fbd786 Mon Sep 17 00:00:00 2001 From: Nikhil Saldanha Date: Sat, 28 Mar 2020 18:36:55 +0100 Subject: [PATCH 042/105] Sign contributor agreement --- .github/contributors/nikhilsaldanha.md | 106 +++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/nikhilsaldanha.md diff --git a/.github/contributors/nikhilsaldanha.md b/.github/contributors/nikhilsaldanha.md new file mode 100644 index 000000000..76b60beb6 --- /dev/null +++ b/.github/contributors/nikhilsaldanha.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2.
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Nikhil Saldanha | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020/03/28 | +| GitHub username | nikhilsaldanha | +| Website (optional) | | From 963bd890c1d3aa874b6da194c9b5316cffbce341 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 29 Mar 2020 13:51:20 +0200 Subject: [PATCH 043/105] Modify Vector.resize to work with cupy and improve resizing (#5216) * Modify Vector.resize to work with cupy Modify `Vectors.resize` to work with cupy. Modify behavior when resizing to a different vector dimension so that individual vectors are truncated or extended with zeros instead of having the original values filled into the new shape without regard for the original axes. * Update spacy/tests/vocab_vectors/test_vectors.py Co-Authored-By: Matthew Honnibal Co-authored-by: Matthew Honnibal --- spacy/errors.py | 1 + spacy/tests/vocab_vectors/test_vectors.py | 25 ++++++++++++++++------- spacy/vectors.pyx | 12 ++++++++--- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c751ad65a..b124fc88c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -551,6 +551,7 @@ class Errors(object): "array.") E191 = ("Invalid head: the head token must be from the same doc as the " "token itself.") + E192 = ("Unable to resize vectors in place with cupy.") @add_codes diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index b688ab9dd..8987b7c89 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -89,17 +89,28 @@ def test_init_vectors_with_resize_data(data, resize_data): assert v.shape != data.shape -def test_get_vector_resize(strings, data, resize_data): - v = Vectors(data=data) - v.resize(shape=resize_data.shape) +def test_get_vector_resize(strings, data): strings = [hash_string(s) for s in strings] + + # decrease vector dimension (truncate) + v = Vectors(data=data) + resized_dim = v.shape[1] - 1 + v.resize(shape=(v.shape[0], resized_dim)) for i, string in enumerate(strings): v.add(string, row=i) - assert list(v[strings[0]]) == list(resize_data[0]) - assert list(v[strings[0]]) != list(resize_data[1]) - assert list(v[strings[1]]) != list(resize_data[0]) - assert list(v[strings[1]]) == list(resize_data[1]) + assert list(v[strings[0]]) == list(data[0, :resized_dim]) + assert list(v[strings[1]]) == list(data[1, :resized_dim]) + + # increase vector dimension (pad with zeros) + v = Vectors(data=data) + resized_dim = v.shape[1] + 1 + v.resize(shape=(v.shape[0], resized_dim)) + for i, string in enumerate(strings): + v.add(string, row=i) + + assert list(v[strings[0]]) == list(data[0]) + [0] + assert list(v[strings[1]]) == list(data[1]) + [0] def test_init_vectors_with_data(strings, data): diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index f8643640a..5b8512970 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -198,11 +198,17 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#resize """ + xp = get_array_module(self.data) if inplace: - self.data.resize(shape, refcheck=False) + if xp == numpy: + self.data.resize(shape, refcheck=False) + else: + raise ValueError(Errors.E192) else: - xp = get_array_module(self.data) - self.data = xp.resize(self.data, shape) + resized_array = xp.zeros(shape, dtype=self.data.dtype) + copy_shape = (min(shape[0], self.data.shape[0]), min(shape[1], self.data.shape[1])) + 
resized_array[:copy_shape[0], :copy_shape[1]] = self.data[:copy_shape[0], :copy_shape[1]] + self.data = resized_array filled = {row for row in self.key2row.values()} self._unset = cppset[int]({row for row in range(shape[0]) if row not in filled}) removed_items = [] From e9049581159849bd4a710b9196cb0b78d5cf9dac Mon Sep 17 00:00:00 2001 From: Tom Milligan Date: Sun, 29 Mar 2020 12:52:08 +0100 Subject: [PATCH 044/105] Limit to cupy-cuda v8, so as not to pull in v9 automatically. (#5194) --- .github/contributors/tommilligan.md | 106 ++++++++++++++++++++++++++++ setup.cfg | 12 ++-- 2 files changed, 112 insertions(+), 6 deletions(-) create mode 100644 .github/contributors/tommilligan.md diff --git a/.github/contributors/tommilligan.md b/.github/contributors/tommilligan.md new file mode 100644 index 000000000..475df5afa --- /dev/null +++ b/.github/contributors/tommilligan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, + object code, patch, tool, sample, graphic, specification, manual, + documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and + registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment + to any third party, you hereby grant to us a perpetual, irrevocable, + non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your + contribution. The rights that you grant to us under these terms are effective + on the date you first submitted a contribution to us, even if your submission + took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + - Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + - to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + - each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable + U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT + mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +| ----------------------------- | ------------ | +| Name | Tom Milligan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-24 | +| GitHub username | tommilligan | +| Website (optional) | | diff --git a/setup.cfg b/setup.cfg index e44e32bb2..465367ff6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,17 +61,17 @@ install_requires = lookups = spacy_lookups_data>=0.0.5,<0.2.0 cuda = - cupy>=5.0.0b4 + cupy>=5.0.0b4,<9.0.0 cuda80 = - cupy-cuda80>=5.0.0b4 + cupy-cuda80>=5.0.0b4,<9.0.0 cuda90 = - cupy-cuda90>=5.0.0b4 + cupy-cuda90>=5.0.0b4,<9.0.0 cuda91 = - cupy-cuda91>=5.0.0b4 + cupy-cuda91>=5.0.0b4,<9.0.0 cuda92 = - cupy-cuda92>=5.0.0b4 + cupy-cuda92>=5.0.0b4,<9.0.0 cuda100 = - cupy-cuda100>=5.0.0b4 + cupy-cuda100>=5.0.0b4,<9.0.0 # Language tokenizers with external dependencies ja = fugashi>=0.1.3 From d47b810ba4f0e50ea5b377895974e0d3e3da828d Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 29 Mar 2020 13:52:34 +0200 Subject: [PATCH 045/105] Fix exclusive_classes in textcat ensemble (#5166) Pass the exclusive_classes setting to the bow model within the ensemble textcat model. 
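A minimal sketch of how the `exclusive_classes` setting is meant to flow through the v2 textcat config after this fix (the pipeline and labels here are invented for illustration, not code from the patch):

```python
from spacy.lang.en import English

nlp = English()
textcat = nlp.create_pipe(
    "textcat",
    config={
        "architecture": "ensemble",  # the default: CNN combined with bag-of-words
        "exclusive_classes": True,   # now also forwarded to the bow submodel
        "ngram_size": 1,
    },
)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.add_pipe(textcat)
# With exclusive_classes=True, both submodels use a softmax output, so the
# two class scores for a document should sum to approximately 1.0 after
# training; before this fix, the bow submodel silently ignored the setting.
```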
--- spacy/_ml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index fb7d39255..ee7e59218 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -693,9 +693,11 @@ def build_text_classifier(nr_class, width=64, **cfg): ) linear_model = build_bow_text_classifier( - nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False + nr_class, + ngram_size=cfg.get("ngram_size", 1), + exclusive_classes=cfg.get("exclusive_classes", False), ) - if cfg.get("exclusive_classes"): + if cfg.get("exclusive_classes", False): output_layer = Softmax(nr_class, nr_class * 2) else: output_layer = ( From 4f27a24f5b78283435de85bca40b844c15b2cf4e Mon Sep 17 00:00:00 2001 From: Nikhil Saldanha Date: Sun, 29 Mar 2020 13:54:42 +0200 Subject: [PATCH 046/105] Add kannada examples (#5162) * Add example sentences for Kannada * sign contributor agreement --- .github/contributors/nikhilsaldanha.md | 106 +++++++++++++++++++++++++ spacy/lang/kn/examples.py | 22 +++++ 2 files changed, 128 insertions(+) create mode 100644 .github/contributors/nikhilsaldanha.md create mode 100644 spacy/lang/kn/examples.py diff --git a/.github/contributors/nikhilsaldanha.md b/.github/contributors/nikhilsaldanha.md new file mode 100644 index 000000000..f8d37d709 --- /dev/null +++ b/.github/contributors/nikhilsaldanha.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Nikhil Saldanha | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-17 | +| GitHub username | nikhilsaldanha | +| Website (optional) | | diff --git a/spacy/lang/kn/examples.py b/spacy/lang/kn/examples.py new file mode 100644 index 000000000..d82630432 --- /dev/null +++ b/spacy/lang/kn/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.kn.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "ಆಪಲ್ ಒಂದು ಯು.ಕೆ. ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.", + "ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.", + "ಕಾಲುದಾರಿ ವಿತರಣಾ ರೋಬೋಟ್‌ಗಳನ್ನು ನಿಷೇಧಿಸುವುದನ್ನು ಸ್ಯಾನ್ ಫ್ರಾನ್ಸಿಸ್ಕೊ ​​ಪರಿಗಣಿಸುತ್ತದೆ.", + "ಲಂಡನ್ ಯುನೈಟೆಡ್ ಕಿಂಗ್‌ಡಂನ ದೊಡ್ಡ ನಗರ.", + "ನೀನು ಎಲ್ಲಿದಿಯಾ?", + "ಫ್ರಾನ್ಸಾದ ಅಧ್ಯಕ್ಷರು ಯಾರು?", + "ಯುನೈಟೆಡ್ ಸ್ಟೇಟ್ಸ್ನ ರಾಜಧಾನಿ ಯಾವುದು?", + "ಬರಾಕ್ ಒಬಾಮ ಯಾವಾಗ ಜನಿಸಿದರು?", +] From 0b76212831f8dad97af6a17d220d7dcdeb02aace Mon Sep 17 00:00:00 2001 From: Jacob Lauritzen Date: Thu, 2 Apr 2020 10:42:35 +0200 Subject: [PATCH 047/105] Extend and fix Danish examples (#5227) * Extend and fix Danish examples This PR fixes two examples, adds additional examples translated from the English version, and adds punctuation. The two changed examples are: * "fortov" changed to "fortovet", which is more [widely used](https://www.google.com/search?client=firefox-b-d&sxsrf=ALeKk0143gEuPe4IbIUpzBBt-oU10OMVqA%3A1585549036477&ei=7I6BXuvJHMGOrwSqi46oCQ&q=l%C3%B8behjul+p%C3%A5+fortov&oq=l%C3%B8behjul+p%C3%A5+fortov&gs_lcp=CgZwc3ktYWIQAzIECAAQRzIECAAQRzIECAAQRzIECAAQRzIECAAQRzIECAAQRzIECAAQRzIECAAQR1DT8xZY0_MWYK_0FmgAcAZ4AIABAIgBAJIBAJgBAKABAaoBB2d3cy13aXo&sclient=psy-ab&ved=0ahUKEwjr7964xsHoAhVBx4sKHaqFA5UQ4dUDCAo&uact=5) and more natural. The Swedish and Norwegian examples also use this version of the word. * "stor by" changed to "storby". In Danish we have a specific noun to describe a large, metropolitan city which is different from just describing a city as "large". In this sentence it would be much more natural to describe London as a "storby". Google even corrects a search for "London stor by" to "London storby". * Sign contrib agreement --- .github/contributors/jacse.md | 106 ++++++++++++++++++++++++++++ spacy/lang/da/examples.py | 13 +++-- 2 files changed, 114 insertions(+), 5 deletions(-) create mode 100644 .github/contributors/jacse.md diff --git a/.github/contributors/jacse.md b/.github/contributors/jacse.md new file mode 100644 index 000000000..7face10c3 --- /dev/null +++ b/.github/contributors/jacse.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/).
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Jacob Lauritzen |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2020-03-30 |
+| GitHub username | jacse |
+| Website (optional) | |
diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py
index b535191a1..525c6519c 100644
--- a/spacy/lang/da/examples.py
+++ b/spacy/lang/da/examples.py
@@ -9,10 +9,13 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """
 
-
 sentences = [
-    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
-    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
-    "San Francisco overvejer at forbyde udbringningsrobotter på fortov",
-    "London er en stor by i Storbritannien",
+    "Apple overvejer at købe et britisk startup for 1 milliard dollar.",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne.",
+    "San Francisco overvejer at forbyde udbringningsrobotter på fortovet.",
+    "London er en storby i Storbritannien.",
+    "Hvor er du?",
+    "Hvem er Frankrigs præsident?",
+    "Hvad er hovedstaden i USA?",
+    "Hvornår blev Barack Obama født?",
 ]

From d107afcffbf50aff63a7e15ecb3cf3f5a6fedbb7 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 2 Apr 2020 10:43:13 +0200
Subject: [PATCH 048/105] Raise error for inplace resize with new vector dim
 (#5228)

Raise an error if there is an attempt to resize the vectors in place with
a different vector dimension.
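For illustration, a minimal sketch of the new behavior (the shapes are
made up, and it assumes a spaCy v2.2.x build that already includes this
patch, with the default numpy storage for the vectors data):

    from spacy.vectors import Vectors

    # A table with 100 rows of 300-dimensional vectors.
    vectors = Vectors(shape=(100, 300))

    # Changing only the number of rows in place is still allowed.
    vectors.resize((200, 300), inplace=True)

    # Changing the vector dimension in place now raises E193 instead of
    # letting numpy's flat resize silently scramble the existing rows.
    try:
        vectors.resize((200, 128), inplace=True)
    except ValueError as err:
        print(err)  # [E193] Unable to resize vectors in place ...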
--- spacy/errors.py | 3 +++ spacy/vectors.pyx | 2 ++ 2 files changed, 5 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index b124fc88c..e0ddc86c5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -552,6 +552,9 @@ class Errors(object): E191 = ("Invalid head: the head token must be from the same doc as the " "token itself.") E192 = ("Unable to resize vectors in place with cupy.") + E193 = ("Unable to resize vectors in place if the resized vector dimension " + "({new_dim}) is not the same as the current vector dimension " + "({curr_dim}).") @add_codes diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 5b8512970..f3c20fb7f 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -200,6 +200,8 @@ cdef class Vectors: """ xp = get_array_module(self.data) if inplace: + if shape[1] != self.data.shape[1]: + raise ValueError(Errors.E193.format(new_dim=shape[1], curr_dim=self.data.shape[1])) if xp == numpy: self.data.resize(shape, refcheck=False) else: From 2b14997b68e2a737d9569926c3b13ee0870b4d76 Mon Sep 17 00:00:00 2001 From: Michael Leichtfried <22801077+leicmi@users.noreply.github.com> Date: Thu, 2 Apr 2020 14:47:42 +0200 Subject: [PATCH 049/105] Remove duplicated branch in if/else-if statement (#5234) * Remove duplicated branch in if-elif-statement * Add contributor agreement for leicmi --- .github/contributors/leicmi.md | 106 +++++++++++++++++++++++++++++++++ spacy/lemmatizer.py | 2 - 2 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/leicmi.md diff --git a/.github/contributors/leicmi.md b/.github/contributors/leicmi.md new file mode 100644 index 000000000..6a65a48f2 --- /dev/null +++ b/.github/contributors/leicmi.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Michael Leichtfried | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 30.03.2020 | +| GitHub username | leicmi | +| Website (optional) | | diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d70e4cfc4..33908eecf 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -98,8 +98,6 @@ class Lemmatizer(object): return True elif morphology.get("VerbForm") == "none": return True - elif morphology.get("VerbForm") == "inf": - return True elif morphology.get("Degree") == "pos": return True else: From 9cf965c26056065d6476b2a4336a42423bef3600 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 2 Apr 2020 15:04:15 +0200 Subject: [PATCH 050/105] avoid enumerate to avoid long waiting at 0% (#5159) --- .../wikipedia_processor.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index 315b1e916..ed3c35c43 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -479,11 +479,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard= if not labels_discard: labels_discard = [] - texts = [] - entities_list = [] + max_index = max(line_ids) - with entity_file_path.open("r", encoding="utf8") as file: - for i, line in enumerate(file): + with entity_file_path.open("r", encoding="utf8") as _file: + line = _file.readline() + i = 0 + while line and i < max_index: if i in line_ids: example = json.loads(line) article_id = example["article_id"] @@ -493,15 +494,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard= if dev != is_dev(article_id) or not is_valid_article(clean_text): continue - texts.append(clean_text) - entities_list.append(entities) - - docs = nlp.pipe(texts, batch_size=50) - - for doc, entities in zip(docs, entities_list): - gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard) - if gold and len(gold.links) > 0: - yield doc, gold + doc = nlp(clean_text) + gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard) + if gold and len(gold.links) > 0: + yield doc, gold + i += 1 + line = _file.readline() def _get_gold_parse(doc, entities, dev, kb, labels_discard): From 11374208404531da28b4e17d561e821a99f542bd Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 3 Apr 2020 13:01:43 +0200 Subject: [PATCH 051/105] Small doc fixes (#5250) * fix link * torchtext instead tochtext --- website/docs/usage/linguistic-features.md | 2 +- website/meta/universe.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 685619c88..59712939a 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1303,7 +1303,7 @@ with doc.retokenize() as retokenizer: ### Overwriting custom extension attributes {#retokenization-extensions} If you've registered custom -[extension attributes](/usage/processing-pipelines##custom-components-attributes), +[extension attributes](/usage/processing-pipelines#custom-components-attributes), you can overwrite them during tokenization by providing a dictionary of attribute names mapped to new values as the `"_"` key in the `attrs`. 
For merging, you need to provide one dictionary of attributes for the resulting diff --git a/website/meta/universe.json b/website/meta/universe.json index 23d052bb9..613648d8c 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -641,7 +641,7 @@ "tags": ["chatbots"] }, { - "id": "tochtext", + "id": "torchtext", "title": "torchtext", "slogan": "Data loaders and abstractions for text and NLP", "github": "pytorch/text", From beef184e53f5fed4721a69190958a6b0b4cf6a89 Mon Sep 17 00:00:00 2001 From: YohannesDatasci <62481491+YohannesDatasci@users.noreply.github.com> Date: Fri, 3 Apr 2020 15:02:18 +0400 Subject: [PATCH 052/105] Armenian language support (#5246) * add Armenian language and test cases * agreement submission --- .github/contributors/YohannesDatasci.md | 106 + spacy/lang/hy/__init__.py | 25 + spacy/lang/hy/examples.py | 16 + spacy/lang/hy/lex_attrs.py | 58 + spacy/lang/hy/stop_words.py | 110 + spacy/lang/hy/tag_map.py | 2478 +++++++++++++++++++++++ spacy/tests/conftest.py | 4 + spacy/tests/lang/hy/test_text.py | 10 + spacy/tests/lang/hy/test_tokenizer.py | 47 + 9 files changed, 2854 insertions(+) create mode 100644 .github/contributors/YohannesDatasci.md create mode 100644 spacy/lang/hy/__init__.py create mode 100644 spacy/lang/hy/examples.py create mode 100644 spacy/lang/hy/lex_attrs.py create mode 100644 spacy/lang/hy/stop_words.py create mode 100644 spacy/lang/hy/tag_map.py create mode 100644 spacy/tests/lang/hy/test_text.py create mode 100644 spacy/tests/lang/hy/test_tokenizer.py diff --git a/.github/contributors/YohannesDatasci.md b/.github/contributors/YohannesDatasci.md new file mode 100644 index 000000000..129c45576 --- /dev/null +++ b/.github/contributors/YohannesDatasci.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Yohannes |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2020-04-02 |
+| GitHub username | YohannesDatasci |
+| Website (optional) | |
\ No newline at end of file
diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py
new file mode 100644
index 000000000..3320edb6c
--- /dev/null
+++ b/spacy/lang/hy/__init__.py
@@ -0,0 +1,25 @@
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from .tag_map import TAG_MAP
+
+
+from ...attrs import LANG
+from ...language import Language
+from ...tokens import Doc
+
+
+class ArmenianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: "hy"
+
+    lex_attr_getters.update(LEX_ATTRS)
+    stop_words = STOP_WORDS
+    tag_map = TAG_MAP
+
+
+class Armenian(Language):
+    lang = "hy"
+    Defaults = ArmenianDefaults
+
+
+__all__ = ["Armenian"]
diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py
new file mode 100644
index 000000000..b0df31aae
--- /dev/null
+++ b/spacy/lang/hy/examples.py
@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.hy.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
+    "Ո՞վ է Ֆրանսիայի նախագահը։",
+    "Որն է Միացյալ Նահանգների մայրաքաղաքը։",
+    "Ե՞րբ է ծնվել Բարաք Օբաման։",
+]
diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py
new file mode 100644
index 000000000..7c1b9592f
--- /dev/null
+++ b/spacy/lang/hy/lex_attrs.py
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = [
+    "զրօ",
+    "մէկ",
+    "երկու",
+    "երեք",
+    "չորս",
+    "հինգ",
+    "վեց",
+    "յոթ",
+    "ութ",
+    "ինը",
+    "տասը",
+    "տասնմեկ",
+    "տասներկու",
+    "տասն­երեք",
+    "տասն­չորս",
+    "տասն­հինգ",
+    "տասն­վեց",
+    "տասն­յոթ",
+    "տասն­ութ",
+    "տասն­ինը",
+    "քսան", "երեսուն",
+    "քառասուն",
+    "հիսուն",
+    "վաթցսուն",
+    "յոթանասուն",
+    "ութսուն",
+    "ինիսուն",
+    "հարյուր",
+    "հազար",
+    "միլիոն",
+    "միլիարդ",
+    "տրիլիոն",
+    "քվինտիլիոն",
+]
+
+
+def like_num(text):
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text.lower() in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py
new file mode 100644
index 000000000..c671956a4
--- /dev/null
+++ b/spacy/lang/hy/stop_words.py
@@ -0,0 +1,110 @@
+from __future__ import unicode_literals
+
+
+STOP_WORDS = set(
+    """
+նա
+ողջը
+այստեղ
+ենք
+նա
+էիր
+որպես
+ուրիշ
+բոլորը
+այն
+այլ
+նույնչափ
+էի
+մի
+և
+ողջ
+ես
+ոմն
+հետ
+նրանք
+ամենքը
+ըստ
+ինչ-ինչ
+այսպես
+համայն
+մի
+նաև
+նույնքան
+դա
+ովևէ
+համար
+այնտեղ
+էին
+որոնք
+սույն
+ինչ-որ
+ամենը
+նույնպիսի
+ու
+իր
+որոշ
+միևնույն
+ի
+այնպիսի
+մենք
+ամեն ոք
+նույն
+երբևէ
+այն
+որևէ
+ին
+այդպես
+նրա
+որը
+վրա
+դու
+էինք
+այդպիսի
+էիք
+յուրաքանչյուրը
+եմ
+պիտի
+այդ
+ամբողջը
+հետո
+եք
+ամեն
+այլ
+կամ
+այսքան
+որ
+այնպես
+այսինչ
+բոլոր
+է
+մեկնումեկը
+այդչափ
+այնքան
+ամբողջ
+երբևիցե
+այնչափ
+ամենայն
+մյուս
+այնինչ
+իսկ
+այդտեղ
+այս
+սա
+են
+ամեն ինչ
+որևիցե
+ում
+մեկը
+այդ
+դուք
+այսչափ
+այդքան
+այսպիսի
+էր
+յուրաքանչյուր +այս +մեջ +թ +""".split() +) diff --git a/spacy/lang/hy/tag_map.py b/spacy/lang/hy/tag_map.py new file mode 100644 index 000000000..90690c22e --- /dev/null +++ b/spacy/lang/hy/tag_map.py @@ -0,0 +1,2478 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN +from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ + +TAG_MAP = { + "ADJ_Abbr=Yes": {POS: ADJ, "Abbr": "Yes"}, + "ADJ_Degree=Pos|NumForm=Word|NumType=Ord": { + POS: ADJ, + "Degree": "Pos", + "NumForm": "Word", + "NumType": "Ord", + }, + "ADJ_Degree=Pos": {POS: ADJ, "Degree": "Pos"}, + "ADJ_Degree=Pos|Style=Coll": {POS: ADJ, "Degree": "Pos", "Style": "Coll"}, + "ADJ_Degree=Pos|Style=Expr": {POS: ADJ, "Degree": "Pos", "Style": "Expr"}, + "ADJ_Degree=Sup": {POS: ADJ, "Degree": "Sup"}, + "ADJ_NumForm=Digit|NumType=Ord": {POS: ADJ, "NumForm": "Digit", "NumType": "Ord"}, + "ADJ_NumForm=Word|NumType=Card": {POS: ADJ, "NumForm": "Word", "NumType": "Card"}, + "ADJ_NumForm=Word|NumType=Ord": {POS: ADJ, "NumForm": "Word", "NumType": "Ord"}, + "ADJ_Style=Coll": {POS: ADJ, "Style": "Coll"}, + "ADJ_Style=Expr": {POS: ADJ, "Style": "Expr"}, + "ADP_AdpType=Post|Case=Dat": {POS: ADP, "AdpType": "Post", "Case": "Dat"}, + "ADP_AdpType=Post|Case=Nom": {POS: ADP, "AdpType": "Post", "Case": "Nom"}, + "ADP_AdpType=Post|Number=Plur|Person=3": { + POS: ADP, + "AdpType": "Post", + "Number": "Plur", + "Person": "3", + }, + "ADP_AdpType=Post": {POS: ADP, "AdpType": "Post"}, + "ADP_AdpType=Prep": {POS: ADP, "AdpType": "Prep"}, + "ADP_AdpType=Prep|Style=Arch": {POS: ADP, "AdpType": "Prep", "Style": "Arch"}, + "ADV_Degree=Cmp": {POS: ADV, "Degree": "Cmp"}, + "ADV_Degree=Pos": {POS: ADV, "Degree": "Pos"}, + "ADV_Degree=Sup": {POS: ADV, "Degree": "Sup"}, + "ADV_Distance=Dist|PronType=Dem": {POS: ADV, "Distance": "Dist", "PronType": "Dem"}, + "ADV_Distance=Dist|PronType=Exc": {POS: ADV, "Distance": "Dist", "PronType": "Exc"}, + "ADV_Distance=Med|PronType=Dem": {POS: ADV, "Distance": "Med", "PronType": "Dem"}, + "ADV_Distance=Med|PronType=Dem|Style=Coll": { + POS: ADV, + "Distance": "Med", + "PronType": "Dem", + "Style": "Coll", + }, + "ADV_NumForm=Word|NumType=Card|PronType=Tot": { + POS: ADV, + "NumForm": "Word", + "NumType": "Card", + "PronType": "Tot", + }, + "ADV_PronType=Dem": {POS: ADV, "PronType": "Dem"}, + "ADV_PronType=Exc": {POS: ADV, "PronType": "Exc"}, + "ADV_PronType=Ind": {POS: ADV, "PronType": "Ind"}, + "ADV_PronType=Int": {POS: ADV, "PronType": "Int"}, + "ADV_PronType=Int|Style=Coll": {POS: ADV, "PronType": "Int", "Style": "Coll"}, + "ADV_PronType=Rel": {POS: ADV, "PronType": "Rel"}, + "ADV_Style=Coll": {POS: ADV, "Style": "Coll"}, + "ADV_Style=Rare": {POS: ADV, "Style": "Rare"}, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "1", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "2", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Tense": "Imp", + "VerbForm": "Fin", + }, + 
"AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "2", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Imp|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Tense": "Imp", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin": { + POS: AUX, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Tense": "Pres", + "VerbForm": "Fin", + }, + "AUX_Aspect=Imp|VerbForm=Part": {POS: AUX, "Aspect": "Imp", "VerbForm": "Part"}, + "AUX_Aspect=Perf|VerbForm=Part": {POS: AUX, "Aspect": "Perf", "VerbForm": "Part"}, + "AUX_Aspect=Prosp|VerbForm=Part": {POS: AUX, "Aspect": "Prosp", "VerbForm": "Part"}, + "AUX_Polarity=Pos": {POS: AUX, "Polarity": "Pos"}, + "CCONJ_ConjType=Comp": {POS: CCONJ, "ConjType": "Comp"}, + "CCONJ_ConjType=Comp|Style=Coll": {POS: CCONJ, "ConjType": "Comp", "Style": 
"Coll"}, + "DET_Case=Gen|Distance=Med|Number=Plur|Poss=Yes|PronType=Dem": { + POS: DET, + "Case": "Gen", + "Distance": "Med", + "Number": "Plur", + "Poss": "Yes", + "PronType": "Dem", + }, + "DET_Case=Gen|Distance=Med|Number=Sing|Poss=Yes|PronType=Dem": { + POS: DET, + "Case": "Gen", + "Distance": "Med", + "Number": "Sing", + "Poss": "Yes", + "PronType": "Dem", + }, + "DET_Case=Gen|Number=Plur|Person=1|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Plur", + "Person": "1", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Plur|Person=2|Polite=Infm|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Plur", + "Person": "2", + "Polite": "Infm", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Plur|Person=3|Poss=Yes|PronType=Emp": { + POS: DET, + "Case": "Gen", + "Number": "Plur", + "Person": "3", + "Poss": "Yes", + "PronType": "Emp", + }, + "DET_Case=Gen|Number=Plur|Person=3|Poss=Yes|PronType=Emp|Reflex=Yes": { + POS: DET, + "Case": "Gen", + "Number": "Plur", + "Person": "3", + "Poss": "Yes", + "PronType": "Emp", + "Reflex": "Yes", + }, + "DET_Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "1", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Sing|Person=2|Polite=Infm|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "2", + "Polite": "Infm", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Emp": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "3", + "Poss": "Yes", + "PronType": "Emp", + }, + "DET_Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Emp|Reflex=Yes": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "3", + "Poss": "Yes", + "PronType": "Emp", + "Reflex": "Yes", + }, + "DET_Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Person": "3", + "Poss": "Yes", + "PronType": "Prs", + }, + "DET_Case=Gen|Number=Sing|Poss=Yes|PronType=Rel": { + POS: DET, + "Case": "Gen", + "Number": "Sing", + "Poss": "Yes", + "PronType": "Rel", + }, + "DET_Distance=Dist|PronType=Dem": {POS: DET, "Distance": "Dist", "PronType": "Dem"}, + "DET_Distance=Dist|PronType=Dem|Style=Coll": { + POS: DET, + "Distance": "Dist", + "PronType": "Dem", + "Style": "Coll", + }, + "DET_Distance=Dist|PronType=Dem|Style=Vrnc": { + POS: DET, + "Distance": "Dist", + "PronType": "Dem", + "Style": "Vrnc", + }, + "DET_Distance=Med|PronType=Dem": {POS: DET, "Distance": "Med", "PronType": "Dem"}, + "DET_Distance=Med|PronType=Dem|Style=Coll": { + POS: DET, + "Distance": "Med", + "PronType": "Dem", + "Style": "Coll", + }, + "DET_Distance=Prox|PronType=Dem": {POS: DET, "Distance": "Prox", "PronType": "Dem"}, + "DET_Distance=Prox|PronType=Dem|Style=Coll": { + POS: DET, + "Distance": "Prox", + "PronType": "Dem", + "Style": "Coll", + }, + "DET_PronType=Art": {POS: DET, "PronType": "Art"}, + "DET_PronType=Exc": {POS: DET, "PronType": "Exc"}, + "DET_PronType=Ind": {POS: DET, "PronType": "Ind"}, + "DET_PronType=Int": {POS: DET, "PronType": "Int"}, + "DET_PronType=Tot": {POS: DET, "PronType": "Tot"}, + "DET_PronType=Tot|Style=Arch": {POS: DET, "PronType": "Tot", "Style": "Arch"}, + "INTJ_Style=Vrnc": {POS: INTJ, "Style": "Vrnc"}, + "NOUN_Abbr=Yes|Animacy=Nhum|Case=Dat|Definite=Ind|Number=Plur": { + POS: NOUN, + "Abbr": "Yes", + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Plur", + }, + 
"NOUN_Abbr=Yes|Animacy=Nhum|Case=Nom|Definite=Ind|Number=Sing": { + POS: NOUN, + "Abbr": "Yes", + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Abl|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Abl|Definite=Ind|Number=Plur|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Plur", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Abl|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Def|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Def", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Def|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Def|Number=Sing|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Assoc": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Assoc", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Plur|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Plur", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Plur|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Plur", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Sing|Style=Arch": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "Style": "Arch", + }, + "NOUN_Animacy=Hum|Case=Dat|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Hum|Case=Dat|Number=Sing|Number=Sing|Person=1|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Dat", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Ins|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Def|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Def|Number=Plur|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "Number": "Plur", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Def|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Def|Number=Sing|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Assoc": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Assoc", + }, + 
"NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur|Style=Slng": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + "Style": "Slng", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur|Typo=Yes": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + "Typo": "Yes", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Sing|Style=Coll": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + "Style": "Coll", + }, + "NOUN_Animacy=Hum|Case=Nom|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Hum", + "Case": "Nom", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Sing|Style=Arch": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Sing", + "Style": "Arch", + }, + "NOUN_Animacy=Nhum|Case=Abl|Number=Sing|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Abl", + "Number": "Sing", + "Number": "Sing", + "Person": "2", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing|NumForm=Digit": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "NumForm": "Digit", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing|NumForm=Word": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "NumForm": "Word", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing|Style=Rare": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "Style": "Rare", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Sing|Style=Vrnc": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "Style": "Vrnc", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": 
"Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Sing|NumForm=Digit": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "NumForm": "Digit", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Sing|Style=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "Style": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Sing|Style=Vrnc": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "Style": "Vrnc", + }, + "NOUN_Animacy=Nhum|Case=Dat|Number=Coll|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Number": "Coll", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Dat|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Dat|Number=Sing|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Dat", + "Number": "Sing", + "Number": "Sing", + "Person": "2", + }, + "NOUN_Animacy=Nhum|Case=Gen|Definite=Ind|Number=Sing|Style=Arch": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Gen", + "Definite": "Ind", + "Number": "Sing", + "Style": "Arch", + }, + "NOUN_Animacy=Nhum|Case=Ins|Definite=Ind|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Ins|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Ins|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Ins|Definite=Ind|Number=Sing|Style=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Definite": "Ind", + "Number": "Sing", + "Style": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Ins|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Ins", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Loc|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Loc", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Loc|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Loc", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Loc|Number=Sing|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Loc", + "Number": "Sing", + "Number": "Sing", + "Person": "2", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Plur|Number=Sing|Poss=Yes": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Plur", + "Number": "Sing", + "Poss": "Yes", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Sing|NumForm=Digit": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + "NumForm": "Digit", + }, + 
"NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Coll": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Coll", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Coll|Typo=Yes": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Coll", + "Typo": "Yes", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Plur": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Plur", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Sing": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + }, + "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + }, + "NOUN_Animacy=Nhum|Case=Nom|Number=Plur|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Number": "Plur", + "Number": "Sing", + "Person": "2", + }, + "NOUN_Animacy=Nhum|Case=Nom|Number=Sing|Number=Sing|Person=1": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Number": "Sing", + "Number": "Sing", + "Person": "1", + }, + "NOUN_Animacy=Nhum|Case=Nom|Number=Sing|Number=Sing|Person=2": { + POS: NOUN, + "Animacy": "Nhum", + "Case": "Nom", + "Number": "Sing", + "Number": "Sing", + "Person": "2", + }, + "NUM_NumForm=Digit|NumType=Card": {POS: NUM, "NumForm": "Digit", "NumType": "Card"}, + "NUM_NumForm=Digit|NumType=Frac|Typo=Yes": { + POS: NUM, + "NumForm": "Digit", + "NumType": "Frac", + "Typo": "Yes", + }, + "NUM_NumForm=Digit|NumType=Range": { + POS: NUM, + "NumForm": "Digit", + "NumType": "Range", + }, + "NUM_NumForm=Word|NumType=Card": {POS: NUM, "NumForm": "Word", "NumType": "Card"}, + "NUM_NumForm=Word|NumType=Dist": {POS: NUM, "NumForm": "Word", "NumType": "Dist"}, + "NUM_NumForm=Word|NumType=Range": {POS: NUM, "NumForm": "Word", "NumType": "Range"}, + "PART_Polarity=Neg": {POS: PART, "Polarity": "Neg"}, + "PRON_Case=Abl|Definite=Ind|Number=Sing|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Abl", + "Definite": "Ind", + "Number": "Sing", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Abl|Number=Plur|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Abl", + "Number": "Plur", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Abl|Number=Sing|Person=2|Polite=Infm|PronType=Prs": { + POS: PRON, + "Case": "Abl", + "Number": "Sing", + "Person": "2", + "Polite": "Infm", + "PronType": "Prs", + }, + "PRON_Case=Dat|Definite=Def|Distance=Dist|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Dat", + "Definite": "Def", + "Distance": "Dist", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Dat|Definite=Def|Number=Sing|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Definite": "Def", + "Number": "Sing", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Dat|Definite=Ind|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Dat", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Int", + }, + "PRON_Case=Dat|Distance=Dist|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Dat", + "Distance": "Dist", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Dat|Distance=Med|Number=Plur|PronType=Dem": { + POS: PRON, + "Case": "Dat", + "Distance": "Med", + "Number": "Plur", + "PronType": "Dem", + }, + "PRON_Case=Dat|Number=Plur|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + 
"Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Plur|Person=2|Polite=Infm|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + "Person": "2", + "Polite": "Infm", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Plur|Person=3|PronType=Emp|Reflex=Yes": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + "Person": "3", + "PronType": "Emp", + "Reflex": "Yes", + }, + "PRON_Case=Dat|Number=Plur|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Plur|PronType=Rcp": { + POS: PRON, + "Case": "Dat", + "Number": "Plur", + "PronType": "Rcp", + }, + "PRON_Case=Dat|Number=Sing|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Sing|Person=2|Polite=Infm|PronType=Prs": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "Person": "2", + "Polite": "Infm", + "PronType": "Prs", + }, + "PRON_Case=Dat|Number=Sing|Person=3|PronType=Emp": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "Person": "3", + "PronType": "Emp", + }, + "PRON_Case=Dat|Number=Sing|Person=3|PronType=Emp|Reflex=Yes": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "Person": "3", + "PronType": "Emp", + "Reflex": "Yes", + }, + "PRON_Case=Dat|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "PronType": "Int", + }, + "PRON_Case=Dat|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Dat", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Dat|PronType=Tot": {POS: PRON, "Case": "Dat", "PronType": "Tot"}, + "PRON_Case=Gen|Distance=Med|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Gen", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Gen|Number=Plur|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Gen", + "Number": "Plur", + "Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Gen|Number=Sing|Person=2|PronType=Prs": { + POS: PRON, + "Case": "Gen", + "Number": "Sing", + "Person": "2", + "PronType": "Prs", + }, + "PRON_Case=Gen|Number=Sing|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Gen", + "Number": "Sing", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Gen|PronType=Tot": {POS: PRON, "Case": "Gen", "PronType": "Tot"}, + "PRON_Case=Ins|Definite=Ind|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Ins", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Ins|Distance=Med|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Ins", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Loc|Definite=Ind|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Loc", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Loc|Distance=Med|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Loc", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Nom|Definite=Def|Distance=Dist|Number=Plur|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Definite": "Def", + "Distance": "Dist", + "Number": "Plur", + "PronType": "Dem", + }, + "PRON_Case=Nom|Definite=Def|Distance=Med|Number=Sing|PronType=Dem|Style=Coll": { + POS: PRON, + "Case": "Nom", + "Definite": "Def", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + "Style": "Coll", + }, + "PRON_Case=Nom|Definite=Def|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + "PronType": "Int", + }, + 
"PRON_Case=Nom|Definite=Def|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Nom", + "Definite": "Def", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Nom|Definite=Ind|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Int", + }, + "PRON_Case=Nom|Definite=Ind|Number=Sing|PronType=Neg": { + POS: PRON, + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Neg", + }, + "PRON_Case=Nom|Definite=Ind|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Nom", + "Definite": "Ind", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Nom|Distance=Dist|Number=Plur|Person=1|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Distance": "Dist", + "Number": "Plur", + "Person": "1", + "PronType": "Dem", + }, + "PRON_Case=Nom|Distance=Med|Number=Plur|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Distance": "Med", + "Number": "Plur", + "PronType": "Dem", + }, + "PRON_Case=Nom|Distance=Med|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Distance": "Med", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Nom|Distance=Prox|Number=Sing|PronType=Dem": { + POS: PRON, + "Case": "Nom", + "Distance": "Prox", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Case=Nom|Number=Plur|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Plur", + "Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Plur|Person=3|PronType=Emp": { + POS: PRON, + "Case": "Nom", + "Number": "Plur", + "Person": "3", + "PronType": "Emp", + }, + "PRON_Case=Nom|Number=Plur|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Plur", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Plur|PronType=Rel": { + POS: PRON, + "Case": "Nom", + "Number": "Plur", + "PronType": "Rel", + }, + "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Number": "Plur", + "Person": "3", + "Person": "1", + "PronType": "Emp", + }, + "PRON_Case=Nom|Number=Sing|Person=1|PronType=Int": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "1", + "PronType": "Int", + }, + "PRON_Case=Nom|Number=Sing|Person=1|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "1", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "2", + "Polite": "Infm", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Sing|Person=3|PronType=Emp": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "3", + "PronType": "Emp", + }, + "PRON_Case=Nom|Number=Sing|Person=3|PronType=Prs": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "Person": "3", + "PronType": "Prs", + }, + "PRON_Case=Nom|Number=Sing|PronType=Int": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "PronType": "Int", + }, + "PRON_Case=Nom|Number=Sing|PronType=Rel": { + POS: PRON, + "Case": "Nom", + "Number": "Sing", + "PronType": "Rel", + }, + "PRON_Case=Nom|Person=1|PronType=Tot": { + POS: PRON, + "Case": "Nom", + "Person": "1", + "PronType": "Tot", + }, + "PRON_Case=Nom|PronType=Ind": {POS: PRON, "Case": "Nom", "PronType": "Ind"}, + "PRON_Case=Nom|PronType=Tot": {POS: PRON, "Case": "Nom", "PronType": "Tot"}, + "PRON_Distance=Dist|Number=Sing|PronType=Dem": { + POS: PRON, + "Distance": "Dist", + "Number": "Sing", + "PronType": "Dem", + }, + "PRON_Distance=Med|PronType=Dem|Style=Coll": { + POS: PRON, + "Distance": "Med", + 
"PronType": "Dem", + "Style": "Coll", + }, + "PRON_Distance=Prox|PronType=Dem|Style=Coll": { + POS: PRON, + "Distance": "Prox", + "PronType": "Dem", + "Style": "Coll", + }, + "PRON_Number=Plur|PronType=Rel": {POS: PRON, "Number": "Plur", "PronType": "Rel"}, + "PROPN_Abbr=Yes|Animacy=Hum|Case=Nom|Definite=Ind|NameType=Giv|Number=Sing": { + POS: PROPN, + "Abbr": "Yes", + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Giv", + "Number": "Sing", + }, + "PROPN_Abbr=Yes|Animacy=Nhum|Case=Nom|Definite=Ind|NameType=Com|Number=Sing": { + POS: PROPN, + "Abbr": "Yes", + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Com", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Dat|Definite=Def|NameType=Sur|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Def", + "NameType": "Sur", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Dat|Definite=Ind|NameType=Prs|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "NameType": "Prs", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Dat|Definite=Ind|NameType=Sur|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Dat", + "Definite": "Ind", + "NameType": "Sur", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Nom|Definite=Def|NameType=Giv|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "NameType": "Giv", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Nom|Definite=Def|NameType=Sur|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Def", + "NameType": "Sur", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Nom|Definite=Ind|NameType=Giv|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Giv", + "Number": "Sing", + }, + "PROPN_Animacy=Hum|Case=Nom|Definite=Ind|NameType=Sur|Number=Sing": { + POS: PROPN, + "Animacy": "Hum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Sur", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Abl|Definite=Ind|NameType=Geo|Number=Coll": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Coll", + }, + "PROPN_Animacy=Nhum|Case=Abl|Definite=Ind|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Plur": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Abl", + "Definite": "Ind", + "Number": "Plur", + }, + "PROPN_Animacy=Nhum|Case=Dat|Definite=Ind|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Dat|Definite=Ind|NameType=Geo|Number=Sing|Style=Coll": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Dat", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + "Style": "Coll", + }, + "PROPN_Animacy=Nhum|Case=Loc|Definite=Ind|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Loc", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Def|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Def|NameType=Pro|Number=Sing|Style=Coll": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Def", + "NameType": "Pro", + "Number": 
"Sing", + "Style": "Coll", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Ind|NameType=Geo|Number=Coll": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Coll", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Ind|NameType=Geo|Number=Sing": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + }, + "PROPN_Animacy=Nhum|Case=Nom|Definite=Ind|NameType=Geo|Number=Sing|Style=Vrnc": { + POS: PROPN, + "Animacy": "Nhum", + "Case": "Nom", + "Definite": "Ind", + "NameType": "Geo", + "Number": "Sing", + "Style": "Vrnc", + }, + "SCONJ_Style=Coll": {POS: SCONJ, "Style": "Coll"}, + "VERB_Aspect=Dur|Polarity=Neg|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Neg", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Dur|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Dur|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Dur|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Dur|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Dur", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "2", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + 
"VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Style=Coll|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Style": "Coll", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Style=Vrnc|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Style": "Vrnc", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Intr", + "VerbForm": "Part", + }, + "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Imp|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Imp|Subcat=Tran|VerbForm=Part|Voice=Cau": { + POS: VERB, + "Aspect": "Imp", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Cau", + }, + "VERB_Aspect=Iter|Case=Ins|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Aspect": "Iter", + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Aspect=Iter|Case=Ins|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Aspect": "Iter", + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Aspect=Iter": {POS: VERB, "Aspect": "Iter"}, + "VERB_Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + 
"Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Style=Vrnc|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Style": "Vrnc", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Style=Vrnc|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Style": "Vrnc", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Act", 
+ }, + "VERB_Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Past|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Mood": "Ind", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Past", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Polarity=Neg|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Neg", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Perf|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Perf|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Perf|Polarity=Pos|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Polarity": "Pos", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Perf", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Perf|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Perf", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + "VERB_Aspect=Perf|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Perf", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Perf|Subcat=Tran|VerbForm=Part|Voice=Cau": { + POS: VERB, + "Aspect": "Perf", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Cau", + }, + "VERB_Aspect=Prog|Subcat=Intr|VerbForm=Conv|Voice=Mid": { + POS: VERB, + "Aspect": "Prog", + "Subcat": "Intr", + "VerbForm": "Conv", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Connegative=Yes|Mood=Cnd|Subcat=Tran|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Connegative": "Yes", + "Mood": "Cnd", + "Subcat": "Tran", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Plur|Person=3|Polarity=Pos|Style=Vrnc|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Style": "Vrnc", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Plur|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=1|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=2|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + 
"Mood": "Cnd", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Pass": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Pass", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Cnd|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Cnd", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Imp|Number=Sing|Person=2|Subcat=Intr|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Imp", + "Number": "Sing", + "Person": "2", + "Subcat": "Intr", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Imp|Number=Sing|Person=2|Subcat=Tran|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Imp", + "Number": "Sing", + "Person": "2", + "Subcat": "Tran", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Plur|Person=1|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Plur", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Plur|Person=3|Polarity=Neg|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Plur", + "Person": "3", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Plur|Person=3|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Plur", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=1|Polarity=Neg|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=1|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=1|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": 
"Sub", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=1|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "1", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=2|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=2|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "2", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Imp|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|Tense=Pres|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Intr|VerbForm=Fin|Voice=Pass": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Fin", + "Voice": "Pass", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Imp|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Imp", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Number=Sing|Person=3|Polarity=Pos|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Number": "Sing", + "Person": "3", + "Polarity": "Pos", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Mood=Sub|Person=1|Polarity=Neg|Subcat=Tran|Tense=Pres|VerbForm=Fin|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Mood": "Sub", + "Person": "1", + "Polarity": "Neg", + "Subcat": "Tran", + "Tense": "Pres", + "VerbForm": "Fin", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Aspect=Prosp|Subcat=Intr|VerbForm=Part|Voice=Mid": { + POS: VERB, + "Aspect": "Prosp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Mid", + }, + "VERB_Aspect=Prosp|Subcat=Intr|VerbForm=Part|Voice=Pass": { + POS: VERB, + "Aspect": "Prosp", + "Subcat": "Intr", + "VerbForm": "Part", + "Voice": "Pass", + }, + 
"VERB_Aspect=Prosp|Subcat=Tran|VerbForm=Part|Voice=Act": { + POS: VERB, + "Aspect": "Prosp", + "Subcat": "Tran", + "VerbForm": "Part", + "Voice": "Act", + }, + "VERB_Case=Abl|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Abl", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Abl|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Pass": { + POS: VERB, + "Case": "Abl", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Pass", + }, + "VERB_Case=Abl|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Case": "Abl", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Case=Dat|Definite=Def|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Dat", + "Definite": "Def", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Dat|Definite=Ind|Number=Coll|Polarity=Neg|Subcat=Intr|VerbForm=Gdv|Voice=Pass": { + POS: VERB, + "Case": "Dat", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Neg", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Pass", + }, + "VERB_Case=Dat|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Dat", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Dat|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Case": "Dat", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Case=Ins|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Ins|Definite=Ind|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Case": "Ins", + "Definite": "Ind", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Case=Nom|Definite=Def|Number=Coll|Polarity=Pos|Subcat=Intr|VerbForm=Gdv|Voice=Mid": { + POS: VERB, + "Case": "Nom", + "Definite": "Def", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Gdv", + "Voice": "Mid", + }, + "VERB_Case=Nom|Definite=Def|Number=Coll|Polarity=Pos|Subcat=Tran|VerbForm=Gdv|Voice=Act": { + POS: VERB, + "Case": "Nom", + "Definite": "Def", + "Number": "Coll", + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Gdv", + "Voice": "Act", + }, + "VERB_Mood=Imp|Number=Sing|Person=2|Subcat=Intr|VerbForm=Fin|Voice=Mid": { + POS: VERB, + "Mood": "Imp", + "Number": "Sing", + "Person": "2", + "Subcat": "Intr", + "VerbForm": "Fin", + "Voice": "Mid", + }, + "VERB_Polarity=Neg|Subcat=Intr|VerbForm=Inf|Voice=Mid": { + POS: VERB, + "Polarity": "Neg", + "Subcat": "Intr", + "VerbForm": "Inf", + "Voice": "Mid", + }, + "VERB_Polarity=Pos|Style=Coll|Subcat=Tran|VerbForm=Inf|Voice=Act": { + POS: VERB, + "Polarity": "Pos", + "Style": "Coll", + "Subcat": "Tran", + "VerbForm": "Inf", + "Voice": "Act", + }, + "VERB_Polarity=Pos|Style=Vrnc|Subcat=Tran|VerbForm=Inf|Voice=Act": { + POS: VERB, + 
"Polarity": "Pos", + "Style": "Vrnc", + "Subcat": "Tran", + "VerbForm": "Inf", + "Voice": "Act", + }, + "VERB_Polarity=Pos|Subcat=Intr|VerbForm=Inf|Voice=Mid": { + POS: VERB, + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Inf", + "Voice": "Mid", + }, + "VERB_Polarity=Pos|Subcat=Intr|VerbForm=Inf|Voice=Pass": { + POS: VERB, + "Polarity": "Pos", + "Subcat": "Intr", + "VerbForm": "Inf", + "Voice": "Pass", + }, + "VERB_Polarity=Pos|Subcat=Tran|Typo=Yes|VerbForm=Inf|Voice=Act": { + POS: VERB, + "Polarity": "Pos", + "Subcat": "Tran", + "Typo": "Yes", + "VerbForm": "Inf", + "Voice": "Act", + }, + "VERB_Polarity=Pos|Subcat=Tran|VerbForm=Inf|Voice=Act": { + POS: VERB, + "Polarity": "Pos", + "Subcat": "Tran", + "VerbForm": "Inf", + "Voice": "Act", + }, + "X_Foreign=Yes": {POS: X, "Foreign": "Yes"}, + "X_Style=Vrnc": {POS: X, "Style": "Vrnc"}, +} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index fc89c2658..43c3152a0 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -234,3 +234,7 @@ def yo_tokenizer(): def zh_tokenizer(): pytest.importorskip("jieba") return get_lang_class("zh").Defaults.create_tokenizer() + +@pytest.fixture(scope="session") +def hy_tokenizer(): + return get_lang_class("hy").Defaults.create_tokenizer() \ No newline at end of file diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py new file mode 100644 index 000000000..6b785bdfc --- /dev/null +++ b/spacy/tests/lang/hy/test_text.py @@ -0,0 +1,10 @@ +from __future__ import unicode_literals + +import pytest +from spacy.lang.hy.lex_attrs import like_num + + +@pytest.mark.parametrize("word", ["հիսուն"]) +def test_hy_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) diff --git a/spacy/tests/lang/hy/test_tokenizer.py b/spacy/tests/lang/hy/test_tokenizer.py new file mode 100644 index 000000000..424fb886f --- /dev/null +++ b/spacy/tests/lang/hy/test_tokenizer.py @@ -0,0 +1,47 @@ +from __future__ import unicode_literals + +import pytest + + +# TODO add test cases with valid punctuation signs. 
+ +hy_tokenize_text_test = [ + ( + "Մետաղագիտությունը պայմանականորեն բաժանվում է տեսականի և կիրառականի (տեխնիկական)", + [ + "Մետաղագիտությունը", + "պայմանականորեն", + "բաժանվում", + "է", + "տեսականի", + "և", + "կիրառականի", + "(", + "տեխնիկական", + ")", + ], + ), + ( + "Գետաբերանը գտնվում է Օմոլոնա գետի ձախ ափից 726 կմ հեռավորության վրա", + [ + "Գետաբերանը", + "գտնվում", + "է", + "Օմոլոնա", + "գետի", + "ձախ", + "ափից", + "726", + "կմ", + "հեռավորության", + "վրա", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", hy_tokenize_text_test) +def test_hy_tokenizer_handles_exception_cases(hy_tokenizer, text, expected_tokens): + tokens = hy_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list From ddf3c2430d2a6ea4e1d3ed6ac15740f116134cea Mon Sep 17 00:00:00 2001 From: nlptechbook <60931109+nlptechbook@users.noreply.github.com> Date: Fri, 3 Apr 2020 12:10:03 -0400 Subject: [PATCH 053/105] Update universe.json --- website/meta/universe.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 8f8bcfecd..6c9fc0340 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1624,8 +1624,9 @@ "title": "pic2phrase_bot: Photo Description Generator", "slogan": "A bot that generates descriptions to submitted photos, in a human-like manner.", "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy.", - "thumb": "https://drive.google.com/open?id=1GTrpPzc8j4mAmYCJZibYrADAp0GWcVHd", - "image": "https://drive.google.com/open?id=1t7URKJ-4uOJmZb_GbNvw-F5LLtvEoBRy", + "thumb": "https://i.imgur.com/ggVI02O.jpg", + "image": "https://i.imgur.com/z1yhWQR.jpg", + "url": "https://telegram.me/pic2phrase_bot", "author": "Yuli Vasiliev", "author_links": { "twitter": "VasilievYuli", From 406d5748b39919dc3996fad1e41e92735f50be45 Mon Sep 17 00:00:00 2001 From: Muhammad Irfan Date: Sun, 5 Apr 2020 20:55:38 +0500 Subject: [PATCH 054/105] add missing Urdu tags --- spacy/lang/ur/tag_map.py | 126 ++++++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 49 deletions(-) diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py index 2499d7e3e..eebd3a14a 100644 --- a/spacy/lang/ur/tag_map.py +++ b/spacy/lang/ur/tag_map.py @@ -1,66 +1,94 @@ # coding: utf8 from __future__ import unicode_literals +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON TAG_MAP = { + "JJ-Ez": {POS: ADJ}, + "INJC": {POS: X}, + "QFC": {POS: DET}, + "UNK": {POS: X}, + "NSTC": {POS: ADV}, + "NST": {POS: ADV}, + "VMC": {POS: VERB}, + "PRPC": {POS: PRON}, + "RBC": {POS: ADV}, + "PSPC": {POS: ADP}, + "INJ": {POS: X}, + "JJZ": {POS: ADJ}, + "CCC": {POS: SCONJ}, + "NN-Ez": {POS: NOUN}, + "ECH": {POS: NOUN}, + "WQ": {POS: DET}, + "RDP": {POS:
ADJ}, + "JJC": {POS: ADJ}, + "NEG": {POS: PART}, + "NNZ": {POS: NOUN}, + "QO": {POS: ADJ}, + "INTFC": {POS: ADV}, + "INTF": {POS: ADV}, + "NFC": {POS: ADP}, + "QCC": {POS: NUM}, + "QC": {POS: NUM}, + "QF": {POS: DET}, + "VAUX": {POS: AUX}, + "VM": {POS: VERB}, + "DEM": {POS: DET}, + "NNPC": {POS: PROPN}, + "NNC": {POS: NOUN}, + "PSP": {POS: ADP}, + + ".": {POS: PUNCT}, + ",": {POS: PUNCT}, + "-LRB-": {POS: PUNCT}, + "-RRB-": {POS: PUNCT}, + "``": {POS: PUNCT}, + '""': {POS: PUNCT}, + "''": {POS: PUNCT}, ":": {POS: PUNCT}, - "$": {POS: SYM, "Other": {"SymType": "currency"}}, - "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, - "AFX": {POS: ADJ, "Hyph": "yes"}, - "CC": {POS: CCONJ, "ConjType": "coor"}, - "CD": {POS: NUM, "NumType": "card"}, + "$": {POS: SYM}, + "#": {POS: SYM}, + "AFX": {POS: ADJ}, + "CC": {POS: CCONJ}, + "CD": {POS: NUM}, "DT": {POS: DET}, - "EX": {POS: ADV, "AdvType": "ex"}, - "FW": {POS: X, "Foreign": "yes"}, - "HYPH": {POS: PUNCT, "PunctType": "dash"}, + "EX": {POS: ADV}, + "FW": {POS: X}, + "HYPH": {POS: PUNCT}, "IN": {POS: ADP}, - "JJ": {POS: ADJ, "Degree": "pos"}, - "JJR": {POS: ADJ, "Degree": "comp"}, - "JJS": {POS: ADJ, "Degree": "sup"}, - "LS": {POS: PUNCT, "NumType": "ord"}, - "MD": {POS: VERB, "VerbType": "mod"}, + "JJ": {POS: ADJ}, + "JJR": {POS: ADJ}, + "JJS": {POS: ADJ}, + "LS": {POS: PUNCT}, + "MD": {POS: VERB}, "NIL": {POS: ""}, - "NN": {POS: NOUN, "Number": "sing"}, - "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, - "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, - "NNS": {POS: NOUN, "Number": "plur"}, - "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, - "POS": {POS: PART, "Poss": "yes"}, - "PRP": {POS: PRON, "PronType": "prs"}, - "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, - "RB": {POS: ADV, "Degree": "pos"}, - "RBR": {POS: ADV, "Degree": "comp"}, - "RBS": {POS: ADV, "Degree": "sup"}, + "NN": {POS: NOUN}, + "NNP": {POS: PROPN}, + "NNPS": {POS: PROPN}, + "NNS": {POS: NOUN}, + "PDT": {POS: ADJ}, + "POS": {POS: PART}, + "PRP": {POS: PRON}, + "PRP$": {POS: ADJ}, + "RB": {POS: ADV}, + "RBR": {POS: ADV}, + "RBS": {POS: ADV}, "RP": {POS: PART}, "SP": {POS: SPACE}, "SYM": {POS: SYM}, - "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, + "TO": {POS: PART}, "UH": {POS: INTJ}, - "VB": {POS: VERB, "VerbForm": "inf"}, - "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, - "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, - "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, - "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, - "VBZ": { - POS: VERB, - "VerbForm": "fin", - "Tense": "pres", - "Number": "sing", - "Person": 3, - }, - "WDT": {POS: ADJ, "PronType": "int|rel"}, - "WP": {POS: NOUN, "PronType": "int|rel"}, - "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, - "WRB": {POS: ADV, "PronType": "int|rel"}, + "VB": {POS: VERB}, + "VBD": {POS: VERB}, + "VBG": {POS: VERB}, + "VBN": {POS: VERB}, + "VBP": {POS: VERB}, + "VBZ": {POS: VERB}, + "WDT": {POS: ADJ}, + "WP": {POS: NOUN}, + "WP$": {POS: ADJ}, + "WRB": {POS: ADV}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, From f329d5663a3caca726bf820307448c361e346016 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Mon, 6 Apr 2020 11:29:30 +0200 Subject: [PATCH 055/105] add "whatlies" to spaCy universe (#5252) * Add "whatlies" We're releasing it on our side officially on the 16th of April. 
If possible, let's announce around the same time :) * sign contributor thing * Added fancy gif as the image * Update universe.json Spelling error and spaCy clarification. --- .github/contributors/koaning.md | 106 ++++++++++++++++++++++++++++++++ website/meta/universe.json | 28 +++++++++ 2 files changed, 134 insertions(+) create mode 100644 .github/contributors/koaning.md diff --git a/.github/contributors/koaning.md b/.github/contributors/koaning.md new file mode 100644 index 000000000..ddb28cab0 --- /dev/null +++ b/.github/contributors/koaning.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3.
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Vincent D. Warmerdam | +| Company name (if applicable) | | +| Title or role (if applicable) | Data Person | +| Date | 2020-03-01 | +| GitHub username | koaning | +| Website (optional) | https://koaning.io | diff --git a/website/meta/universe.json b/website/meta/universe.json index 613648d8c..bbd67e8a6 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,33 @@ { "resources": [ + { + "id": "whatlies", + "title": "whatlies", + "slogan": "Make interactive visualisations to figure out 'what lies' in word embeddings.", + "description": "This small library offers tools to make visualisation easier of both word embeddings as well as operations on them. It has support for spaCy prebuilt models as a first class citizen but also offers support for sense2vec. 
There's a convenient API to perform linear algebra as well as support for popular transformations like PCA/UMAP/etc.", + "github": "rasahq/whatlies", + "pip": "whatlies", + "thumb": "https://i.imgur.com/rOkOiLv.png", + "image": "https://raw.githubusercontent.com/RasaHQ/whatlies/master/docs/gif-two.gif", + "code_example": [ + "from whatlies import EmbeddingSet", + "from whatlies.language import SpacyLanguage", + "", + "lang = SpacyLanguage('en_core_web_md')", + "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', ', + 'king', 'queen', 'doctor', 'nurse']", + "", + "emb = lang[words]", + "emb.plot_interactive(x_axis='man', y_axis='woman')" + ], + "category": ["visualizers", "research"], + "author": "Vincent D. Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning", + "website": "https://koaning.io" + } + }, { "id": "spacy-stanza", "title": "spacy-stanza", From f4ef64a5264d4cd7f57059150cebeda388dd202d Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 6 Apr 2020 13:18:07 +0200 Subject: [PATCH 056/105] Improve tokenization for UD Dutch corpora (#5259) * Improve tokenization for UD Dutch corpora Improve tokenization for UD Dutch Alpino and LassySmall. * Format Dutch tokenizer exceptions --- spacy/lang/nl/__init__.py | 4 +- spacy/lang/nl/punctuation.py | 41 +- spacy/lang/nl/tokenizer_exceptions.py | 1890 +++++++++++++++++++++---- 3 files changed, 1611 insertions(+), 324 deletions(-) diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 074fd9133..407d23f73 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -5,7 +5,8 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_SUFFIXES from .lemmatizer import DutchLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -25,6 +26,7 @@ class DutchDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP + prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/nl/punctuation.py b/spacy/lang/nl/punctuation.py index a48ecc044..e7207038b 100644 --- a/spacy/lang/nl/punctuation.py +++ b/spacy/lang/nl/punctuation.py @@ -1,10 +1,14 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_UNITS, merge_chars +from ..char_classes import LIST_PUNCT, LIST_QUOTES, CURRENCY, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER -from ..punctuation import TOKENIZER_SUFFIXES as DEFAULT_TOKENIZER_SUFFIXES +from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES + + +_prefixes = [",,"] + BASE_TOKENIZER_PREFIXES # Copied from `de` package. Main purpose is to ensure that hyphens are not @@ -22,20 +26,33 @@ _infixes = ( r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), - r"(?<=[0-9])-(?=[0-9])", ] ) -# Remove "'s" suffix from suffix list. In Dutch, "'s" is a plural ending when -# it occurs as a suffix and a clitic for "eens" in standalone use. 
To avoid -# ambiguity it's better to just leave it attached when it occurs as a suffix. -default_suffix_blacklist = ("'s", "'S", "’s", "’S") -_suffixes = [ - suffix - for suffix in DEFAULT_TOKENIZER_SUFFIXES - if suffix not in default_suffix_blacklist -] +_list_units = [u for u in LIST_UNITS if u != "%"] +_units = merge_chars(" ".join(_list_units)) +_suffixes = ( + ["''"] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + ["—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) + + +TOKENIZER_PREFIXES = _prefixes TOKENIZER_INFIXES = _infixes TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py index dbdd104f3..c0915f127 100644 --- a/spacy/lang/nl/tokenizer_exceptions.py +++ b/spacy/lang/nl/tokenizer_exceptions.py @@ -16,317 +16,1585 @@ from ...symbols import ORTH # are extremely domain-specific. Tokenizer performance may benefit from some # slight pruning, although no performance regression has been observed so far. -# fmt: off -abbrevs = ['a.2d.', 'a.a.', 'a.a.j.b.', 'a.f.t.', 'a.g.j.b.', - 'a.h.v.', 'a.h.w.', 'a.hosp.', 'a.i.', 'a.j.b.', 'a.j.t.', - 'a.m.', 'a.m.r.', 'a.p.m.', 'a.p.r.', 'a.p.t.', 'a.s.', - 'a.t.d.f.', 'a.u.b.', 'a.v.a.', 'a.w.', 'aanbev.', - 'aanbev.comm.', 'aant.', 'aanv.st.', 'aanw.', 'vnw.', - 'aanw.vnw.', 'abd.', 'abm.', 'abs.', 'acc.act.', - 'acc.bedr.m.', 'acc.bedr.t.', 'achterv.', 'act.dr.', - 'act.dr.fam.', 'act.fisc.', 'act.soc.', 'adm.akk.', - 'adm.besl.', 'adm.lex.', 'adm.onderr.', 'adm.ov.', 'adv.', - 'adv.', 'gen.', 'adv.bl.', 'afd.', 'afl.', 'aggl.verord.', - 'agr.', 'al.', 'alg.', 'alg.richts.', 'amén.', 'ann.dr.', - 'ann.dr.lg.', 'ann.dr.sc.pol.', 'ann.ét.eur.', - 'ann.fac.dr.lg.', 'ann.jur.créd.', - 'ann.jur.créd.règl.coll.', 'ann.not.', 'ann.parl.', - 'ann.prat.comm.', 'app.', 'arb.', 'aud.', 'arbbl.', - 'arbh.', 'arbit.besl.', 'arbrb.', 'arr.', 'arr.cass.', - 'arr.r.v.st.', 'arr.verbr.', 'arrondrb.', 'art.', 'artw.', - 'aud.', 'b.', 'b.', 'b.&w.', 'b.a.', 'b.a.s.', 'b.b.o.', - 'b.best.dep.', 'b.br.ex.', 'b.coll.fr.gem.comm.', - 'b.coll.vl.gem.comm.', 'b.d.cult.r.', 'b.d.gem.ex.', - 'b.d.gem.reg.', 'b.dep.', 'b.e.b.', 'b.f.r.', - 'b.fr.gem.ex.', 'b.fr.gem.reg.', 'b.i.h.', 'b.inl.j.d.', - 'b.inl.s.reg.', 'b.j.', 'b.l.', 'b.o.z.', 'b.prov.r.', - 'b.r.h.', 'b.s.', 'b.sr.', 'b.stb.', 'b.t.i.r.', - 'b.t.s.z.', 'b.t.w.rev.', 'b.v.', - 'b.ver.coll.gem.gem.comm.', 'b.verg.r.b.', 'b.versl.', - 'b.vl.ex.', 'b.voorl.reg.', 'b.w.', 'b.w.gew.ex.', - 'b.z.d.g.', 'b.z.v.', 'bab.', 'bedr.org.', 'begins.', - 'beheersov.', 'bekendm.comm.', 'bel.', 'bel.besch.', - 'bel.w.p.', 'beleidsov.', 'belg.', 'grondw.', 'ber.', - 'ber.w.', 'besch.', 'besl.', 'beslagr.', 'bestuurswet.', - 'bet.', 'betr.', 'betr.', 'vnw.', 'bevest.', 'bew.', - 'bijbl.', 'ind.', 'eig.', 'bijbl.n.bijdr.', 'bijl.', - 'bijv.', 'bijw.', 'bijz.decr.', 'bin.b.', 'bkh.', 'bl.', - 'blz.', 'bm.', 'bn.', 'rh.', 'bnw.', 'bouwr.', 'br.parl.', - 'bs.', 'bull.', 'bull.adm.pénit.', 'bull.ass.', - 'bull.b.m.m.', 'bull.bel.', 'bull.best.strafinr.', - 'bull.bmm.', 'bull.c.b.n.', 'bull.c.n.c.', 'bull.cbn.', - 'bull.centr.arb.', 'bull.cnc.', 'bull.contr.', - 'bull.doc.min.fin.', 'bull.f.e.b.', 'bull.feb.', - 'bull.fisc.fin.r.', 'bull.i.u.m.', - 'bull.inf.ass.secr.soc.', 'bull.inf.i.e.c.', - 
'bull.inf.i.n.a.m.i.', 'bull.inf.i.r.e.', 'bull.inf.iec.', - 'bull.inf.inami.', 'bull.inf.ire.', 'bull.inst.arb.', - 'bull.ium.', 'bull.jur.imm.', 'bull.lég.b.', 'bull.off.', - 'bull.trim.b.dr.comp.', 'bull.us.', 'bull.v.b.o.', - 'bull.vbo.', 'bv.', 'bw.', 'bxh.', 'byz.', 'c.', 'c.a.', - 'c.a.-a.', 'c.a.b.g.', 'c.c.', 'c.c.i.', 'c.c.s.', - 'c.conc.jur.', 'c.d.e.', 'c.d.p.k.', 'c.e.', 'c.ex.', - 'c.f.', 'c.h.a.', 'c.i.f.', 'c.i.f.i.c.', 'c.j.', 'c.l.', - 'c.n.', 'c.o.d.', 'c.p.', 'c.pr.civ.', 'c.q.', 'c.r.', - 'c.r.a.', 'c.s.', 'c.s.a.', 'c.s.q.n.', 'c.v.', 'c.v.a.', - 'c.v.o.', 'ca.', 'cadeaust.', 'cah.const.', - 'cah.dr.europ.', 'cah.dr.immo.', 'cah.dr.jud.', 'cal.', - '2d.', 'cal.', '3e.', 'cal.', 'rprt.', 'cap.', 'carg.', - 'cass.', 'cass.', 'verw.', 'cert.', 'cf.', 'ch.', 'chron.', - 'chron.d.s.', 'chron.dr.not.', 'cie.', 'cie.', - 'verz.schr.', 'cir.', 'circ.', 'circ.z.', 'cit.', - 'cit.loc.', 'civ.', 'cl.et.b.', 'cmt.', 'co.', - 'cognoss.v.', 'coll.', 'v.', 'b.', 'colp.w.', 'com.', - 'com.', 'cas.', 'com.v.min.', 'comm.', 'comm.', 'v.', - 'comm.bijz.ov.', 'comm.erf.', 'comm.fin.', 'comm.ger.', - 'comm.handel.', 'comm.pers.', 'comm.pub.', 'comm.straf.', - 'comm.v.', 'comm.venn.', 'comm.verz.', 'comm.voor.', - 'comp.', 'compt.w.', 'computerr.', 'con.m.', 'concl.', - 'concr.', 'conf.', 'confl.w.', 'confl.w.huwbetr.', 'cons.', - 'conv.', 'coöp.', 'ver.', 'corr.', 'corr.bl.', - 'cour.fisc.', 'cour.immo.', 'cridon.', 'crim.', 'cur.', - 'cur.', 'crt.', 'curs.', 'd.', 'd.-g.', 'd.a.', 'd.a.v.', - 'd.b.f.', 'd.c.', 'd.c.c.r.', 'd.d.', 'd.d.p.', 'd.e.t.', - 'd.gem.r.', 'd.h.', 'd.h.z.', 'd.i.', 'd.i.t.', 'd.j.', - 'd.l.r.', 'd.m.', 'd.m.v.', 'd.o.v.', 'd.parl.', 'd.w.z.', - 'dact.', 'dat.', 'dbesch.', 'dbesl.', 'decr.', 'decr.d.', - 'decr.fr.', 'decr.vl.', 'decr.w.', 'def.', 'dep.opv.', - 'dep.rtl.', 'derg.', 'desp.', 'det.mag.', 'deurw.regl.', - 'dez.', 'dgl.', 'dhr.', 'disp.', 'diss.', 'div.', - 'div.act.', 'div.bel.', 'dl.', 'dln.', 'dnotz.', 'doc.', - 'hist.', 'doc.jur.b.', 'doc.min.fin.', 'doc.parl.', - 'doctr.', 'dpl.', 'dpl.besl.', 'dr.', 'dr.banc.fin.', - 'dr.circ.', 'dr.inform.', 'dr.mr.', 'dr.pén.entr.', - 'dr.q.m.', 'drs.', 'dtp.', 'dwz.', 'dyn.', 'e.', 'e.a.', - 'e.b.', 'tek.mod.', 'e.c.', 'e.c.a.', 'e.d.', 'e.e.', - 'e.e.a.', 'e.e.g.', 'e.g.', 'e.g.a.', 'e.h.a.', 'e.i.', - 'e.j.', 'e.m.a.', 'e.n.a.c.', 'e.o.', 'e.p.c.', 'e.r.c.', - 'e.r.f.', 'e.r.h.', 'e.r.o.', 'e.r.p.', 'e.r.v.', - 'e.s.r.a.', 'e.s.t.', 'e.v.', 'e.v.a.', 'e.w.', 'e&o.e.', - 'ec.pol.r.', 'econ.', 'ed.', 'ed(s).', 'eff.', 'eig.', - 'eig.mag.', 'eil.', 'elektr.', 'enmb.', 'enz.', 'err.', - 'etc.', 'etq.', 'eur.', 'parl.', 'eur.t.s.', 'ev.', 'evt.', - 'ex.', 'ex.crim.', 'exec.', 'f.', 'f.a.o.', 'f.a.q.', - 'f.a.s.', 'f.i.b.', 'f.j.f.', 'f.o.b.', 'f.o.r.', 'f.o.s.', - 'f.o.t.', 'f.r.', 'f.supp.', 'f.suppl.', 'fa.', 'facs.', - 'fasc.', 'fg.', 'fid.ber.', 'fig.', 'fin.verh.w.', 'fisc.', - 'fisc.', 'tijdschr.', 'fisc.act.', 'fisc.koer.', 'fl.', - 'form.', 'foro.', 'it.', 'fr.', 'fr.cult.r.', 'fr.gem.r.', - 'fr.parl.', 'fra.', 'ft.', 'g.', 'g.a.', 'g.a.v.', - 'g.a.w.v.', 'g.g.d.', 'g.m.t.', 'g.o.', 'g.omt.e.', 'g.p.', - 'g.s.', 'g.v.', 'g.w.w.', 'geb.', 'gebr.', 'gebrs.', - 'gec.', 'gec.decr.', 'ged.', 'ged.st.', 'gedipl.', - 'gedr.st.', 'geh.', 'gem.', 'gem.', 'gem.', - 'gem.gem.comm.', 'gem.st.', 'gem.stem.', 'gem.w.', - 'gemeensch.optr.', 'gemeensch.standp.', 'gemeensch.strat.', - 'gemeent.', 'gemeent.b.', 'gemeent.regl.', - 'gemeent.verord.', 'geol.', 'geopp.', 'gepubl.', - 'ger.deurw.', 'ger.w.', 'gerekw.', 'gereq.', 
'gesch.', - 'get.', 'getr.', 'gev.m.', 'gev.maatr.', 'gew.', 'ghert.', - 'gir.eff.verk.', 'gk.', 'gr.', 'gramm.', 'grat.w.', - 'grootb.w.', 'grs.', 'grvm.', 'grw.', 'gst.', 'gw.', - 'h.a.', 'h.a.v.o.', 'h.b.o.', 'h.e.a.o.', 'h.e.g.a.', - 'h.e.geb.', 'h.e.gestr.', 'h.l.', 'h.m.', 'h.o.', 'h.r.', - 'h.t.l.', 'h.t.m.', 'h.w.geb.', 'hand.', 'handelsn.w.', - 'handelspr.', 'handelsr.w.', 'handelsreg.w.', 'handv.', - 'harv.l.rev.', 'hc.', 'herald.', 'hert.', 'herz.', - 'hfdst.', 'hfst.', 'hgrw.', 'hhr.', 'hist.', 'hooggel.', - 'hoogl.', 'hosp.', 'hpw.', 'hr.', 'hr.', 'ms.', 'hr.ms.', - 'hregw.', 'hrg.', 'hst.', 'huis.just.', 'huisv.w.', - 'huurbl.', 'hv.vn.', 'hw.', 'hyp.w.', 'i.b.s.', 'i.c.', - 'i.c.m.h.', 'i.e.', 'i.f.', 'i.f.p.', 'i.g.v.', 'i.h.', - 'i.h.a.', 'i.h.b.', 'i.l.pr.', 'i.o.', 'i.p.o.', 'i.p.r.', - 'i.p.v.', 'i.pl.v.', 'i.r.d.i.', 'i.s.m.', 'i.t.t.', - 'i.v.', 'i.v.m.', 'i.v.s.', 'i.w.tr.', 'i.z.', 'ib.', - 'ibid.', 'icip-ing.cons.', 'iem.', 'indic.soc.', 'indiv.', - 'inf.', 'inf.i.d.a.c.', 'inf.idac.', 'inf.r.i.z.i.v.', - 'inf.riziv.', 'inf.soc.secr.', 'ing.', 'ing.', 'cons.', - 'ing.cons.', 'inst.', 'int.', 'int.', 'rechtsh.', - 'strafz.', 'interm.', 'intern.fisc.act.', - 'intern.vervoerr.', 'inv.', 'inv.', 'f.', 'inv.w.', - 'inv.wet.', 'invord.w.', 'inz.', 'ir.', 'irspr.', 'iwtr.', - 'j.', 'j.-cl.', 'j.c.b.', 'j.c.e.', 'j.c.fl.', 'j.c.j.', - 'j.c.p.', 'j.d.e.', 'j.d.f.', 'j.d.s.c.', 'j.dr.jeun.', - 'j.j.d.', 'j.j.p.', 'j.j.pol.', 'j.l.', 'j.l.m.b.', - 'j.l.o.', 'j.p.a.', 'j.r.s.', 'j.t.', 'j.t.d.e.', - 'j.t.dr.eur.', 'j.t.o.', 'j.t.t.', 'jaarl.', 'jb.hand.', - 'jb.kred.', 'jb.kred.c.s.', 'jb.l.r.b.', 'jb.lrb.', - 'jb.markt.', 'jb.mens.', 'jb.t.r.d.', 'jb.trd.', - 'jeugdrb.', 'jeugdwerkg.w.', 'jg.', 'jis.', 'jl.', - 'journ.jur.', 'journ.prat.dr.fisc.fin.', 'journ.proc.', - 'jrg.', 'jur.', 'jur.comm.fl.', 'jur.dr.soc.b.l.n.', - 'jur.f.p.e.', 'jur.fpe.', 'jur.niv.', 'jur.trav.brux.', - 'jurambt.', 'jv.cass.', 'jv.h.r.j.', 'jv.hrj.', 'jw.', - 'k.', 'k.', 'k.b.', 'k.g.', 'k.k.', 'k.m.b.o.', 'k.o.o.', - 'k.v.k.', 'k.v.v.v.', 'kadasterw.', 'kaderb.', 'kador.', - 'kbo-nr.', 'kg.', 'kh.', 'kiesw.', 'kind.bes.v.', 'kkr.', - 'koopv.', 'kr.', 'krankz.w.', 'ksbel.', 'kt.', 'ktg.', - 'ktr.', 'kvdm.', 'kw.r.', 'kymr.', 'kzr.', 'kzw.', 'l.', - 'l.b.', 'l.b.o.', 'l.bas.', 'l.c.', 'l.gew.', 'l.j.', - 'l.k.', 'l.l.', 'l.o.', 'l.r.b.', 'l.u.v.i.', 'l.v.r.', - 'l.v.w.', 'l.w.', "l'exp.-compt.b..", 'l’exp.-compt.b.', - 'landinr.w.', 'landscrt.', 'lat.', 'law.ed.', 'lett.', - 'levensverz.', 'lgrs.', 'lidw.', 'limb.rechtsl.', 'lit.', - 'litt.', 'liw.', 'liwet.', 'lk.', 'll.', 'll.(l.)l.r.', - 'loonw.', 'losbl.', 'ltd.', 'luchtv.', 'luchtv.w.', 'm.', - 'm.', 'not.', 'm.a.v.o.', 'm.a.w.', 'm.b.', 'm.b.o.', - 'm.b.r.', 'm.b.t.', 'm.d.g.o.', 'm.e.a.o.', 'm.e.r.', - 'm.h.', 'm.h.d.', 'm.i.v.', 'm.j.t.', 'm.k.', 'm.m.', - 'm.m.a.', 'm.m.h.h.', 'm.m.v.', 'm.n.', 'm.not.fisc.', - 'm.nt.', 'm.o.', 'm.r.', 'm.s.a.', 'm.u.p.', 'm.v.a.', - 'm.v.h.n.', 'm.v.t.', 'm.z.', 'maatr.teboekgest.luchtv.', - 'maced.', 'mand.', 'max.', 'mbl.not.', 'me.', 'med.', - 'med.', 'v.b.o.', 'med.b.u.f.r.', 'med.bufr.', 'med.vbo.', - 'meerv.', 'meetbr.w.', 'mém.adm.', 'mgr.', 'mgrs.', 'mhd.', - 'mi.verantw.', 'mil.', 'mil.bed.', 'mil.ger.', 'min.', - 'min.', 'aanbev.', 'min.', 'circ.', 'min.', 'fin.', - 'min.j.omz.', 'min.just.circ.', 'mitt.', 'mnd.', 'mod.', - 'mon.', 'mouv.comm.', 'mr.', 'ms.', 'muz.', 'mv.', 'n.', - 'chr.', 'n.a.', 'n.a.g.', 'n.a.v.', 'n.b.', 'n.c.', - 'n.chr.', 'n.d.', 'n.d.r.', 'n.e.a.', 'n.g.', 'n.h.b.c.', - 
'n.j.', 'n.j.b.', 'n.j.w.', 'n.l.', 'n.m.', 'n.m.m.', - 'n.n.', 'n.n.b.', 'n.n.g.', 'n.n.k.', 'n.o.m.', 'n.o.t.k.', - 'n.rapp.', 'n.tijd.pol.', 'n.v.', 'n.v.d.r.', 'n.v.d.v.', - 'n.v.o.b.', 'n.v.t.', 'nat.besch.w.', 'nat.omb.', - 'nat.pers.', 'ned.cult.r.', 'neg.verkl.', 'nhd.', 'wisk.', - 'njcm-bull.', 'nl.', 'nnd.', 'no.', 'not.fisc.m.', - 'not.w.', 'not.wet.', 'nr.', 'nrs.', 'nste.', 'nt.', - 'numism.', 'o.', 'o.a.', 'o.b.', 'o.c.', 'o.g.', 'o.g.v.', - 'o.i.', 'o.i.d.', 'o.m.', 'o.o.', 'o.o.d.', 'o.o.v.', - 'o.p.', 'o.r.', 'o.regl.', 'o.s.', 'o.t.s.', 'o.t.t.', - 'o.t.t.t.', 'o.t.t.z.', 'o.tk.t.', 'o.v.t.', 'o.v.t.t.', - 'o.v.tk.t.', 'o.v.v.', 'ob.', 'obsv.', 'octr.', - 'octr.gem.regl.', 'octr.regl.', 'oe.', 'off.pol.', 'ofra.', - 'ohd.', 'omb.', 'omnil.', 'omz.', 'on.ww.', 'onderr.', - 'onfrank.', 'onteig.w.', 'ontw.', 'b.w.', 'onuitg.', - 'onz.', 'oorl.w.', 'op.cit.', 'opin.pa.', 'opm.', 'or.', - 'ord.br.', 'ord.gem.', 'ors.', 'orth.', 'os.', 'osm.', - 'ov.', 'ov.w.i.', 'ov.w.ii.', 'ov.ww.', 'overg.w.', - 'overw.', 'ovkst.', 'oz.', 'p.', 'p.a.', 'p.a.o.', - 'p.b.o.', 'p.e.', 'p.g.', 'p.j.', 'p.m.', 'p.m.a.', 'p.o.', - 'p.o.j.t.', 'p.p.', 'p.v.', 'p.v.s.', 'pachtw.', 'pag.', - 'pan.', 'pand.b.', 'pand.pér.', 'parl.gesch.', - 'parl.gesch.', 'inv.', 'parl.st.', 'part.arb.', 'pas.', - 'pasin.', 'pat.', 'pb.c.', 'pb.l.', 'pens.', - 'pensioenverz.', 'per.ber.i.b.r.', 'per.ber.ibr.', 'pers.', - 'st.', 'pft.', 'pk.', 'pktg.', 'plv.', 'po.', 'pol.', - 'pol.off.', 'pol.r.', 'pol.w.', 'postbankw.', 'postw.', - 'pp.', 'pr.', 'preadv.', 'pres.', 'prf.', 'prft.', 'prg.', - 'prijz.w.', 'proc.', 'procesregl.', 'prof.', 'prot.', - 'prov.', 'prov.b.', 'prov.instr.h.m.g.', 'prov.regl.', - 'prov.verord.', 'prov.w.', 'publ.', 'pun.', 'pw.', - 'q.b.d.', 'q.e.d.', 'q.q.', 'q.r.', 'r.', 'r.a.b.g.', - 'r.a.c.e.', 'r.a.j.b.', 'r.b.d.c.', 'r.b.d.i.', 'r.b.s.s.', - 'r.c.', 'r.c.b.', 'r.c.d.c.', 'r.c.j.b.', 'r.c.s.j.', - 'r.cass.', 'r.d.c.', 'r.d.i.', 'r.d.i.d.c.', 'r.d.j.b.', - 'r.d.j.p.', 'r.d.p.c.', 'r.d.s.', 'r.d.t.i.', 'r.e.', - 'r.f.s.v.p.', 'r.g.a.r.', 'r.g.c.f.', 'r.g.d.c.', 'r.g.f.', - 'r.g.z.', 'r.h.a.', 'r.i.c.', 'r.i.d.a.', 'r.i.e.j.', - 'r.i.n.', 'r.i.s.a.', 'r.j.d.a.', 'r.j.i.', 'r.k.', 'r.l.', - 'r.l.g.b.', 'r.med.', 'r.med.rechtspr.', 'r.n.b.', 'r.o.', - 'r.ov.', 'r.p.', 'r.p.d.b.', 'r.p.o.t.', 'r.p.r.j.', - 'r.p.s.', 'r.r.d.', 'r.r.s.', 'r.s.', 'r.s.v.p.', - 'r.stvb.', 'r.t.d.f.', 'r.t.d.h.', 'r.t.l.', - 'r.trim.dr.eur.', 'r.v.a.', 'r.verkb.', 'r.w.', 'r.w.d.', - 'rap.ann.c.a.', 'rap.ann.c.c.', 'rap.ann.c.e.', - 'rap.ann.c.s.j.', 'rap.ann.ca.', 'rap.ann.cass.', - 'rap.ann.cc.', 'rap.ann.ce.', 'rap.ann.csj.', 'rapp.', - 'rb.', 'rb.kh.', 'rdn.', 'rdnr.', 're.pers.', 'rec.', - 'rec.c.i.j.', 'rec.c.j.c.e.', 'rec.cij.', 'rec.cjce.', - 'rec.gén.enr.not.', 'rechtsk.t.', 'rechtspl.zeem.', - 'rechtspr.arb.br.', 'rechtspr.b.f.e.', 'rechtspr.bfe.', - 'rechtspr.soc.r.b.l.n.', 'recl.reg.', 'rect.', 'red.', - 'reg.', 'reg.huiz.bew.', 'reg.w.', 'registr.w.', 'regl.', - 'regl.', 'r.v.k.', 'regl.besl.', 'regl.onderr.', - 'regl.r.t.', 'rep.', 'rép.fisc.', 'rép.not.', 'rep.r.j.', - 'rep.rj.', 'req.', 'res.', 'resp.', 'rev.', 'rev.', - 'comp.', 'rev.', 'trim.', 'civ.', 'rev.', 'trim.', 'comm.', - 'rev.acc.trav.', 'rev.adm.', 'rev.b.compt.', - 'rev.b.dr.const.', 'rev.b.dr.intern.', 'rev.b.séc.soc.', - 'rev.banc.fin.', 'rev.comm.', 'rev.cons.prud.', - 'rev.dr.b.', 'rev.dr.commun.', 'rev.dr.étr.', - 'rev.dr.fam.', 'rev.dr.intern.comp.', 'rev.dr.mil.', - 'rev.dr.min.', 'rev.dr.pén.', 'rev.dr.pén.mil.', - 'rev.dr.rur.', 
'rev.dr.u.l.b.', 'rev.dr.ulb.', 'rev.exp.', - 'rev.faill.', 'rev.fisc.', 'rev.gd.', 'rev.hist.dr.', - 'rev.i.p.c.', 'rev.ipc.', 'rev.not.b.', - 'rev.prat.dr.comm.', 'rev.prat.not.b.', 'rev.prat.soc.', - 'rev.rec.', 'rev.rw.', 'rev.trav.', 'rev.trim.d.h.', - 'rev.trim.dr.fam.', 'rev.urb.', 'richtl.', 'riv.dir.int.', - 'riv.dir.int.priv.proc.', 'rk.', 'rln.', 'roln.', 'rom.', - 'rondz.', 'rov.', 'rtl.', 'rubr.', 'ruilv.wet.', - 'rv.verdr.', 'rvkb.', 's.', 's.', 's.a.', 's.b.n.', - 's.ct.', 's.d.', 's.e.c.', 's.e.et.o.', 's.e.w.', - 's.exec.rept.', 's.hrg.', 's.j.b.', 's.l.', 's.l.e.a.', - 's.l.n.d.', 's.p.a.', 's.s.', 's.t.', 's.t.b.', 's.v.', - 's.v.p.', 'samenw.', 'sc.', 'sch.', 'scheidsr.uitspr.', - 'schepel.besl.', 'secr.comm.', 'secr.gen.', 'sect.soc.', - 'sess.', 'cas.', 'sir.', 'soc.', 'best.', 'soc.', 'handv.', - 'soc.', 'verz.', 'soc.act.', 'soc.best.', 'soc.kron.', - 'soc.r.', 'soc.sw.', 'soc.weg.', 'sofi-nr.', 'somm.', - 'somm.ann.', 'sp.c.c.', 'sr.', 'ss.', 'st.doc.b.c.n.a.r.', - 'st.doc.bcnar.', 'st.vw.', 'stagever.', 'stas.', 'stat.', - 'stb.', 'stbl.', 'stcrt.', 'stud.dipl.', 'su.', 'subs.', - 'subst.', 'succ.w.', 'suppl.', 'sv.', 'sw.', 't.', 't.a.', - 't.a.a.', 't.a.n.', 't.a.p.', 't.a.s.n.', 't.a.v.', - 't.a.v.w.', 't.aann.', 't.acc.', 't.agr.r.', 't.app.', - 't.b.b.r.', 't.b.h.', 't.b.m.', 't.b.o.', 't.b.p.', - 't.b.r.', 't.b.s.', 't.b.v.', 't.bankw.', 't.belg.not.', - 't.desk.', 't.e.m.', 't.e.p.', 't.f.r.', 't.fam.', - 't.fin.r.', 't.g.r.', 't.g.t.', 't.g.v.', 't.gem.', - 't.gez.', 't.huur.', 't.i.n.', 't.j.k.', 't.l.l.', - 't.l.v.', 't.m.', 't.m.r.', 't.m.w.', 't.mil.r.', - 't.mil.strafr.', 't.not.', 't.o.', 't.o.r.b.', 't.o.v.', - 't.ontv.', 't.p.r.', 't.pol.', 't.r.', 't.r.g.', - 't.r.o.s.', 't.r.v.', 't.s.r.', 't.strafr.', 't.t.', - 't.u.', 't.v.c.', 't.v.g.', 't.v.m.r.', 't.v.o.', 't.v.v.', - 't.v.v.d.b.', 't.v.w.', 't.verz.', 't.vred.', 't.vreemd.', - 't.w.', 't.w.k.', 't.w.v.', 't.w.v.r.', 't.wrr.', 't.z.', - 't.z.t.', 't.z.v.', 'taalk.', 'tar.burg.z.', 'td.', - 'techn.', 'telecomm.', 'toel.', 'toel.st.v.w.', 'toep.', - 'toep.regl.', 'tom.', 'top.', 'trans.b.', 'transp.r.', - 'trb.', 'trib.', 'trib.civ.', 'trib.gr.inst.', 'ts.', - 'ts.', 'best.', 'ts.', 'verv.', 'turnh.rechtsl.', 'tvpol.', - 'tvpr.', 'tvrechtsgesch.', 'tw.', 'u.', 'u.a.', 'u.a.r.', - 'u.a.v.', 'u.c.', 'u.c.c.', 'u.g.', 'u.p.', 'u.s.', - 'u.s.d.c.', 'uitdr.', 'uitl.w.', 'uitv.besch.div.b.', - 'uitv.besl.', 'uitv.besl.', 'succ.w.', 'uitv.besl.bel.rv.', - 'uitv.besl.l.b.', 'uitv.reg.', 'inv.w.', 'uitv.reg.bel.d.', - 'uitv.reg.afd.verm.', 'uitv.reg.lb.', 'uitv.reg.succ.w.', - 'univ.', 'univ.verkl.', 'v.', 'v.', 'chr.', 'v.a.', - 'v.a.v.', 'v.c.', 'v.chr.', 'v.h.', 'v.huw.verm.', 'v.i.', - 'v.i.o.', 'v.k.a.', 'v.m.', 'v.o.f.', 'v.o.n.', - 'v.onderh.verpl.', 'v.p.', 'v.r.', 'v.s.o.', 'v.t.t.', - 'v.t.t.t.', 'v.tk.t.', 'v.toep.r.vert.', 'v.v.b.', - 'v.v.g.', 'v.v.t.', 'v.v.t.t.', 'v.v.tk.t.', 'v.w.b.', - 'v.z.m.', 'vb.', 'vb.bo.', 'vbb.', 'vc.', 'vd.', 'veldw.', - 'ver.k.', 'ver.verg.gem.', 'gem.comm.', 'verbr.', 'verd.', - 'verdr.', 'verdr.v.', 'tek.mod.', 'verenw.', 'verg.', - 'verg.fr.gem.', 'comm.', 'verkl.', 'verkl.herz.gw.', - 'verl.', 'deelw.', 'vern.', 'verord.', 'vers.r.', - 'versch.', 'versl.c.s.w.', 'versl.csw.', 'vert.', 'verw.', - 'verz.', 'verz.w.', 'verz.wett.besl.', - 'verz.wett.decr.besl.', 'vgl.', 'vid.', 'viss.w.', - 'vl.parl.', 'vl.r.', 'vl.t.gez.', 'vl.w.reg.', - 'vl.w.succ.', 'vlg.', 'vn.', 'vnl.', 'vnw.', 'vo.', - 'vo.bl.', 'voegw.', 'vol.', 'volg.', 'volt.', 'deelw.', - 'voorl.', 
'voorz.', 'vord.w.', 'vorst.d.', 'vr.', 'vred.', - 'vrg.', 'vnw.', 'vrijgrs.', 'vs.', 'vt.', 'vw.', 'vz.', - 'vzngr.', 'vzr.', 'w.', 'w.a.', 'w.b.r.', 'w.c.h.', - 'w.conf.huw.', 'w.conf.huwelijksb.', 'w.consum.kr.', - 'w.f.r.', 'w.g.', 'w.gew.r.', 'w.ident.pl.', 'w.just.doc.', - 'w.kh.', 'w.l.r.', 'w.l.v.', 'w.mil.straf.spr.', 'w.n.', - 'w.not.ambt.', 'w.o.', 'w.o.d.huurcomm.', 'w.o.d.k.', - 'w.openb.manif.', 'w.parl.', 'w.r.', 'w.reg.', 'w.succ.', - 'w.u.b.', 'w.uitv.pl.verord.', 'w.v.', 'w.v.k.', - 'w.v.m.s.', 'w.v.r.', 'w.v.w.', 'w.venn.', 'wac.', 'wd.', - 'wetb.', 'n.v.h.', 'wgb.', 'winkelt.w.', 'wisk.', - 'wka-verkl.', 'wnd.', 'won.w.', 'woningw.', 'woonr.w.', - 'wrr.', 'wrr.ber.', 'wrsch.', 'ws.', 'wsch.', 'wsr.', - 'wtvb.', 'ww.', 'x.d.', 'z.a.', 'z.g.', 'z.i.', 'z.j.', - 'z.o.z.', 'z.p.', 'z.s.m.', 'zg.', 'zgn.', 'zn.', 'znw.', - 'zr.', 'zr.', 'ms.', 'zr.ms.'] -# fmt: on +abbrevs = [ + "a.2d.", + "a.a.", + "a.a.j.b.", + "a.f.t.", + "a.g.j.b.", + "a.h.v.", + "a.h.w.", + "a.hosp.", + "a.i.", + "a.j.b.", + "a.j.t.", + "a.m.", + "a.m.r.", + "a.p.m.", + "a.p.r.", + "a.p.t.", + "a.s.", + "a.t.d.f.", + "a.u.b.", + "a.v.a.", + "a.w.", + "aanbev.", + "aanbev.comm.", + "aant.", + "aanv.st.", + "aanw.", + "vnw.", + "aanw.vnw.", + "abd.", + "abm.", + "abs.", + "acc.act.", + "acc.bedr.m.", + "acc.bedr.t.", + "achterv.", + "act.dr.", + "act.dr.fam.", + "act.fisc.", + "act.soc.", + "adm.akk.", + "adm.besl.", + "adm.lex.", + "adm.onderr.", + "adm.ov.", + "adv.", + "adv.", + "gen.", + "adv.bl.", + "afd.", + "afl.", + "aggl.verord.", + "agr.", + "al.", + "alg.", + "alg.richts.", + "amén.", + "ann.dr.", + "ann.dr.lg.", + "ann.dr.sc.pol.", + "ann.ét.eur.", + "ann.fac.dr.lg.", + "ann.jur.créd.", + "ann.jur.créd.règl.coll.", + "ann.not.", + "ann.parl.", + "ann.prat.comm.", + "app.", + "arb.", + "aud.", + "arbbl.", + "arbh.", + "arbit.besl.", + "arbrb.", + "arr.", + "arr.cass.", + "arr.r.v.st.", + "arr.verbr.", + "arrondrb.", + "art.", + "artw.", + "aud.", + "b.", + "b.", + "b.&w.", + "b.a.", + "b.a.s.", + "b.b.o.", + "b.best.dep.", + "b.br.ex.", + "b.coll.fr.gem.comm.", + "b.coll.vl.gem.comm.", + "b.d.cult.r.", + "b.d.gem.ex.", + "b.d.gem.reg.", + "b.dep.", + "b.e.b.", + "b.f.r.", + "b.fr.gem.ex.", + "b.fr.gem.reg.", + "b.i.h.", + "b.inl.j.d.", + "b.inl.s.reg.", + "b.j.", + "b.l.", + "b.o.z.", + "b.prov.r.", + "b.r.h.", + "b.s.", + "b.sr.", + "b.stb.", + "b.t.i.r.", + "b.t.s.z.", + "b.t.w.rev.", + "b.v.", + "b.ver.coll.gem.gem.comm.", + "b.verg.r.b.", + "b.versl.", + "b.vl.ex.", + "b.voorl.reg.", + "b.w.", + "b.w.gew.ex.", + "b.z.d.g.", + "b.z.v.", + "bab.", + "bedr.org.", + "begins.", + "beheersov.", + "bekendm.comm.", + "bel.", + "bel.besch.", + "bel.w.p.", + "beleidsov.", + "belg.", + "grondw.", + "ber.", + "ber.w.", + "besch.", + "besl.", + "beslagr.", + "bestuurswet.", + "bet.", + "betr.", + "betr.", + "vnw.", + "bevest.", + "bew.", + "bijbl.", + "ind.", + "eig.", + "bijbl.n.bijdr.", + "bijl.", + "bijv.", + "bijw.", + "bijz.decr.", + "bin.b.", + "bkh.", + "bl.", + "blz.", + "bm.", + "bn.", + "rh.", + "bnw.", + "bouwr.", + "br.parl.", + "bs.", + "bull.", + "bull.adm.pénit.", + "bull.ass.", + "bull.b.m.m.", + "bull.bel.", + "bull.best.strafinr.", + "bull.bmm.", + "bull.c.b.n.", + "bull.c.n.c.", + "bull.cbn.", + "bull.centr.arb.", + "bull.cnc.", + "bull.contr.", + "bull.doc.min.fin.", + "bull.f.e.b.", + "bull.feb.", + "bull.fisc.fin.r.", + "bull.i.u.m.", + "bull.inf.ass.secr.soc.", + "bull.inf.i.e.c.", + "bull.inf.i.n.a.m.i.", + "bull.inf.i.r.e.", + "bull.inf.iec.", + "bull.inf.inami.", + 
"bull.inf.ire.", + "bull.inst.arb.", + "bull.ium.", + "bull.jur.imm.", + "bull.lég.b.", + "bull.off.", + "bull.trim.b.dr.comp.", + "bull.us.", + "bull.v.b.o.", + "bull.vbo.", + "bv.", + "bw.", + "bxh.", + "byz.", + "c.", + "c.a.", + "c.a.-a.", + "c.a.b.g.", + "c.c.", + "c.c.i.", + "c.c.s.", + "c.conc.jur.", + "c.d.e.", + "c.d.p.k.", + "c.e.", + "c.ex.", + "c.f.", + "c.h.a.", + "c.i.f.", + "c.i.f.i.c.", + "c.j.", + "c.l.", + "c.n.", + "c.o.d.", + "c.p.", + "c.pr.civ.", + "c.q.", + "c.r.", + "c.r.a.", + "c.s.", + "c.s.a.", + "c.s.q.n.", + "c.v.", + "c.v.a.", + "c.v.o.", + "ca.", + "cadeaust.", + "cah.const.", + "cah.dr.europ.", + "cah.dr.immo.", + "cah.dr.jud.", + "cal.", + "2d.", + "cal.", + "3e.", + "cal.", + "rprt.", + "cap.", + "carg.", + "cass.", + "cass.", + "verw.", + "cert.", + "cf.", + "ch.", + "chron.", + "chron.d.s.", + "chron.dr.not.", + "cie.", + "cie.", + "verz.schr.", + "cir.", + "circ.", + "circ.z.", + "cit.", + "cit.loc.", + "civ.", + "cl.et.b.", + "cmt.", + "co.", + "cognoss.v.", + "coll.", + "v.", + "b.", + "colp.w.", + "com.", + "com.", + "cas.", + "com.v.min.", + "comm.", + "comm.", + "v.", + "comm.bijz.ov.", + "comm.erf.", + "comm.fin.", + "comm.ger.", + "comm.handel.", + "comm.pers.", + "comm.pub.", + "comm.straf.", + "comm.v.", + "comm.venn.", + "comm.verz.", + "comm.voor.", + "comp.", + "compt.w.", + "computerr.", + "con.m.", + "concl.", + "concr.", + "conf.", + "confl.w.", + "confl.w.huwbetr.", + "cons.", + "conv.", + "coöp.", + "ver.", + "corr.", + "corr.bl.", + "cour.fisc.", + "cour.immo.", + "cridon.", + "crim.", + "cur.", + "cur.", + "crt.", + "curs.", + "d.", + "d.-g.", + "d.a.", + "d.a.v.", + "d.b.f.", + "d.c.", + "d.c.c.r.", + "d.d.", + "d.d.p.", + "d.e.t.", + "d.gem.r.", + "d.h.", + "d.h.z.", + "d.i.", + "d.i.t.", + "d.j.", + "d.l.r.", + "d.m.", + "d.m.v.", + "d.o.v.", + "d.parl.", + "d.w.z.", + "dact.", + "dat.", + "dbesch.", + "dbesl.", + "dec.", + "decr.", + "decr.d.", + "decr.fr.", + "decr.vl.", + "decr.w.", + "def.", + "dep.opv.", + "dep.rtl.", + "derg.", + "desp.", + "det.mag.", + "deurw.regl.", + "dez.", + "dgl.", + "dhr.", + "disp.", + "diss.", + "div.", + "div.act.", + "div.bel.", + "dl.", + "dln.", + "dnotz.", + "doc.", + "hist.", + "doc.jur.b.", + "doc.min.fin.", + "doc.parl.", + "doctr.", + "dpl.", + "dpl.besl.", + "dr.", + "dr.banc.fin.", + "dr.circ.", + "dr.inform.", + "dr.mr.", + "dr.pén.entr.", + "dr.q.m.", + "drs.", + "ds.", + "dtp.", + "dwz.", + "dyn.", + "e.", + "e.a.", + "e.b.", + "tek.mod.", + "e.c.", + "e.c.a.", + "e.d.", + "e.e.", + "e.e.a.", + "e.e.g.", + "e.g.", + "e.g.a.", + "e.h.a.", + "e.i.", + "e.j.", + "e.m.a.", + "e.n.a.c.", + "e.o.", + "e.p.c.", + "e.r.c.", + "e.r.f.", + "e.r.h.", + "e.r.o.", + "e.r.p.", + "e.r.v.", + "e.s.r.a.", + "e.s.t.", + "e.v.", + "e.v.a.", + "e.w.", + "e&o.e.", + "ec.pol.r.", + "econ.", + "ed.", + "ed(s).", + "eff.", + "eig.", + "eig.mag.", + "eil.", + "elektr.", + "enmb.", + "enz.", + "err.", + "etc.", + "etq.", + "eur.", + "parl.", + "eur.t.s.", + "ev.", + "evt.", + "ex.", + "ex.crim.", + "exec.", + "f.", + "f.a.o.", + "f.a.q.", + "f.a.s.", + "f.i.b.", + "f.j.f.", + "f.o.b.", + "f.o.r.", + "f.o.s.", + "f.o.t.", + "f.r.", + "f.supp.", + "f.suppl.", + "fa.", + "facs.", + "fasc.", + "fg.", + "fid.ber.", + "fig.", + "fin.verh.w.", + "fisc.", + "fisc.", + "tijdschr.", + "fisc.act.", + "fisc.koer.", + "fl.", + "form.", + "foro.", + "it.", + "fr.", + "fr.cult.r.", + "fr.gem.r.", + "fr.parl.", + "fra.", + "ft.", + "g.", + "g.a.", + "g.a.v.", + "g.a.w.v.", + "g.g.d.", + "g.m.t.", + "g.o.", + "g.omt.e.", + 
"g.p.", + "g.s.", + "g.v.", + "g.w.w.", + "geb.", + "gebr.", + "gebrs.", + "gec.", + "gec.decr.", + "ged.", + "ged.st.", + "gedipl.", + "gedr.st.", + "geh.", + "gem.", + "gem.", + "gem.", + "gem.gem.comm.", + "gem.st.", + "gem.stem.", + "gem.w.", + "gemeensch.optr.", + "gemeensch.standp.", + "gemeensch.strat.", + "gemeent.", + "gemeent.b.", + "gemeent.regl.", + "gemeent.verord.", + "geol.", + "geopp.", + "gepubl.", + "ger.deurw.", + "ger.w.", + "gerekw.", + "gereq.", + "gesch.", + "get.", + "getr.", + "gev.m.", + "gev.maatr.", + "gew.", + "ghert.", + "gir.eff.verk.", + "gk.", + "gr.", + "gramm.", + "grat.w.", + "grootb.w.", + "grs.", + "grvm.", + "grw.", + "gst.", + "gw.", + "h.a.", + "h.a.v.o.", + "h.b.o.", + "h.e.a.o.", + "h.e.g.a.", + "h.e.geb.", + "h.e.gestr.", + "h.l.", + "h.m.", + "h.o.", + "h.r.", + "h.t.l.", + "h.t.m.", + "h.w.geb.", + "hand.", + "handelsn.w.", + "handelspr.", + "handelsr.w.", + "handelsreg.w.", + "handv.", + "harv.l.rev.", + "hc.", + "herald.", + "hert.", + "herz.", + "hfdst.", + "hfst.", + "hgrw.", + "hhr.", + "hist.", + "hooggel.", + "hoogl.", + "hosp.", + "hpw.", + "hr.", + "hr.", + "ms.", + "hr.ms.", + "hregw.", + "hrg.", + "hst.", + "huis.just.", + "huisv.w.", + "huurbl.", + "hv.vn.", + "hw.", + "hyp.w.", + "i.b.s.", + "i.c.", + "i.c.m.h.", + "i.e.", + "i.f.", + "i.f.p.", + "i.g.v.", + "i.h.", + "i.h.a.", + "i.h.b.", + "i.l.pr.", + "i.o.", + "i.p.o.", + "i.p.r.", + "i.p.v.", + "i.pl.v.", + "i.r.d.i.", + "i.s.m.", + "i.t.t.", + "i.v.", + "i.v.m.", + "i.v.s.", + "i.w.tr.", + "i.z.", + "ib.", + "ibid.", + "icip-ing.cons.", + "iem.", + "inc.", + "indic.soc.", + "indiv.", + "inf.", + "inf.i.d.a.c.", + "inf.idac.", + "inf.r.i.z.i.v.", + "inf.riziv.", + "inf.soc.secr.", + "ing.", + "ing.", + "cons.", + "ing.cons.", + "inst.", + "int.", + "int.", + "rechtsh.", + "strafz.", + "interm.", + "intern.fisc.act.", + "intern.vervoerr.", + "inv.", + "inv.", + "f.", + "inv.w.", + "inv.wet.", + "invord.w.", + "inz.", + "ir.", + "irspr.", + "iwtr.", + "j.", + "j.-cl.", + "j.c.b.", + "j.c.e.", + "j.c.fl.", + "j.c.j.", + "j.c.p.", + "j.d.e.", + "j.d.f.", + "j.d.s.c.", + "j.dr.jeun.", + "j.j.d.", + "j.j.p.", + "j.j.pol.", + "j.l.", + "j.l.m.b.", + "j.l.o.", + "j.p.a.", + "j.r.s.", + "j.t.", + "j.t.d.e.", + "j.t.dr.eur.", + "j.t.o.", + "j.t.t.", + "jaarl.", + "jb.hand.", + "jb.kred.", + "jb.kred.c.s.", + "jb.l.r.b.", + "jb.lrb.", + "jb.markt.", + "jb.mens.", + "jb.t.r.d.", + "jb.trd.", + "jeugdrb.", + "jeugdwerkg.w.", + "jhr.", + "jg.", + "jis.", + "jl.", + "journ.jur.", + "journ.prat.dr.fisc.fin.", + "journ.proc.", + "jr.", + "jrg.", + "jur.", + "jur.comm.fl.", + "jur.dr.soc.b.l.n.", + "jur.f.p.e.", + "jur.fpe.", + "jur.niv.", + "jur.trav.brux.", + "jurambt.", + "jv.cass.", + "jv.h.r.j.", + "jv.hrj.", + "jw.", + "k.", + "k.", + "k.b.", + "k.g.", + "k.k.", + "k.m.b.o.", + "k.o.o.", + "k.v.k.", + "k.v.v.v.", + "kadasterw.", + "kaderb.", + "kador.", + "kbo-nr.", + "kg.", + "kh.", + "kiesw.", + "kind.bes.v.", + "kkr.", + "kon.", + "koopv.", + "kr.", + "krankz.w.", + "ksbel.", + "kt.", + "ktg.", + "ktr.", + "kvdm.", + "kw.r.", + "kymr.", + "kzr.", + "kzw.", + "l.", + "l.b.", + "l.b.o.", + "l.bas.", + "l.c.", + "l.gew.", + "l.j.", + "l.k.", + "l.l.", + "l.o.", + "l.p.", + "l.r.b.", + "l.u.v.i.", + "l.v.r.", + "l.v.w.", + "l.w.", + "l'exp.-compt.b..", + "l’exp.-compt.b.", + "landinr.w.", + "landscrt.", + "lat.", + "law.ed.", + "lett.", + "levensverz.", + "lgrs.", + "lidw.", + "limb.rechtsl.", + "lit.", + "litt.", + "liw.", + "liwet.", + "lk.", + "ll.", + "ll.(l.)l.r.", + "loonw.", + 
"losbl.", + "ltd.", + "luchtv.", + "luchtv.w.", + "m.", + "m.", + "not.", + "m.a.v.o.", + "m.a.w.", + "m.b.", + "m.b.o.", + "m.b.r.", + "m.b.t.", + "m.d.g.o.", + "m.e.a.o.", + "m.e.r.", + "m.h.", + "m.h.d.", + "m.i.v.", + "m.j.t.", + "m.k.", + "m.m.", + "m.m.a.", + "m.m.h.h.", + "m.m.v.", + "m.n.", + "m.not.fisc.", + "m.nt.", + "m.o.", + "m.r.", + "m.s.a.", + "m.u.p.", + "m.v.a.", + "m.v.h.n.", + "m.v.t.", + "m.z.", + "maatr.teboekgest.luchtv.", + "maced.", + "mand.", + "max.", + "mbl.not.", + "me.", + "med.", + "med.", + "v.b.o.", + "med.b.u.f.r.", + "med.bufr.", + "med.vbo.", + "meerv.", + "meetbr.w.", + "mej.", + "mevr.", + "mém.adm.", + "mgr.", + "mgrs.", + "mhd.", + "mi.verantw.", + "mil.", + "mil.bed.", + "mil.ger.", + "min.", + "min.", + "aanbev.", + "min.", + "circ.", + "min.", + "fin.", + "min.j.omz.", + "min.just.circ.", + "mitt.", + "mln.", + "mnd.", + "mod.", + "mon.", + "mouv.comm.", + "mr.", + "ms.", + "muz.", + "mv.", + "n.", + "chr.", + "n.a.", + "n.a.g.", + "n.a.v.", + "n.b.", + "n.c.", + "n.chr.", + "n.d.", + "n.d.r.", + "n.e.a.", + "n.g.", + "n.h.b.c.", + "n.j.", + "n.j.b.", + "n.j.w.", + "n.l.", + "n.m.", + "n.m.m.", + "n.n.", + "n.n.b.", + "n.n.g.", + "n.n.k.", + "n.o.m.", + "n.o.t.k.", + "n.rapp.", + "n.tijd.pol.", + "n.v.", + "n.v.d.r.", + "n.v.d.v.", + "n.v.o.b.", + "n.v.t.", + "nat.besch.w.", + "nat.omb.", + "nat.pers.", + "ned.", + "ned.cult.r.", + "neg.verkl.", + "nhd.", + "wisk.", + "njcm-bull.", + "nl.", + "nnd.", + "no.", + "not.fisc.m.", + "not.w.", + "not.wet.", + "nr.", + "nrs.", + "nste.", + "nt.", + "numism.", + "o.", + "o.a.", + "o.b.", + "o.c.", + "o.g.", + "o.g.v.", + "o.i.", + "o.i.d.", + "o.m.", + "o.o.", + "o.o.d.", + "o.o.v.", + "o.p.", + "o.r.", + "o.regl.", + "o.s.", + "o.t.s.", + "o.t.t.", + "o.t.t.t.", + "o.t.t.z.", + "o.tk.t.", + "o.v.t.", + "o.v.t.t.", + "o.v.tk.t.", + "o.v.v.", + "ob.", + "obsv.", + "octr.", + "octr.gem.regl.", + "octr.regl.", + "oe.", + "off.pol.", + "ofra.", + "ohd.", + "omb.", + "omnil.", + "omz.", + "on.ww.", + "onderr.", + "onfrank.", + "onteig.w.", + "ontw.", + "b.w.", + "onuitg.", + "onz.", + "oorl.w.", + "op.cit.", + "opin.pa.", + "opm.", + "or.", + "ord.br.", + "ord.gem.", + "ors.", + "orth.", + "os.", + "osm.", + "ov.", + "ov.w.i.", + "ov.w.ii.", + "ov.ww.", + "overg.w.", + "overw.", + "ovkst.", + "oz.", + "p.", + "p.a.", + "p.a.o.", + "p.b.o.", + "p.e.", + "p.g.", + "p.j.", + "p.m.", + "p.m.a.", + "p.o.", + "p.o.j.t.", + "p.p.", + "p.v.", + "p.v.s.", + "pachtw.", + "pag.", + "pan.", + "pand.b.", + "pand.pér.", + "parl.gesch.", + "parl.gesch.", + "inv.", + "parl.st.", + "part.arb.", + "pas.", + "pasin.", + "pat.", + "pb.c.", + "pb.l.", + "pct.", + "pens.", + "pensioenverz.", + "per.ber.i.b.r.", + "per.ber.ibr.", + "pers.", + "st.", + "pft.", + "pk.", + "pktg.", + "plv.", + "po.", + "pol.", + "pol.off.", + "pol.r.", + "pol.w.", + "postbankw.", + "postw.", + "pp.", + "pr.", + "preadv.", + "pres.", + "prf.", + "prft.", + "prg.", + "prijz.w.", + "proc.", + "procesregl.", + "prof.", + "prot.", + "prov.", + "prov.b.", + "prov.instr.h.m.g.", + "prov.regl.", + "prov.verord.", + "prov.w.", + "publ.", + "pun.", + "pw.", + "q.b.d.", + "q.e.d.", + "q.q.", + "q.r.", + "r.", + "r.a.b.g.", + "r.a.c.e.", + "r.a.j.b.", + "r.b.d.c.", + "r.b.d.i.", + "r.b.s.s.", + "r.c.", + "r.c.b.", + "r.c.d.c.", + "r.c.j.b.", + "r.c.s.j.", + "r.cass.", + "r.d.c.", + "r.d.i.", + "r.d.i.d.c.", + "r.d.j.b.", + "r.d.j.p.", + "r.d.p.c.", + "r.d.s.", + "r.d.t.i.", + "r.e.", + "r.f.s.v.p.", + "r.g.a.r.", + "r.g.c.f.", + "r.g.d.c.", + "r.g.f.", + "r.g.z.", 
+ "r.h.a.", + "r.i.c.", + "r.i.d.a.", + "r.i.e.j.", + "r.i.n.", + "r.i.s.a.", + "r.j.d.a.", + "r.j.i.", + "r.k.", + "r.l.", + "r.l.g.b.", + "r.med.", + "r.med.rechtspr.", + "r.n.b.", + "r.o.", + "r.ov.", + "r.p.", + "r.p.d.b.", + "r.p.o.t.", + "r.p.r.j.", + "r.p.s.", + "r.r.d.", + "r.r.s.", + "r.s.", + "r.s.v.p.", + "r.stvb.", + "r.t.d.f.", + "r.t.d.h.", + "r.t.l.", + "r.trim.dr.eur.", + "r.v.a.", + "r.verkb.", + "r.w.", + "r.w.d.", + "rap.ann.c.a.", + "rap.ann.c.c.", + "rap.ann.c.e.", + "rap.ann.c.s.j.", + "rap.ann.ca.", + "rap.ann.cass.", + "rap.ann.cc.", + "rap.ann.ce.", + "rap.ann.csj.", + "rapp.", + "rb.", + "rb.kh.", + "rdn.", + "rdnr.", + "re.pers.", + "rec.", + "rec.c.i.j.", + "rec.c.j.c.e.", + "rec.cij.", + "rec.cjce.", + "rec.gén.enr.not.", + "rechtsk.t.", + "rechtspl.zeem.", + "rechtspr.arb.br.", + "rechtspr.b.f.e.", + "rechtspr.bfe.", + "rechtspr.soc.r.b.l.n.", + "recl.reg.", + "rect.", + "red.", + "reg.", + "reg.huiz.bew.", + "reg.w.", + "registr.w.", + "regl.", + "regl.", + "r.v.k.", + "regl.besl.", + "regl.onderr.", + "regl.r.t.", + "rep.", + "rép.fisc.", + "rép.not.", + "rep.r.j.", + "rep.rj.", + "req.", + "res.", + "resp.", + "rev.", + "rev.", + "comp.", + "rev.", + "trim.", + "civ.", + "rev.", + "trim.", + "comm.", + "rev.acc.trav.", + "rev.adm.", + "rev.b.compt.", + "rev.b.dr.const.", + "rev.b.dr.intern.", + "rev.b.séc.soc.", + "rev.banc.fin.", + "rev.comm.", + "rev.cons.prud.", + "rev.dr.b.", + "rev.dr.commun.", + "rev.dr.étr.", + "rev.dr.fam.", + "rev.dr.intern.comp.", + "rev.dr.mil.", + "rev.dr.min.", + "rev.dr.pén.", + "rev.dr.pén.mil.", + "rev.dr.rur.", + "rev.dr.u.l.b.", + "rev.dr.ulb.", + "rev.exp.", + "rev.faill.", + "rev.fisc.", + "rev.gd.", + "rev.hist.dr.", + "rev.i.p.c.", + "rev.ipc.", + "rev.not.b.", + "rev.prat.dr.comm.", + "rev.prat.not.b.", + "rev.prat.soc.", + "rev.rec.", + "rev.rw.", + "rev.trav.", + "rev.trim.d.h.", + "rev.trim.dr.fam.", + "rev.urb.", + "richtl.", + "riv.dir.int.", + "riv.dir.int.priv.proc.", + "rk.", + "rln.", + "roln.", + "rom.", + "rondz.", + "rov.", + "rtl.", + "rubr.", + "ruilv.wet.", + "rv.verdr.", + "rvkb.", + "s.", + "s.", + "s.a.", + "s.b.n.", + "s.ct.", + "s.d.", + "s.e.c.", + "s.e.et.o.", + "s.e.w.", + "s.exec.rept.", + "s.hrg.", + "s.j.b.", + "s.l.", + "s.l.e.a.", + "s.l.n.d.", + "s.p.a.", + "s.s.", + "s.t.", + "s.t.b.", + "s.v.", + "s.v.p.", + "samenw.", + "sc.", + "sch.", + "scheidsr.uitspr.", + "schepel.besl.", + "sec.", + "secr.comm.", + "secr.gen.", + "sect.soc.", + "sess.", + "cas.", + "sir.", + "soc.", + "best.", + "soc.", + "handv.", + "soc.", + "verz.", + "soc.act.", + "soc.best.", + "soc.kron.", + "soc.r.", + "soc.sw.", + "soc.weg.", + "sofi-nr.", + "somm.", + "somm.ann.", + "sp.c.c.", + "sr.", + "ss.", + "st.doc.b.c.n.a.r.", + "st.doc.bcnar.", + "st.vw.", + "stagever.", + "stas.", + "stat.", + "stb.", + "stbl.", + "stcrt.", + "stud.dipl.", + "su.", + "subs.", + "subst.", + "succ.w.", + "suppl.", + "sv.", + "sw.", + "t.", + "t.a.", + "t.a.a.", + "t.a.n.", + "t.a.p.", + "t.a.s.n.", + "t.a.v.", + "t.a.v.w.", + "t.aann.", + "t.acc.", + "t.agr.r.", + "t.app.", + "t.b.b.r.", + "t.b.h.", + "t.b.m.", + "t.b.o.", + "t.b.p.", + "t.b.r.", + "t.b.s.", + "t.b.v.", + "t.bankw.", + "t.belg.not.", + "t.desk.", + "t.e.m.", + "t.e.p.", + "t.f.r.", + "t.fam.", + "t.fin.r.", + "t.g.r.", + "t.g.t.", + "t.g.v.", + "t.gem.", + "t.gez.", + "t.huur.", + "t.i.n.", + "t.j.k.", + "t.l.l.", + "t.l.v.", + "t.m.", + "t.m.r.", + "t.m.w.", + "t.mil.r.", + "t.mil.strafr.", + "t.not.", + "t.o.", + "t.o.r.b.", + "t.o.v.", + "t.ontv.", + "t.p.r.", + 
"t.pol.", + "t.r.", + "t.r.g.", + "t.r.o.s.", + "t.r.v.", + "t.s.r.", + "t.strafr.", + "t.t.", + "t.u.", + "t.v.c.", + "t.v.g.", + "t.v.m.r.", + "t.v.o.", + "t.v.v.", + "t.v.v.d.b.", + "t.v.w.", + "t.verz.", + "t.vred.", + "t.vreemd.", + "t.w.", + "t.w.k.", + "t.w.v.", + "t.w.v.r.", + "t.wrr.", + "t.z.", + "t.z.t.", + "t.z.v.", + "taalk.", + "tar.burg.z.", + "td.", + "techn.", + "telecomm.", + "th.", + "toel.", + "toel.st.v.w.", + "toep.", + "toep.regl.", + "tom.", + "top.", + "trans.b.", + "transp.r.", + "trb.", + "trib.", + "trib.civ.", + "trib.gr.inst.", + "ts.", + "ts.", + "best.", + "ts.", + "verv.", + "turnh.rechtsl.", + "tvpol.", + "tvpr.", + "tvrechtsgesch.", + "tw.", + "u.", + "u.a.", + "u.a.r.", + "u.a.v.", + "u.c.", + "u.c.c.", + "u.g.", + "u.p.", + "u.s.", + "u.s.d.c.", + "uitdr.", + "uitl.w.", + "uitv.besch.div.b.", + "uitv.besl.", + "uitv.besl.", + "succ.w.", + "uitv.besl.bel.rv.", + "uitv.besl.l.b.", + "uitv.reg.", + "inv.w.", + "uitv.reg.bel.d.", + "uitv.reg.afd.verm.", + "uitv.reg.lb.", + "uitv.reg.succ.w.", + "univ.", + "univ.verkl.", + "v.", + "v.", + "chr.", + "v.a.", + "v.a.v.", + "v.c.", + "v.C.", + "v.Chr.", + "v.chr.", + "v.d.", + "v.h.", + "v.huw.verm.", + "v.i.", + "v.i.o.", + "v.j.", + "v.k.a.", + "v.m.", + "v.o.f.", + "v.o.n.", + "v.onderh.verpl.", + "v.p.", + "v.r.", + "v.s.o.", + "v.t.t.", + "v.t.t.t.", + "v.tk.t.", + "v.toep.r.vert.", + "v.v.b.", + "v.v.g.", + "v.v.t.", + "v.v.t.t.", + "v.v.tk.t.", + "v.w.b.", + "v.z.m.", + "vb.", + "vb.bo.", + "vbb.", + "vc.", + "vd.", + "veldw.", + "ver.k.", + "ver.verg.gem.", + "gem.comm.", + "verbr.", + "verd.", + "verdr.", + "verdr.v.", + "tek.mod.", + "verenw.", + "verg.", + "verg.fr.gem.", + "comm.", + "verkl.", + "verkl.herz.gw.", + "verl.", + "deelw.", + "vern.", + "verord.", + "vers.r.", + "versch.", + "versl.c.s.w.", + "versl.csw.", + "vert.", + "verw.", + "verz.", + "verz.w.", + "verz.wett.besl.", + "verz.wett.decr.besl.", + "vgl.", + "vid.", + "viss.w.", + "vl.parl.", + "vl.r.", + "vl.t.gez.", + "vl.w.reg.", + "vl.w.succ.", + "vlg.", + "vn.", + "vnl.", + "vnw.", + "vo.", + "vo.bl.", + "voegw.", + "vol.", + "volg.", + "volt.", + "deelw.", + "voorl.", + "voorz.", + "vord.w.", + "vorst.d.", + "vr.", + "vred.", + "vrg.", + "vnw.", + "vrijgrs.", + "vs.", + "vt.", + "vw.", + "vz.", + "vzngr.", + "vzr.", + "w.", + "w.a.", + "w.b.r.", + "w.c.h.", + "w.conf.huw.", + "w.conf.huwelijksb.", + "w.consum.kr.", + "w.f.r.", + "w.g.", + "w.gew.r.", + "w.ident.pl.", + "w.just.doc.", + "w.kh.", + "w.l.r.", + "w.l.v.", + "w.mil.straf.spr.", + "w.n.", + "w.not.ambt.", + "w.o.", + "w.o.d.huurcomm.", + "w.o.d.k.", + "w.openb.manif.", + "w.parl.", + "w.r.", + "w.reg.", + "w.succ.", + "w.u.b.", + "w.uitv.pl.verord.", + "w.v.", + "w.v.k.", + "w.v.m.s.", + "w.v.r.", + "w.v.w.", + "w.venn.", + "wac.", + "wd.", + "wetb.", + "n.v.h.", + "wgb.", + "winkelt.w.", + "wisk.", + "wka-verkl.", + "wnd.", + "won.w.", + "woningw.", + "woonr.w.", + "wrr.", + "wrr.ber.", + "wrsch.", + "ws.", + "wsch.", + "wsr.", + "wtvb.", + "ww.", + "x.d.", + "z.a.", + "z.g.", + "z.i.", + "z.j.", + "z.o.z.", + "z.p.", + "z.s.m.", + "zg.", + "zgn.", + "zn.", + "znw.", + "zr.", + "zr.", + "ms.", + "zr.ms.", + "'m", + "'n", + "'ns", + "'s", + "'t", +] _exc = {} for orth in abbrevs: From e8be15e9b79ba66497b59947c31604b48793bde0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 6 Apr 2020 13:18:23 +0200 Subject: [PATCH 057/105] Improve tokenization for UD Spanish AnCora (#5253) --- spacy/lang/es/__init__.py | 3 ++ spacy/lang/es/punctuation.py | 48 
+++++++++++++++++++++++++++ spacy/lang/es/tokenizer_exceptions.py | 4 ++- 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 spacy/lang/es/punctuation.py diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 80cc1727c..249748a17 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -6,6 +6,7 @@ from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -23,6 +24,8 @@ class SpanishDefaults(Language.Defaults): ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS syntax_iterators = SYNTAX_ITERATORS diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py new file mode 100644 index 000000000..42335237c --- /dev/null +++ b/spacy/lang/es/punctuation.py @@ -0,0 +1,48 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES +from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA +from ..char_classes import merge_chars +from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES + + +_list_units = [u for u in LIST_UNITS if u != "%"] +_units = merge_chars(" ".join(_list_units)) +_concat_quotes = CONCAT_QUOTES + "—–" + + +_suffixes = ( + ["—", "–"] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=_concat_quotes, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=_concat_quotes + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + +TOKENIZER_SUFFIXES = _suffixes +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 9109d658b..2c2631086 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -43,14 +43,16 @@ for orth in [ "Av.", "Avda.", "Cía.", + "EE.UU.", "etc.", + "fig.", "Gob.", "Gral.", "Ing.", "J.C.", + "km/h", "Lic.", "m.n.", - "no.", "núm.", "P.D.", "Prof.", From c981aa66849f0e19688be746f8ecbe344e7578b7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 6 Apr 2020 13:19:04 +0200 Subject: [PATCH 058/105] Use inline flags in token_match patterns (#5257) * Use inline flags in token_match patterns Use inline flags in `token_match` patterns so that serializing does not lose the flag information. 
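To see concretely why inline flags survive serialization while compile-time flags do not, consider a minimal round trip through a pattern string (a sketch with hypothetical toy patterns, not the actual URL or French exception regexes):

    import re

    # Flags passed to re.compile() live on the compiled object only, so they
    # are lost when just the pattern string is stored and recompiled ...
    flagged = re.compile("dog|cat", re.IGNORECASE | re.UNICODE)
    assert re.compile(flagged.pattern).match("DOG") is None

    # ... whereas inline flags are part of the pattern string itself.
    inline = re.compile("(?iu)" + "|".join("(?:{})".format(m) for m in ["dog", "cat"]))
    assert re.compile(inline.pattern).match("DOG") is not None
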
* Modify inline flag * Modify inline flag --- spacy/lang/fr/tokenizer_exceptions.py | 2 +- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/tokenizer.pyx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index dfcb2756e..cb1702300 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN) TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( - "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE + "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp) ).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 385afb8bd..29ce75442 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -58,7 +58,7 @@ URL_PATTERN = ( # fmt: on ).strip() -TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match +TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4da081259..62b8bbf4a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -567,7 +567,7 @@ cdef class Tokenizer: )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): From fa760010a556bb15c76bb6a9bf77b6439de3adf0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 7 Apr 2020 12:04:51 +0200 Subject: [PATCH 059/105] Set rank for new vector in Vocab.set_vector (#5266) Set `Lexeme.rank` for vectors added with `Vocab.set_vector` so that the lexeme `ID` accessed by a model points the right row for the new vector. --- spacy/vocab.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3cf0095ee..8f95c567c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -406,9 +406,9 @@ cdef class Vocab: else: width = self.vectors.shape[1] self.vectors.resize((new_rows, width)) - lex = self[orth] # Adds words to vocab - self.vectors.add(orth, vector=vector) - self.vectors.add(orth, vector=vector) + lex = self[orth] # Add word to vocab if necessary + row = self.vectors.add(orth, vector=vector) + lex.rank = row def has_vector(self, orth): """Check whether a word has a vector. 
Returns False if no vectors have From 7ad0fcf01dd2ebc9fbf9ce5963d16f1f71d2b572 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 8 Apr 2020 12:58:09 +0200 Subject: [PATCH 060/105] fix json (#5267) --- website/meta/universe.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index bbd67e8a6..b5e1dbde0 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -14,8 +14,7 @@ "from whatlies.language import SpacyLanguage", "", "lang = SpacyLanguage('en_core_web_md')", - "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', ', - 'king', 'queen', 'doctor', 'nurse']", + "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', 'king', 'queen', 'doctor', 'nurse']", "", "emb = lang[words]", "emb.plot_interactive(x_axis='man', y_axis='woman')" From ae4af52ce7dd9dda0eb0f1b8eeb0cba7d20facdf Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 8 Apr 2020 12:58:39 +0200 Subject: [PATCH 061/105] Add ideographic stops to sentencizer (#5263) Add ideographic half- and fullwidth full stops to default sentencizer punctuation. --- spacy/pipeline/pipes.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a20c9b6df..f2a86d56e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1444,7 +1444,8 @@ class Sentencizer(object): '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', - '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈'] + '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', + '。', '。'] def __init__(self, punct_chars=None, **kwargs): """Initialize the sentencizer. From cf579a398d121617c0ab684d414af5f067677078 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 8 Apr 2020 20:03:06 +0200 Subject: [PATCH 062/105] Add __init__.py to eu and hy tests (#5278) --- spacy/tests/lang/eu/__init__.py | 0 spacy/tests/lang/hy/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 spacy/tests/lang/eu/__init__.py create mode 100644 spacy/tests/lang/hy/__init__.py diff --git a/spacy/tests/lang/eu/__init__.py b/spacy/tests/lang/eu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/hy/__init__.py b/spacy/tests/lang/hy/__init__.py new file mode 100644 index 000000000..e69de29bb From 8952effcc43b1694a4c0377904667a45b4ed1318 Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Thu, 9 Apr 2020 23:46:15 +1000 Subject: [PATCH 063/105] Fixed Typo in Warning (#5284) * Fixed typo in cli warning Fixed a typo in the warning for the provision of exactly two labels, which have not been designated as binary, to textcat. * Create and signed contributor form --- .github/contributors/umarbutler.md | 106 +++++++++++++++++++++++++++++ spacy/cli/train.py | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/umarbutler.md diff --git a/.github/contributors/umarbutler.md b/.github/contributors/umarbutler.md new file mode 100644 index 000000000..8df825152 --- /dev/null +++ b/.github/contributors/umarbutler.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Umar Butler | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-04-09 | +| GitHub username | umarbutler | +| Website (optional) | https://umarbutler.com | diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c94c26b62..8fc475d24 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -363,7 +363,7 @@ def train( if len(textcat_labels) == 2: msg.warn( "If the textcat component is a binary classifier with " - "exclusive classes, provide '--textcat_positive_label' for " + "exclusive classes, provide '--textcat-positive-label' for " "an evaluation on the positive class." ) msg.text( From 6a8a52650fcb3108f534480651c56e83b0f608fd Mon Sep 17 00:00:00 2001 From: Marek Grzenkowicz Date: Sat, 11 Apr 2020 23:35:01 +0200 Subject: [PATCH 064/105] [Closes #5292] Fix typo in option name "--n-save_every" (#5293) * Sign contributor agreement for chopeen * Fix typo in option name and close #5292 --- .github/contributors/chopeen.md | 106 ++++++++++++++++++++++++++++++++ website/docs/api/cli.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/chopeen.md diff --git a/.github/contributors/chopeen.md b/.github/contributors/chopeen.md new file mode 100644 index 000000000..d293c9845 --- /dev/null +++ b/.github/contributors/chopeen.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Marek Grzenkowicz | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020.04.10 | +| GitHub username | chopeen | +| Website (optional) | | diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index f067ba5a7..7101e3ddc 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -457,7 +457,7 @@ improvement. $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] -[--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save_every] +[--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save-every] [--init-tok2vec] [--epoch-start] ``` From a3965ec13da0b470cc45dfa12708a9eb327d6a94 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 14 Apr 2020 14:53:47 +0200 Subject: [PATCH 065/105] tag-map-path since 2.2.4 instead of 2.2.3 (#5289) --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7101e3ddc..15691c4f8 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -189,7 +189,7 @@ $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pi | `lang` | positional | Model language. | | `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | | `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--tag-map-path`, `-tm` 2.2.3 | option | Location of JSON-formatted tag map. | +| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | | `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | | `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | | `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. 
| From 8ce408d2e1b48ae6bea96934496bcd7507f4d75e Mon Sep 17 00:00:00 2001 From: Paolo Arduin Date: Tue, 14 Apr 2020 19:14:15 +0200 Subject: [PATCH 066/105] Comparison predicate handling for `!=` (#5282) * Fix #5281 * Optim test --- .github/contributors/paoloq.md | 106 ++++++++++++++++++++++++ spacy/matcher/matcher.pyx | 1 + spacy/tests/matcher/test_matcher_api.py | 19 ++++- 3 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 .github/contributors/paoloq.md diff --git a/.github/contributors/paoloq.md b/.github/contributors/paoloq.md new file mode 100644 index 000000000..84b28c8ef --- /dev/null +++ b/.github/contributors/paoloq.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Paolo Arduin | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 9 April 2020 | +| GitHub username | paoloq | +| Website (optional) | | diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 11461afb8..43480b46e 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -782,6 +782,7 @@ def _get_extra_predicates(spec, extra_predicates): "IN": _SetMemberPredicate, "NOT_IN": _SetMemberPredicate, "==": _ComparisonPredicate, + "!=": _ComparisonPredicate, ">=": _ComparisonPredicate, "<=": _ComparisonPredicate, ">": _ComparisonPredicate, diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c0314f3c3..2e5e64aac 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -265,14 +265,25 @@ def test_matcher_regex_shape(en_vocab): assert len(matches) == 0 -def test_matcher_compare_length(en_vocab): +@pytest.mark.parametrize( + "cmp, bad", + [ + ("==", ["a", "aaa"]), + ("!=", ["aa"]), + (">=", ["a"]), + ("<=", ["aaa"]), + (">", ["a", "aa"]), + ("<", ["aa", "aaa"]) + ] +) +def test_matcher_compare_length(en_vocab, cmp, bad): matcher = Matcher(en_vocab) - pattern = [{"LENGTH": {">=": 2}}] + pattern = [{"LENGTH": {cmp: 2}}] matcher.add("LENGTH_COMPARE", [pattern]) doc = Doc(en_vocab, words=["a", "aa", "aaa"]) matches = matcher(doc) - assert len(matches) == 2 - doc = Doc(en_vocab, words=["a"]) + assert len(matches) == len(doc) - len(bad) + doc = Doc(en_vocab, words=bad) matches = matcher(doc) assert len(matches) == 0 From 3d2c308906e2bde7ca57d2e8213252530b944502 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 14 Apr 2020 19:15:52 +0200 Subject: [PATCH 067/105] Add Doc init from list of words and text (#5251) * Add Doc init from list of words and text Add an option to initialize a `Doc` from a text and list of words where the words may or may not include all whitespace tokens. If the text and words are mismatched, raise an error. 
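A minimal usage sketch of the feature described above, mirroring the tests added below (it assumes only the `util.get_words_and_spaces` helper and the `Doc` keyword arguments that this patch introduces):

    from spacy.tokens import Doc
    from spacy.vocab import Vocab
    from spacy import util

    text = " 'dogs'\n\nrun "
    words = ["'", "dogs", "'", "run"]  # whitespace tokens may be omitted
    words, spaces = util.get_words_and_spaces(words, text)
    doc = Doc(Vocab(), words=words, spaces=spaces)
    assert doc.text == text  # the whitespace is recovered from the text
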
* Fix error code

* Remove all whitespace before aligning words/text

* Move words/text init to util function

* Update error message

* Rename to get_words_and_spaces

* Fix formatting
---
 spacy/errors.py                  |  1 +
 spacy/tests/doc/test_creation.py | 39 ++++++++++++++++++++++++++++++++
 spacy/util.py                    | 30 ++++++++++++++++++++++++
 3 files changed, 70 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index e0ddc86c5..ce26e63a4 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -555,6 +555,7 @@ class Errors(object):
     E193 = ("Unable to resize vectors in place if the resized vector dimension "
             "({new_dim}) is not the same as the current vector dimension "
             "({curr_dim}).")
+    E194 = ("Unable to align mismatched text '{text}' and words '{words}'.")


 @add_codes
diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index 120fb6e28..8f543e86a 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -6,6 +6,7 @@ from spacy.vocab import Vocab
 from spacy.tokens import Doc
 from spacy.lemmatizer import Lemmatizer
 from spacy.lookups import Lookups
+from spacy import util


 @pytest.fixture
@@ -38,3 +39,41 @@ def test_lookup_lemmatization(vocab):
     assert doc[0].lemma_ == "dog"
     assert doc[1].text == "dogses"
     assert doc[1].lemma_ == "dogses"
+
+
+def test_create_from_words_and_text(vocab):
+    # no whitespace in words
+    words = ["'", "dogs", "'", "run"]
+    text = " 'dogs'\n\nrun "
+    (words, spaces) = util.get_words_and_spaces(words, text)
+    doc = Doc(vocab, words=words, spaces=spaces)
+    assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
+    assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
+    assert doc.text == text
+    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+
+    # partial whitespace in words
+    words = [" ", "'", "dogs", "'", "\n\n", "run", " "]
+    text = " 'dogs'\n\nrun "
+    (words, spaces) = util.get_words_and_spaces(words, text)
+    doc = Doc(vocab, words=words, spaces=spaces)
+    assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
+    assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
+    assert doc.text == text
+    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+
+    # non-standard whitespace tokens
+    words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
+    text = " 'dogs'\n\nrun "
+    (words, spaces) = util.get_words_and_spaces(words, text)
+    doc = Doc(vocab, words=words, spaces=spaces)
+    assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
+    assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
+    assert doc.text == text
+    assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+
+    # mismatch between words and text
+    with pytest.raises(ValueError):
+        words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
+        text = " 'dogs'\n\nrun "
+        (words, spaces) = util.get_words_and_spaces(words + ["away"], text)
diff --git a/spacy/util.py b/spacy/util.py
index 9b96b2f5e..706fe303d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -755,6 +755,36 @@ def get_serialization_exclude(serializers, exclude, kwargs):
     return exclude


+def get_words_and_spaces(words, text):
+    if "".join("".join(words).split()) != "".join(text.split()):
+        raise ValueError(Errors.E194.format(text=text, words=words))
+    text_words = []
+    text_spaces = []
+    text_pos = 0
+    # normalize words to remove all whitespace tokens
+    norm_words =
[word for word in words if not word.isspace()]
+    # align words with text
+    for word in norm_words:
+        try:
+            word_start = text[text_pos:].index(word)
+        except ValueError:
+            raise ValueError(Errors.E194.format(text=text, words=words))
+        if word_start > 0:
+            text_words.append(text[text_pos:text_pos+word_start])
+            text_spaces.append(False)
+            text_pos += word_start
+        text_words.append(word)
+        text_spaces.append(False)
+        text_pos += len(word)
+        if text_pos < len(text) and text[text_pos] == " ":
+            text_spaces[-1] = True
+            text_pos += 1
+    if text_pos < len(text):
+        text_words.append(text[text_pos:])
+        text_spaces.append(False)
+    return (text_words, text_spaces)
+
+
 class SimpleFrozenDict(dict):
     """Simplified implementation of a frozen dict, mainly used as default
     function or method argument (for arguments that should default to empty

From 98c59027ed12131272b0aa46cdd89e378a13944b Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 15 Apr 2020 13:49:47 +0200
Subject: [PATCH 068/105] Use max(uint64) for OOV lexeme rank (#5303)

* Use max(uint64) for OOV lexeme rank

* Add test for default OOV rank

* Revert back to thinc==7.4.0

Requiring the updated version of thinc was unnecessary.

* Define OOV_RANK in one place

Define OOV_RANK in one place in `util`.

* Fix formatting [ci skip]

* Switch to external definitions of max(uint64)

Switch to external definitions of max(uint64) and confirm that they are
equal.
---
 spacy/_ml.py                             | 2 +-
 spacy/cli/init_model.py                  | 4 ++--
 spacy/lexeme.pxd                         | 1 +
 spacy/lexeme.pyx                         | 3 +++
 spacy/tests/vocab_vectors/test_lexeme.py | 9 +++++++++
 spacy/util.py                            | 2 ++
 spacy/vocab.pyx                          | 6 +++---
 7 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index ee7e59218..2a758accc 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -289,7 +289,7 @@ def link_vectors_to_models(vocab):
         if word.orth in vectors.key2row:
             word.rank = vectors.key2row[word.orth]
         else:
-            word.rank = 0
+            word.rank = util.OOV_RANK
     data = ops.asarray(vectors.data)
     # Set an entry here, so that vectors are accessed by StaticVectors
     # (unideal, I know)
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 3fa0cc890..0bdd4000e 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -16,7 +16,7 @@ from wasabi import msg

 from ..vectors import Vectors
 from ..errors import Errors, Warnings, user_warning
-from ..util import ensure_path, get_lang_class
+from ..util import ensure_path, get_lang_class, OOV_RANK

 try:
     import ftfy
@@ -148,7 +148,7 @@ def create_model(lang, lex_attrs, name=None):
     lang_class = get_lang_class(lang)
     nlp = lang_class()
     for lexeme in nlp.vocab:
-        lexeme.rank = 0
+        lexeme.rank = OOV_RANK
     lex_added = 0
     for attrs in lex_attrs:
         if "settings" in attrs:
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 048f8016e..f31733374 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -10,6 +10,7 @@ from numpy cimport ndarray


 cdef LexemeC EMPTY_LEXEME
+cdef attr_t OOV_RANK


 cdef class Lexeme:
     cdef LexemeC* c
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 5c981bc25..21644e37b 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -11,6 +11,7 @@ np.import_array()
 import numpy
 from thinc.neural.util import get_array_module

+from libc.stdint cimport UINT64_MAX
 from .typedefs cimport attr_t, flags_t
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
@@ -21,7 +22,9 @@ from .attrs import intify_attrs
 from .errors import Errors, Warnings, user_warning
+OOV_RANK = UINT64_MAX memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) +EMPTY_LEXEME.id = OOV_RANK cdef class Lexeme: diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index d84a56981..b57c6705a 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals import pytest +import numpy from spacy.attrs import IS_ALPHA, IS_DIGIT +from spacy.util import OOV_RANK @pytest.mark.parametrize("text1,prob1,text2,prob2", [("NOUN", -1, "opera", -2)]) @@ -69,3 +71,10 @@ def test_lexeme_bytes_roundtrip(en_vocab): assert one.orth == alpha.orth assert one.lower == alpha.lower assert one.lower_ == alpha.lower_ + + +def test_vocab_lexeme_oov_rank(en_vocab): + """Test that default rank is OOV_RANK.""" + lex = en_vocab["word"] + assert OOV_RANK == numpy.iinfo(numpy.uint64).max + assert lex.rank == OOV_RANK diff --git a/spacy/util.py b/spacy/util.py index 706fe303d..1c627af46 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -12,6 +12,7 @@ from thinc.neural.ops import NumpyOps import functools import itertools import numpy.random +import numpy import srsly import catalogue import sys @@ -34,6 +35,7 @@ from .errors import Errors, Warnings, deprecation_warning _data_path = Path(__file__).parent / "data" _PRINT_ENV = False +OOV_RANK = numpy.iinfo(numpy.uint64).max class registry(object): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 8f95c567c..0f3223025 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -7,7 +7,7 @@ import srsly from collections import OrderedDict from thinc.neural.util import get_array_module -from .lexeme cimport EMPTY_LEXEME +from .lexeme cimport EMPTY_LEXEME, OOV_RANK from .lexeme cimport Lexeme from .typedefs cimport attr_t from .tokens.token cimport Token @@ -165,9 +165,9 @@ cdef class Vocab: lex.orth = self.strings.add(string) lex.length = len(string) if self.vectors is not None: - lex.id = self.vectors.key2row.get(lex.orth, 0) + lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK) else: - lex.id = 0 + lex.id = OOV_RANK if self.lex_attr_getters is not None: for attr, func in self.lex_attr_getters.items(): value = func(string) From 1eef60c658e4aaa7b3ddb4dab2dac170ceea2c2c Mon Sep 17 00:00:00 2001 From: Thomas Thiebaud Date: Wed, 15 Apr 2020 13:50:46 +0200 Subject: [PATCH 069/105] Add spacy_fastlang to universe (#5271) * Add spacy_fastlang to universe * Sign SCA --- .github/contributors/thomasthiebaud.md | 106 +++++++++++++++++++++++++ website/meta/universe.json | 24 ++++++ 2 files changed, 130 insertions(+) create mode 100644 .github/contributors/thomasthiebaud.md diff --git a/.github/contributors/thomasthiebaud.md b/.github/contributors/thomasthiebaud.md new file mode 100644 index 000000000..bdbf0ec50 --- /dev/null +++ b/.github/contributors/thomasthiebaud.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, + object code, patch, tool, sample, graphic, specification, manual, + documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and + registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment + to any third party, you hereby grant to us a perpetual, irrevocable, + non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your + contribution. The rights that you grant to us under these terms are effective + on the date you first submitted a contribution to us, even if your submission + took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + - Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + - to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + - each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you
+  become aware of any circumstance which would make any of the foregoing
+  representations inaccurate in any respect. We may publicly disclose your
+  participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                         | Entry           |
+| ----------------------------- | --------------- |
+| Name                          | Thomas Thiebaud |
+| Company name (if applicable)  |                 |
+| Title or role (if applicable) |                 |
+| Date                          | 2020-04-07      |
+| GitHub username               | thomasthiebaud  |
+| Website (optional)            |                 |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index b5e1dbde0..8da96a026 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2093,6 +2093,30 @@
             "predict_output = clf.predict(predict_input)"
         ],
         "category": ["standalone"]
+    },
+    {
+        "id": "spacy_fastlang",
+        "title": "Spacy FastLang",
+        "slogan": "Language detection done fast",
+        "description": "Fast language detection using FastText and Spacy.",
+        "github": "thomasthiebaud/spacy-fastlang",
+        "pip": "spacy_fastlang",
+        "code_example": [
+            "import spacy",
+            "from spacy_fastlang import LanguageDetector",
+            "",
+            "nlp = spacy.load('en_core_web_sm')",
+            "nlp.add_pipe(LanguageDetector())",
+            "doc = nlp(\"Life is like a box of chocolates. You never know what you're gonna get.\")",
+            "",
+            "assert doc._.language == 'en'",
+            "assert doc._.language_score >= 0.8"
+        ],
+        "author": "Thomas Thiebaud",
+        "author_links": {
+            "github": "thomasthiebaud"
+        },
+        "category": ["pipeline"]
     }
 ],

From 1ca32d8f9c800eb36e912dc1fa7b173edf7f2c3c Mon Sep 17 00:00:00 2001
From: Paolo Arduin
Date: Wed, 15 Apr 2020 13:51:33 +0200
Subject: [PATCH 070/105] Matcher support for Span as well as Doc (#5113)

* Matcher support for Span, as well as Doc #5056

* Removes an import unused

* Signed contributors agreement

* Code optimization and better test

* Add error message for bad Matcher call argument

* Fix merging
---
 .github/contributors/paoloq.md          |  2 +-
 spacy/errors.py                         |  1 +
 spacy/matcher/matcher.pyx               | 36 ++++++++++++++-----------
 spacy/tests/matcher/test_matcher_api.py | 11 +++++++-
 4 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/.github/contributors/paoloq.md b/.github/contributors/paoloq.md
index 84b28c8ef..0fac70c9a 100644
--- a/.github/contributors/paoloq.md
+++ b/.github/contributors/paoloq.md
@@ -5,7 +5,7 @@ This spaCy Contributor Agreement (**"SCA"**) is based on the
 The SCA applies to any contribution that you make to any product or project
 managed by us (the **"project"**), and sets out the intellectual property rights
 you grant to us in the contributed materials. The term **"us"** shall mean
-[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
 **"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
diff --git a/spacy/errors.py b/spacy/errors.py
index ce26e63a4..b1cdb89ec 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -556,6 +556,7 @@ class Errors(object):
             "({new_dim}) is not the same as the current vector dimension "
             "({curr_dim}).")
     E194 = ("Unable to align mismatched text '{text}' and words '{words}'.")
+    E195 = ("Matcher can be called on {good} only, got {got}.")


 @add_codes
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 43480b46e..9e0fe2812 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -14,6 +14,7 @@ from ..typedefs cimport attr_t
 from ..structs cimport TokenC
 from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr
+from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA

@@ -211,22 +212,29 @@ cdef class Matcher:
         else:
             yield doc

-    def __call__(self, Doc doc):
+    def __call__(self, object doc_or_span):
         """Find all token sequences matching the supplied pattern.

-        doc (Doc): The document to match over.
+        doc_or_span (Doc or Span): The document to match over.
         RETURNS (list): A list of `(key, start, end)` tuples,
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `label_id` and `key` are both integers.
         """
+        if isinstance(doc_or_span, Doc):
+            doc = doc_or_span
+            length = len(doc)
+        elif isinstance(doc_or_span, Span):
+            doc = doc_or_span.doc
+            length = doc_or_span.end - doc_or_span.start
+        else:
+            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__))
         if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
           and not doc.is_tagged:
             raise ValueError(Errors.E155.format())
         if DEP in self._seen_attrs and not doc.is_parsed:
             raise ValueError(Errors.E156.format())
-        matches = find_matches(&self.patterns[0], self.patterns.size(), doc,
-                               extensions=self._extensions,
-                               predicates=self._extra_predicates)
+        matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length,
+                               extensions=self._extensions, predicates=self._extra_predicates)
         for i, (key, start, end) in enumerate(matches):
             on_match = self._callbacks.get(key, None)
             if on_match is not None:
@@ -248,9 +256,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
     return matcher

-
-cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
-                  predicates=tuple()):
+cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()):
     """Find matches in a doc, with a compiled array of patterns. Matches are
     returned as a list of (id, start, end) tuples.
@@ -268,18 +274,18 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, cdef int i, j, nr_extra_attr cdef Pool mem = Pool() output = [] - if doc.length == 0: + if length == 0: # avoid any processing or mem alloc if the document is empty return output if len(predicates) > 0: - predicate_cache = mem.alloc(doc.length * len(predicates), sizeof(char)) + predicate_cache = mem.alloc(length * len(predicates), sizeof(char)) if extensions is not None and len(extensions) >= 1: nr_extra_attr = max(extensions.values()) + 1 - extra_attr_values = mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t)) + extra_attr_values = mem.alloc(length * nr_extra_attr, sizeof(attr_t)) else: nr_extra_attr = 0 - extra_attr_values = mem.alloc(doc.length, sizeof(attr_t)) - for i, token in enumerate(doc): + extra_attr_values = mem.alloc(length, sizeof(attr_t)) + for i, token in enumerate(doc_or_span): for name, index in extensions.items(): value = token._.get(name) if isinstance(value, basestring): @@ -287,11 +293,11 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, extra_attr_values[i * nr_extra_attr + index] = value # Main loop cdef int nr_predicate = len(predicates) - for i in range(doc.length): + for i in range(length): for j in range(n): states.push_back(PatternStateC(patterns[j], i, 0)) transition_states(states, matches, predicate_cache, - doc[i], extra_attr_values, predicates) + doc_or_span[i], extra_attr_values, predicates) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 2e5e64aac..0295ada82 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,7 +6,6 @@ import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token - from ..doc.test_underscore import clean_underscore # noqa: F401 @@ -470,3 +469,13 @@ def test_matcher_callback(en_vocab): doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) matches = matcher(doc) mock.assert_called_once_with(matcher, doc, 0, matches) + + +def test_matcher_span(matcher): + text = "JavaScript is good but Java is better" + doc = Doc(matcher.vocab, words=text.split()) + span_js = doc[:3] + span_java = doc[4:] + assert len(matcher(doc)) == 2 + assert len(matcher(span_js)) == 1 + assert len(matcher(span_java)) == 1 From dac70f29eb3b1f21ae9e2c6346666bf6a46307b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Harinck?= Date: Thu, 16 Apr 2020 11:32:09 +0200 Subject: [PATCH 071/105] contrib: add contributor agreement for user sebastienharinck (#5316) --- .github/contributors/sebastienharinck.md | 106 +++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/sebastienharinck.md diff --git a/.github/contributors/sebastienharinck.md b/.github/contributors/sebastienharinck.md new file mode 100644 index 000000000..e0fddeba5 --- /dev/null +++ b/.github/contributors/sebastienharinck.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. 
The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of
+  authorship and you can legally grant the rights set out in this SCA;
+
+  * to the best of your knowledge, each contribution will not violate any
+  third party's copyrights, trademarks, patents, or other intellectual
+  property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and
+  other applicable export and import laws. You agree to notify us if you
+  become aware of any circumstance which would make any of the foregoing
+  representations inaccurate in any respect. We may publicly disclose your
+  participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+  * [ ] I am signing on behalf of myself as an individual and no other person
+  or entity, including my employer, has or will have rights with respect to my
+  contributions.
+
+  * [x] I am signing on behalf of my employer or a legal entity and I have the
+  actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                         | Entry                                      |
+| ----------------------------- | ------------------------------------------ |
+| Name                          | Sébastien Harinck                          |
+| Company name (if applicable)  | Odaxiom                                    |
+| Title or role (if applicable) | ML Engineer                                |
+| Date                          | 2020-04-15                                 |
+| GitHub username               | sebastienharinck                           |
+| Website (optional)            | [https://odaxiom.com](https://odaxiom.com) |
\ No newline at end of file

From 663333c3b2bad90915d1a48a626ca1275b7ef077 Mon Sep 17 00:00:00 2001
From: Jakob Jul Elben
Date: Thu, 16 Apr 2020 13:29:02 +0200
Subject: [PATCH 072/105] Fixes #5314 (#5315)

* Fix 5314

* Add contributor

* Resolve requested changes

Co-authored-by: Jakob Jul Elben
---
 .github/contributors/elben10                  | 106 ++++++++++++++++++
 .../wikipedia_processor.py                    |   6 +-
 spacy/tests/regression/test_issue5314.py      |  18 +++
 3 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 .github/contributors/elben10
 create mode 100644 spacy/tests/regression/test_issue5314.py

diff --git a/.github/contributors/elben10 b/.github/contributors/elben10
new file mode 100644
index 000000000..1eb4656dc
--- /dev/null
+++ b/.github/contributors/elben10
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1.
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+  or entity, including my employer, has or will have rights with respect to my
+  contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+  actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                         | Entry            |
+| ----------------------------- | ---------------- |
+| Name                          | Jakob Jul Elben  |
+| Company name (if applicable)  | N/A              |
+| Title or role (if applicable) | N/A              |
+| Date                          | April 16th, 2020 |
+| GitHub username               | elben10          |
+| Website (optional)            | N/A              |
diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py
index ed3c35c43..649d48fe5 100644
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ b/bin/wiki_entity_linking/wikipedia_processor.py
@@ -30,7 +30,8 @@ logger = logging.getLogger(__name__)

 title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
 id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
-text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text>)")
+text_tag_regex = re.compile(r"(?<=<text)[^>]*(?=>)")
+text_regex = re.compile(r"(?<=>).*(?=</text)")
diff --git a/spacy/tests/regression/test_issue5314.py b/spacy/tests/regression/test_issue5314.py
new file mode 100644
--- /dev/null
+++ b/spacy/tests/regression/test_issue5314.py
@@ -0,0 +1,18 @@
+import pytest
+
+from bin.wiki_entity_linking.wikipedia_processor import _process_wp_text
+
+old_format_text = """<text xml:space="preserve">[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn.</text>"""
+new_format_text = """<text bytes="11456" xml:space="preserve">[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn.</text>"""
+potential_future_format = """<text bytes="11456" xml:space="preserve" >[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn.</text>"""
+
+
+@pytest.mark.parametrize(
+    "text", [old_format_text, new_format_text, potential_future_format]
+)
+def test_issue5314(text):
+    title = "Arkæologi"
+    clean_text, _ = _process_wp_text(title, text, {})
+
+    expected_text = "Arkæologi er studiet af tidligere tiders menneskelige aktivitet, primært gennem studiet af menneskets materielle levn."
+ assert clean_text.strip() == expected_text From 068146d4ca2506a5d9a9f60ec8ad7e983d554ff9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 16 Apr 2020 14:45:25 +0200 Subject: [PATCH 073/105] Update netlify.toml [ci skip] --- netlify.toml | 62 ++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/netlify.toml b/netlify.toml index 45bd2c3b6..be809f1d4 100644 --- a/netlify.toml +++ b/netlify.toml @@ -7,42 +7,42 @@ redirects = [ {from = "https://alpha.spacy.io/*", to = "https://spacy.io", force = true}, {from = "http://alpha.spacy.io/*", to = "https://spacy.io", force = true}, # Old demos - {from = "/demos/*", to = "https://explosion.ai/demos/:splat"}, + {from = "/demos/*", to = "https://explosion.ai/demos/:splat", force = true}, # Old blog - {from = "/blog/*", to = "https://explosion.ai/blog/:splat"}, - {from = "/feed", to = "https://explosion.ai/feed"}, - {from = "/feed.xml", to = "https://explosion.ai/feed"}, + {from = "/blog/*", to = "https://explosion.ai/blog/:splat", force = true}, + {from = "/feed", to = "https://explosion.ai/feed", force = true}, + {from = "/feed.xml", to = "https://explosion.ai/feed", force = true}, # Old documentation pages (1.x) - {from = "/docs/usage/processing-text", to = "/usage/linguistic-features"}, - {from = "/docs/usage/deep-learning", to = "/usage/training"}, - {from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging"}, - {from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse"}, - {from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities"}, - {from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity"}, - {from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization"}, - {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines"}, - {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines"}, - {from = "/docs/usage/training-ner", to = "/usage/training#ner"}, - {from = "/docs/usage/tutorials", to = "/usage/examples"}, - {from = "/docs/usage/data-model", to = "/api"}, - {from = "/docs/usage/cli", to = "/api/cli"}, - {from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"}, - {from = "/docs/api/language-models", to = "/usage/models#languages"}, - {from = "/docs/api/spacy", to = "/docs/api/top-level"}, - {from = "/docs/api/displacy", to = "/api/top-level#displacy"}, - {from = "/docs/api/util", to = "/api/top-level#util"}, - {from = "/docs/api/features", to = "/models/#architecture"}, - {from = "/docs/api/philosophy", to = "/usage/spacy-101"}, - {from = "/docs/usage/showcase", to = "/universe"}, - {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom"}, - {from = "/tutorials", to = "/usage/examples"}, + {from = "/docs/usage/processing-text", to = "/usage/linguistic-features", force = true}, + {from = "/docs/usage/deep-learning", to = "/usage/training", force = true}, + {from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging", force = true}, + {from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse", force = true}, + {from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities", force = true}, + {from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity", force = true}, + {from = "/docs/usage/customizing-tokenizer", to = 
"/usage/linguistic-features#tokenization", force = true}, + {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true}, + {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true}, + {from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true}, + {from = "/docs/usage/tutorials", to = "/usage/examples", force = true}, + {from = "/docs/usage/data-model", to = "/api", force = true}, + {from = "/docs/usage/cli", to = "/api/cli", force = true}, + {from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true}, + {from = "/docs/api/language-models", to = "/usage/models#languages", force = true}, + {from = "/docs/api/spacy", to = "/docs/api/top-level", force = true}, + {from = "/docs/api/displacy", to = "/api/top-level#displacy", force = true}, + {from = "/docs/api/util", to = "/api/top-level#util", force = true}, + {from = "/docs/api/features", to = "/models/#architecture", force = true}, + {from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true}, + {from = "/docs/usage/showcase", to = "/universe", force = true}, + {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true}, + {from = "/tutorials", to = "/usage/examples", force = true}, # Rewrite all other docs pages to / {from = "/docs/*", to = "/:splat"}, # Updated documentation pages - {from = "/usage/resources", to = "/universe"}, - {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"}, - {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"}, - {from = "/models/comparison", to = "/models"}, + {from = "/usage/resources", to = "/universe", force = true}, + {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true}, + {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching", force = true}, + {from = "/models/comparison", to = "/models", force = true}, {from = "/api/#section-cython", to = "/api/cython", force = true}, {from = "/api/#cython", to = "/api/cython", force = true}, {from = "/api/sentencesegmenter", to="/api/sentencizer"}, From fb73d4943a91d18cd36ded98994a932515f4bf05 Mon Sep 17 00:00:00 2001 From: laszabine Date: Thu, 16 Apr 2020 20:00:18 +0200 Subject: [PATCH 074/105] Amend documentation to Language.evaluate (#5319) * Specified usage of arguments to Language.evaluate * Created contributor agreement --- .github/contributors/laszabine.md | 106 ++++++++++++++++++++++++++++++ website/docs/api/language.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/laszabine.md diff --git a/.github/contributors/laszabine.md b/.github/contributors/laszabine.md new file mode 100644 index 000000000..c1a4a3a6b --- /dev/null +++ b/.github/contributors/laszabine.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Sabine Laszakovits | +| Company name (if applicable) | Austrian Academy of Sciences | +| Title or role (if applicable) | Data analyst | +| Date | 2020-04-16 | +| GitHub username | laszabine | +| Website (optional) | https://sabine.laszakovits.net | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index d548a1f64..97dfbf100 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -136,7 +136,7 @@ Evaluate a model's pipeline components. | Name | Type | Description | | -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs_golds` | iterable | Tuples of `Doc` and `GoldParse` objects or `(text, annotations)` of raw text and a dict (see [simple training style](/usage/training#training-simple-style)). | +| `docs_golds` | iterable | Tuples of `Doc` and `GoldParse` objects, such that the `Doc` objects contain the predictions and the `GoldParse` objects the correct annotations. Alternatively, `(text, annotations)` tuples of raw text and a dict (see [simple training style](/usage/training#training-simple-style)). | | `verbose` | bool | Print debugging information. | | `batch_size` | int | The batch size to use. | | `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. 
| From f7471abd82c1cbf6dc42f299eb0237a174f86da5 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sat, 18 Apr 2020 17:01:53 +0200 Subject: [PATCH 075/105] Add pkuseg and serialization support for Chinese (#5308) * Add pkuseg and serialization support for Chinese Add support for pkuseg alongside jieba * Specify model through `Language` meta: * split on characters (if no word segmentation packages are installed) ``` Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": False}}}) ``` * jieba (remains the default tokenizer if installed) ``` Chinese() Chinese(meta={"tokenizer": {"config": {"use_jieba": True}}}) # explicit ``` * pkuseg ``` Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}}}) ``` * The new tokenizer setting `require_pkuseg` is used to override `use_jieba` default, which is intended for models that provide a pkuseg model: ``` nlp_pkuseg = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "default", "require_pkuseg": True}}}) nlp = Chinese() # has `use_jieba` as `True` by default nlp.from_bytes(nlp_pkuseg.to_bytes()) # `require_pkuseg` overrides `use_jieba` when calling the tokenizer ``` Add support for serialization of tokenizer settings and pkuseg model, if loaded * Add sorting for `Language.to_bytes()` serialization of `Language.meta` so that the (emptied, but still present) tokenizer metadata is in a consistent position in the serialized data Extend tests to cover all three tokenizer configurations and serialization * Fix from_disk and tests without jieba or pkuseg * Load cfg first and only show error if `use_pkuseg` * Fix blank/default initialization in serialization tests * Explicitly initialize jieba's cache on init * Add serialization for pkuseg pre/postprocessors * Reformat pkuseg install message --- spacy/lang/zh/__init__.py | 297 ++++++++++++++++++++++---- spacy/language.py | 2 +- spacy/tests/conftest.py | 16 +- spacy/tests/lang/zh/test_serialize.py | 38 ++++ spacy/tests/lang/zh/test_text.py | 4 +- spacy/tests/lang/zh/test_tokenizer.py | 34 ++- 6 files changed, 329 insertions(+), 62 deletions(-) create mode 100644 spacy/tests/lang/zh/test_serialize.py diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 8179b4551..2cf00d389 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,6 +1,10 @@ # coding: utf8 from __future__ import unicode_literals +import tempfile +import srsly +from pathlib import Path +from collections import OrderedDict from ...attrs import LANG from ...language import Language from ...tokens import Doc @@ -9,12 +13,19 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from .tag_map import TAG_MAP +from ... import util + + +_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python" def try_jieba_import(use_jieba): try: import jieba + # segment a short text to have jieba initialize its cache in advance + list(jieba.cut("作为", cut_all=False)) + return jieba except ImportError: if use_jieba: @@ -25,59 +36,241 @@ def try_jieba_import(use_jieba): raise ImportError(msg) +def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict): + try: + import pkuseg + + if pkuseg_model: + return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) + elif use_pkuseg: + msg = ( + "Chinese.use_pkuseg is True but no pkuseg model was specified. 
" + "Please provide the name of a pretrained model " + "or the path to a model with " + '`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).' + ) + raise ValueError(msg) + except ImportError: + if use_pkuseg: + msg = ( + "pkuseg not installed. Either set Chinese.use_pkuseg = False, " + "or " + _PKUSEG_INSTALL_MSG + ) + raise ImportError(msg) + except FileNotFoundError: + if use_pkuseg: + msg = "Unable to load pkuseg model from: " + pkuseg_model + raise FileNotFoundError(msg) + + class ChineseTokenizer(DummyTokenizer): - def __init__(self, cls, nlp=None): + def __init__(self, cls, nlp=None, config={}): + self.use_jieba = config.get("use_jieba", cls.use_jieba) + self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg) + self.require_pkuseg = config.get("require_pkuseg", False) self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.use_jieba = cls.use_jieba self.jieba_seg = try_jieba_import(self.use_jieba) + self.pkuseg_seg = try_pkuseg_import( + self.use_pkuseg, + pkuseg_model=config.get("pkuseg_model", None), + pkuseg_user_dict=config.get("pkuseg_user_dict", "default"), + ) + # remove relevant settings from config so they're not also saved in + # Language.meta + for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]: + if key in config: + del config[key] self.tokenizer = Language.Defaults().create_tokenizer(nlp) def __call__(self, text): - # use jieba - if self.use_jieba: - jieba_words = list( - [x for x in self.jieba_seg.cut(text, cut_all=False) if x] - ) - words = [jieba_words[0]] - spaces = [False] - for i in range(1, len(jieba_words)): - word = jieba_words[i] - if word.isspace(): - # second token in adjacent whitespace following a - # non-space token - if spaces[-1]: - words.append(word) - spaces.append(False) - # first space token following non-space token - elif word == " " and not words[-1].isspace(): - spaces[-1] = True - # token is non-space whitespace or any whitespace following - # a whitespace token - else: - # extend previous whitespace token with more whitespace - if words[-1].isspace(): - words[-1] += word - # otherwise it's a new whitespace token - else: - words.append(word) - spaces.append(False) - else: - words.append(word) - spaces.append(False) + use_jieba = self.use_jieba + use_pkuseg = self.use_pkuseg + if self.require_pkuseg: + use_jieba = False + use_pkuseg = True + if use_jieba: + words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) + (words, spaces) = util.get_words_and_spaces(words, text) + return Doc(self.vocab, words=words, spaces=spaces) + elif use_pkuseg: + words = self.pkuseg_seg.cut(text) + (words, spaces) = util.get_words_and_spaces(words, text) + return Doc(self.vocab, words=words, spaces=spaces) + else: + # split into individual characters + words = list(text) + (words, spaces) = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) - # split into individual characters - words = [] - spaces = [] - for token in self.tokenizer(text): - if token.text.isspace(): - words.append(token.text) - spaces.append(False) - else: - words.extend(list(token.text)) - spaces.extend([False] * len(token.text)) - spaces[-1] = bool(token.whitespace_) - return Doc(self.vocab, words=words, spaces=spaces) + def _get_config(self): + config = OrderedDict( + ( + ("use_jieba", self.use_jieba), + ("use_pkuseg", self.use_pkuseg), + ("require_pkuseg", self.require_pkuseg), + ) + ) + return config + + def _set_config(self, config={}): + self.use_jieba = 
config.get("use_jieba", False) + self.use_pkuseg = config.get("use_pkuseg", False) + self.require_pkuseg = config.get("require_pkuseg", False) + + def to_bytes(self, **kwargs): + pkuseg_features_b = b"" + pkuseg_weights_b = b"" + pkuseg_processors_data = None + if self.pkuseg_seg: + with tempfile.TemporaryDirectory() as tempdir: + self.pkuseg_seg.feature_extractor.save(tempdir) + self.pkuseg_seg.model.save(tempdir) + tempdir = Path(tempdir) + with open(tempdir / "features.pkl", "rb") as fileh: + pkuseg_features_b = fileh.read() + with open(tempdir / "weights.npz", "rb") as fileh: + pkuseg_weights_b = fileh.read() + pkuseg_processors_data = ( + _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie), + self.pkuseg_seg.postprocesser.do_process, + sorted(list(self.pkuseg_seg.postprocesser.common_words)), + sorted(list(self.pkuseg_seg.postprocesser.other_words)), + ) + serializers = OrderedDict( + ( + ("cfg", lambda: srsly.json_dumps(self._get_config())), + ("pkuseg_features", lambda: pkuseg_features_b), + ("pkuseg_weights", lambda: pkuseg_weights_b), + ( + "pkuseg_processors", + lambda: srsly.msgpack_dumps(pkuseg_processors_data), + ), + ) + ) + return util.to_bytes(serializers, []) + + def from_bytes(self, data, **kwargs): + pkuseg_features_b = b"" + pkuseg_weights_b = b"" + pkuseg_processors_data = None + + def deserialize_pkuseg_features(b): + nonlocal pkuseg_features_b + pkuseg_features_b = b + + def deserialize_pkuseg_weights(b): + nonlocal pkuseg_weights_b + pkuseg_weights_b = b + + def deserialize_pkuseg_processors(b): + nonlocal pkuseg_processors_data + pkuseg_processors_data = srsly.msgpack_loads(b) + + deserializers = OrderedDict( + ( + ("cfg", lambda b: self._set_config(srsly.json_loads(b))), + ("pkuseg_features", deserialize_pkuseg_features), + ("pkuseg_weights", deserialize_pkuseg_weights), + ("pkuseg_processors", deserialize_pkuseg_processors), + ) + ) + util.from_bytes(data, deserializers, []) + + if pkuseg_features_b and pkuseg_weights_b: + with tempfile.TemporaryDirectory() as tempdir: + tempdir = Path(tempdir) + with open(tempdir / "features.pkl", "wb") as fileh: + fileh.write(pkuseg_features_b) + with open(tempdir / "weights.npz", "wb") as fileh: + fileh.write(pkuseg_weights_b) + try: + import pkuseg + except ImportError: + raise ImportError( + "pkuseg not installed. 
To use this model, " + + _PKUSEG_INSTALL_MSG + ) + self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) + if pkuseg_processors_data: + ( + user_dict, + do_process, + common_words, + other_words, + ) = pkuseg_processors_data + self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) + self.pkuseg_seg.postprocesser.do_process = do_process + self.pkuseg_seg.postprocesser.common_words = set(common_words) + self.pkuseg_seg.postprocesser.other_words = set(other_words) + + return self + + def to_disk(self, path, **kwargs): + path = util.ensure_path(path) + + def save_pkuseg_model(path): + if self.pkuseg_seg: + if not path.exists(): + path.mkdir(parents=True) + self.pkuseg_seg.model.save(path) + self.pkuseg_seg.feature_extractor.save(path) + + def save_pkuseg_processors(path): + if self.pkuseg_seg: + data = ( + _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie), + self.pkuseg_seg.postprocesser.do_process, + sorted(list(self.pkuseg_seg.postprocesser.common_words)), + sorted(list(self.pkuseg_seg.postprocesser.other_words)), + ) + srsly.write_msgpack(path, data) + + serializers = OrderedDict( + ( + ("cfg", lambda p: srsly.write_json(p, self._get_config())), + ("pkuseg_model", lambda p: save_pkuseg_model(p)), + ("pkuseg_processors", lambda p: save_pkuseg_processors(p)), + ) + ) + return util.to_disk(path, serializers, []) + + def from_disk(self, path, **kwargs): + path = util.ensure_path(path) + + def load_pkuseg_model(path): + try: + import pkuseg + except ImportError: + if self.use_pkuseg: + raise ImportError( + "pkuseg not installed. To use this model, " + + _PKUSEG_INSTALL_MSG + ) + if path.exists(): + self.pkuseg_seg = pkuseg.pkuseg(path) + + def load_pkuseg_processors(path): + try: + import pkuseg + except ImportError: + if self.use_pkuseg: + raise ImportError(self._pkuseg_install_msg) + if self.pkuseg_seg: + data = srsly.read_msgpack(path) + (user_dict, do_process, common_words, other_words) = data + self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) + self.pkuseg_seg.postprocesser.do_process = do_process + self.pkuseg_seg.postprocesser.common_words = set(common_words) + self.pkuseg_seg.postprocesser.other_words = set(other_words) + + serializers = OrderedDict( + ( + ("cfg", lambda p: self._set_config(srsly.read_json(p))), + ("pkuseg_model", lambda p: load_pkuseg_model(p)), + ("pkuseg_processors", lambda p: load_pkuseg_processors(p)), + ) + ) + util.from_disk(path, serializers, []) class ChineseDefaults(Language.Defaults): @@ -89,10 +282,11 @@ class ChineseDefaults(Language.Defaults): tag_map = TAG_MAP writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} use_jieba = True + use_pkuseg = False @classmethod - def create_tokenizer(cls, nlp=None): - return ChineseTokenizer(cls, nlp) + def create_tokenizer(cls, nlp=None, config={}): + return ChineseTokenizer(cls, nlp, config=config) class Chinese(Language): @@ -103,4 +297,13 @@ class Chinese(Language): return self.tokenizer(text) +def _get_pkuseg_trie_data(node, path=""): + data = [] + for c, child_node in sorted(node.children.items()): + data.extend(_get_pkuseg_trie_data(child_node, path + c)) + if node.isword: + data.append((path, node.usertag)) + return data + + __all__ = ["Chinese"] diff --git a/spacy/language.py b/spacy/language.py index 56619080d..f5eff2ae9 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -969,7 +969,7 @@ class Language(object): serializers = OrderedDict() serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["tokenizer"] = lambda: 
self.tokenizer.to_bytes(exclude=["vocab"]) - serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) + serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items()))) for name, proc in self.pipeline: if name in exclude: continue diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 43c3152a0..0f14f0a27 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -231,10 +231,22 @@ def yo_tokenizer(): @pytest.fixture(scope="session") -def zh_tokenizer(): +def zh_tokenizer_char(): + return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False}) + + +@pytest.fixture(scope="session") +def zh_tokenizer_jieba(): pytest.importorskip("jieba") return get_lang_class("zh").Defaults.create_tokenizer() + +@pytest.fixture(scope="session") +def zh_tokenizer_pkuseg(): + pytest.importorskip("pkuseg") + return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}) + + @pytest.fixture(scope="session") def hy_tokenizer(): - return get_lang_class("hy").Defaults.create_tokenizer() \ No newline at end of file + return get_lang_class("hy").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py new file mode 100644 index 000000000..58133a88e --- /dev/null +++ b/spacy/tests/lang/zh/test_serialize.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.zh import Chinese +from ...util import make_tempdir + + +def zh_tokenizer_serialize(zh_tokenizer): + tokenizer_bytes = zh_tokenizer.to_bytes() + nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}}) + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + with make_tempdir() as d: + file_path = d / "tokenizer" + zh_tokenizer.to_disk(file_path) + nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}}) + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + +def test_zh_tokenizer_serialize_char(zh_tokenizer_char): + zh_tokenizer_serialize(zh_tokenizer_char) + + +def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba): + zh_tokenizer_serialize(zh_tokenizer_jieba) + + +def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg): + zh_tokenizer_serialize(zh_tokenizer_pkuseg) + + +@pytest.mark.slow +def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): + nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}}) + zh_tokenizer_serialize(nlp.tokenizer) diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py index 235f597a5..3a3ccbdde 100644 --- a/spacy/tests/lang/zh/test_text.py +++ b/spacy/tests/lang/zh/test_text.py @@ -19,7 +19,7 @@ import pytest (",", False), ], ) -def test_lex_attrs_like_number(zh_tokenizer, text, match): - tokens = zh_tokenizer(text) +def test_lex_attrs_like_number(zh_tokenizer_jieba, text, match): + tokens = zh_tokenizer_jieba(text) assert len(tokens) == 1 assert tokens[0].like_num == match diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index 36d94beb5..bff7b1ed1 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -5,27 +5,41 @@ import pytest # fmt: off -TOKENIZER_TESTS = [ - ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。", +TEXTS = ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",) 
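
For reference, a minimal usage sketch of the config entry points these changes add (assuming spaCy 2.2.x with this patch applied; the pkuseg lines additionally assume `pip install pkuseg`, and the hypothetical variable names are illustrative only):

```python
from spacy.lang.zh import Chinese

# Plain character segmentation: no external dependencies needed
nlp_char = Chinese(meta={"tokenizer": {"config": {"use_jieba": False,
                                                  "use_pkuseg": False}}})

# pkuseg with its pretrained "default" model
nlp_pkuseg = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "default",
                                                    "use_jieba": False,
                                                    "use_pkuseg": True}}})

# The tokenizer, including the pkuseg model data, round-trips through bytes
data = nlp_pkuseg.tokenizer.to_bytes()
nlp_copy = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
nlp_copy.tokenizer.from_bytes(data)
```
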
+JIEBA_TOKENIZER_TESTS = [ + (TEXTS[0], ['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多', '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做', '为', '母语', '。']), ] +PKUSEG_TOKENIZER_TESTS = [ + (TEXTS[0], + ['作为', '语言', '而言', ',', '为', '世界', '使用', '人数', '最多', + '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做为', + '母语', '。']), +] # fmt: on -@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) -def test_zh_tokenizer(zh_tokenizer, text, expected_tokens): - zh_tokenizer.use_jieba = False - tokens = [token.text for token in zh_tokenizer(text)] +@pytest.mark.parametrize("text", TEXTS) +def test_zh_tokenizer_char(zh_tokenizer_char, text): + tokens = [token.text for token in zh_tokenizer_char(text)] assert tokens == list(text) - zh_tokenizer.use_jieba = True - tokens = [token.text for token in zh_tokenizer(text)] + +@pytest.mark.parametrize("text,expected_tokens", JIEBA_TOKENIZER_TESTS) +def test_zh_tokenizer_jieba(zh_tokenizer_jieba, text, expected_tokens): + tokens = [token.text for token in zh_tokenizer_jieba(text)] assert tokens == expected_tokens -def test_extra_spaces(zh_tokenizer): +@pytest.mark.parametrize("text,expected_tokens", PKUSEG_TOKENIZER_TESTS) +def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens): + tokens = [token.text for token in zh_tokenizer_pkuseg(text)] + assert tokens == expected_tokens + + +def test_extra_spaces(zh_tokenizer_char): # note: three spaces after "I" - tokens = zh_tokenizer("I like cheese.") + tokens = zh_tokenizer_char("I like cheese.") assert tokens[1].orth_ == " " From b919844fce1fd3b02e69ff2f3d6cc786b12f74b0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 20 Apr 2020 20:33:13 +0200 Subject: [PATCH 076/105] Tidy up and fix alignment of landing cards (#5317) --- website/src/components/landing.js | 13 ++++++-- website/src/styles/landing.module.sass | 5 +++ website/src/widgets/landing.js | 44 +++++++++++--------------- 3 files changed, 34 insertions(+), 28 deletions(-) diff --git a/website/src/components/landing.js b/website/src/components/landing.js index 16c342e3f..fb03d2845 100644 --- a/website/src/components/landing.js +++ b/website/src/components/landing.js @@ -46,10 +46,17 @@ export const LandingGrid = ({ cols = 3, blocks = false, children }) => ( export const LandingCol = ({ children }) =>
<div className={classes.col}>{children}</div>
 
-export const LandingCard = ({ title, children }) => (
+export const LandingCard = ({ title, button, url, children }) => (
     <div className={classes.card}>
-        {title && <H3>{title}</H3>}
-        {children}
+        <div className={classes.cardText}>
+            {title && <H3>{title}</H3>}
+            <p>{children}</p>
+        </div>
+        <footer>
+            {button && url && (
+                <LandingButton to={url} small>{button}</LandingButton>
+            )}
+        </footer>
     </div>
) diff --git a/website/src/styles/landing.module.sass b/website/src/styles/landing.module.sass index d7340229b..e36e36c0a 100644 --- a/website/src/styles/landing.module.sass +++ b/website/src/styles/landing.module.sass @@ -49,12 +49,17 @@ margin-bottom: -25rem .card + display: flex + flex-direction: column padding: 3rem 2.5rem background: var(--color-back) border-radius: var(--border-radius) box-shadow: var(--box-shadow) margin-bottom: 3rem +.card-text + flex: 100% + .button width: 100% diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 2dc5d40dc..9aeec0cdc 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -79,34 +79,28 @@ const Landing = ({ data }) => { in Python - -

-                        spaCy is designed to help you do real work — to build real products, or
-                        gather real insights. The library respects your time, and tries to avoid
-                        wasting it. It's easy to install, and its API is simple and productive. We
-                        like to think of spaCy as the Ruby on Rails of Natural Language Processing.
-                    </p>
-                    <LandingButton to="/usage/spacy-101">Get started</LandingButton>
-                </LandingCard>
+                <LandingCard title="Get things done" url="/usage/spacy-101" button="Get started">
+                    spaCy is designed to help you do real work — to build real products, or gather
+                    real insights. The library respects your time, and tries to avoid wasting it.
+                    It's easy to install, and its API is simple and productive. We like to think of
+                    spaCy as the Ruby on Rails of Natural Language Processing.
+                </LandingCard>

-                <LandingCard title="Blazing fast">
-                    <p>
-                        spaCy excels at large-scale information extraction tasks. It's written from
-                        the ground up in carefully memory-managed Cython. Independent research in
-                        2015 found spaCy to be the fastest in the world. If your application needs
-                        to process entire web dumps, spaCy is the library you want to be using.
-                    </p>
-                    <LandingButton to="/usage/facts-figures">Facts & Figures</LandingButton>
-                </LandingCard>
+                <LandingCard title="Blazing fast" url="/usage/facts-figures" button="Facts & Figures">
+                    spaCy excels at large-scale information extraction tasks. It's written from the
+                    ground up in carefully memory-managed Cython. Independent research in 2015 found
+                    spaCy to be the fastest in the world. If your application needs to process
+                    entire web dumps, spaCy is the library you want to be using.
+                </LandingCard>

-                <LandingCard title="Deep learning">
-                    <p>
-                        spaCy is the best way to prepare text for deep learning. It interoperates
-                        seamlessly with TensorFlow, PyTorch, scikit-learn, Gensim and the rest of
-                        Python's awesome AI ecosystem. With spaCy, you can easily construct
-                        linguistically sophisticated statistical models for a variety of NLP
-                        problems.
-                    </p>
-                    <LandingButton to="/usage/training">Read more</LandingButton>
-                </LandingCard>
+                <LandingCard title="Deep learning" url="/usage/training" button="Read more">
+                    spaCy is the best way to prepare text for deep learning. It interoperates
+                    seamlessly with TensorFlow, PyTorch, scikit-learn, Gensim and the rest of
+                    Python's awesome AI ecosystem. With spaCy, you can easily construct
+                    linguistically sophisticated statistical models for a variety of NLP problems.
+                </LandingCard>
From bf5c13d17021540bd30fbbb1c251984b5d8f1fc0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 20 Apr 2020 22:06:53 +0200 Subject: [PATCH 077/105] Modify jieba install message (#5328) Modify jieba install message to instruct the user to use `ChineseDefaults.use_jieba = False` so that it's possible to load pkuseg-only models without jieba installed. --- spacy/lang/zh/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 2cf00d389..701e696a4 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -30,8 +30,10 @@ def try_jieba_import(use_jieba): except ImportError: if use_jieba: msg = ( - "Jieba not installed. Either set Chinese.use_jieba = False, " - "or install it https://github.com/fxsjy/jieba" + "Jieba not installed. Either set the default to False with " + "`from spacy.lang.zh import ChineseDefaults; ChineseDefaults.use_jieba = False`, " + "or install it with `pip install jieba` or from " + "https://github.com/fxsjy/jieba" ) raise ImportError(msg) From 521f3610527998e3ccbd7591f1df95e66ed56350 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 21 Apr 2020 19:31:03 +0200 Subject: [PATCH 078/105] Switch to new gold.align method (#5334) * Switch from original `_align` to new simpler alignment algorithm from #4526 * Remove alignment normalizations beyond whitespace and lowercasing --- setup.py | 1 - spacy/_align.pyx | 255 -------------------------------------- spacy/gold.pyx | 54 +------- spacy/tests/test_align.py | 79 ------------ spacy/tests/test_gold.py | 3 +- 5 files changed, 2 insertions(+), 390 deletions(-) delete mode 100644 spacy/_align.pyx delete mode 100644 spacy/tests/test_align.py diff --git a/setup.py b/setup.py index 1156e7cde..62a09aa73 100755 --- a/setup.py +++ b/setup.py @@ -31,7 +31,6 @@ PACKAGES = find_packages() MOD_NAMES = [ - "spacy._align", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", diff --git a/spacy/_align.pyx b/spacy/_align.pyx deleted file mode 100644 index 6786ec7ba..000000000 --- a/spacy/_align.pyx +++ /dev/null @@ -1,255 +0,0 @@ -# cython: infer_types=True -'''Do Levenshtein alignment, for evaluation of tokenized input. - -Random notes: - - r i n g - 0 1 2 3 4 -r 1 0 1 2 3 -a 2 1 1 2 3 -n 3 2 2 1 2 -g 4 3 3 2 1 - -0,0: (1,1)=min(0+0,1+1,1+1)=0 S -1,0: (2,1)=min(1+1,0+1,2+1)=1 D -2,0: (3,1)=min(2+1,3+1,1+1)=2 D -3,0: (4,1)=min(3+1,4+1,2+1)=3 D -0,1: (1,2)=min(1+1,2+1,0+1)=1 D -1,1: (2,2)=min(0+1,1+1,1+1)=1 S -2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I -3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I -0,2: (1,3)=min(2+1,3+1,1+1)=2 I -1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I -2,2: (3,3) -3,2: (4,3) -At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?" - -We know the costs to transition: - -S[:i] -> T[:j] (at D[i,j]) -S[:i+1] -> T[:j] (at D[i+1,j]) -S[:i] -> T[:j+1] (at D[i,j+1]) - -Further, now we can transform: -S[:i+1] -> S[:i] (DEL) for 1, -T[:j+1] -> T[:j] (INS) for 1. -S[i+1] -> T[j+1] (SUB) for 0 or 1 - -Therefore we have the costs: -SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->S[j]) -i.e. D[i, j] + S[i+1] != T[j+1] -INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j]) -i.e. D[i+1,j] + 1 -DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) -i.e. 
D[i,j+1] + 1 - - Source string S has length m, with index i - Target string T has length n, with index j - - Output two alignment vectors: i2j (length m) and j2i (length n) - # function LevenshteinDistance(char s[1..m], char t[1..n]): - # for all i and j, d[i,j] will hold the Levenshtein distance between - # the first i characters of s and the first j characters of t - # note that d has (m+1)*(n+1) values - # set each element in d to zero - ring rang - - r i n g - - 0 0 0 0 0 - r 0 0 0 0 0 - a 0 0 0 0 0 - n 0 0 0 0 0 - g 0 0 0 0 0 - - # source prefixes can be transformed into empty string by - # dropping all characters - # d[i, 0] := i - ring rang - - r i n g - - 0 0 0 0 0 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - - # target prefixes can be reached from empty source prefix - # by inserting every character - # d[0, j] := j - - r i n g - - 0 1 2 3 4 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - -''' -from __future__ import unicode_literals -from libc.stdint cimport uint32_t -import numpy -cimport numpy as np -from .compat import unicode_ -from murmurhash.mrmr cimport hash32 - - -def align(S, T): - cdef int m = len(S) - cdef int n = len(T) - cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32') - cdef np.ndarray i2j = numpy.zeros((m,), dtype='i') - cdef np.ndarray j2i = numpy.zeros((n,), dtype='i') - - cdef np.ndarray S_arr = _convert_sequence(S) - cdef np.ndarray T_arr = _convert_sequence(T) - - fill_matrix(matrix.data, - S_arr.data, m, T_arr.data, n) - fill_i2j(i2j, matrix) - fill_j2i(j2i, matrix) - for i in range(i2j.shape[0]): - if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]): - i2j[i] = -1 - for j in range(j2i.shape[0]): - if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]): - j2i[j] = -1 - return matrix[-1,-1], i2j, j2i, matrix - - -def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths): - '''Let's say we had: - - Guess: [aa bb cc dd] - Truth: [aa bbcc dd] - i2j: [0, None, -2, 2] - j2i: [0, -2, 3] - - We want: - - i2j_multi: {1: 1, 2: 1} - j2i_multi: {} - ''' - i2j_miss = _get_regions(i2j, i_lengths) - j2i_miss = _get_regions(j2i, j_lengths) - - i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths) - return i2j_multi, j2i_multi - - -def _get_regions(alignment, lengths): - regions = {} - start = None - offset = 0 - for i in range(len(alignment)): - if alignment[i] < 0: - if start is None: - start = offset - regions.setdefault(start, []) - regions[start].append(i) - else: - start = None - offset += lengths[i] - return regions - - -def _get_mapping(miss1, miss2, lengths1, lengths2): - i2j = {} - j2i = {} - for start, region1 in miss1.items(): - if not region1 or start not in miss2: - continue - region2 = miss2[start] - if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2): - j = region2.pop(0) - buff = [] - # Consume tokens from region 1, until we meet the length of the - # first token in region2. If we do, align the tokens. If - # we exceed the length, break. 
- while region1: - buff.append(region1.pop(0)) - if sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - j += 1 - buff = [] - elif sum(lengths1[i] for i in buff) > lengths2[j]: - break - else: - if buff and sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - return i2j, j2i - - -def _convert_sequence(seq): - if isinstance(seq, numpy.ndarray): - return numpy.ascontiguousarray(seq, dtype='uint32_t') - cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32') - cdef bytes item_bytes - for i, item in enumerate(seq): - if item == "``": - item = '"' - elif item == "''": - item = '"' - if isinstance(item, unicode): - item_bytes = item.encode('utf8') - else: - item_bytes = item - output[i] = hash32(item_bytes, len(item_bytes), 0) - return output - - -cdef void fill_matrix(int* D, - const int* S, int m, const int* T, int n) nogil: - m1 = m+1 - n1 = n+1 - for i in range(m1*n1): - D[i] = 0 - - for i in range(m1): - D[i*n1] = i - - for j in range(n1): - D[j] = j - - cdef int sub_cost, ins_cost, del_cost - for j in range(n): - for i in range(m): - i_j = i*n1 + j - i1_j1 = (i+1)*n1 + j+1 - i1_j = (i+1)*n1 + j - i_j1 = i*n1 + j+1 - if S[i] != T[j]: - sub_cost = D[i_j] + 1 - else: - sub_cost = D[i_j] - del_cost = D[i_j1] + 1 - ins_cost = D[i1_j] + 1 - best = min(min(sub_cost, ins_cost), del_cost) - D[i1_j1] = best - - -cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *: - j = D.shape[1]-2 - cdef int i = D.shape[0]-2 - while i >= 0: - while D[i+1, j] < D[i+1, j+1]: - j -= 1 - if D[i, j+1] < D[i+1, j+1]: - i2j[i] = -1 - else: - i2j[i] = j - j -= 1 - i -= 1 - -cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *: - i = D.shape[0]-2 - cdef int j = D.shape[1]-2 - while j >= 0: - while D[i, j+1] < D[i+1, j+1]: - i -= 1 - if D[i+1, j] < D[i+1, j+1]: - j2i[j] = -1 - else: - j2i[j] = i - i -= 1 - j -= 1 diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 07fd3bdd0..a41f06898 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -21,7 +21,6 @@ from .util import minibatch, itershuffle from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek -USE_NEW_ALIGN = False punct_re = re.compile(r"\W") @@ -73,57 +72,8 @@ def merge_sents(sents): return [(m_deps, (m_cats, m_brackets))] -_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")] - - def _normalize_for_alignment(tokens): - tokens = [w.replace(" ", "").lower() for w in tokens] - output = [] - for token in tokens: - token = token.replace(" ", "").lower() - for before, after in _ALIGNMENT_NORM_MAP: - token = token.replace(before, after) - output.append(token) - return output - - -def _align_before_v2_2_2(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations, using the Levenshtein - algorithm. The alignment is case-insensitive. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. 
- * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - from . import _align - if tokens_a == tokens_b: - alignment = numpy.arange(len(tokens_a)) - return 0, alignment, alignment, {}, {} - tokens_a = [w.replace(" ", "").lower() for w in tokens_a] - tokens_b = [w.replace(" ", "").lower() for w in tokens_b] - cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b) - i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a], - [len(w) for w in tokens_b]) - for i, j in list(i2j_multi.items()): - if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j: - i2j[i] = j - i2j_multi.pop(i) - for j, i in list(j2i_multi.items()): - if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i: - j2i[j] = i - j2i_multi.pop(j) - return cost, i2j, j2i, i2j_multi, j2i_multi + return [w.replace(" ", "").lower() for w in tokens] def align(tokens_a, tokens_b): @@ -144,8 +94,6 @@ def align(tokens_a, tokens_b): * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other direction. """ - if not USE_NEW_ALIGN: - return _align_before_v2_2_2(tokens_a, tokens_b) tokens_a = _normalize_for_alignment(tokens_a) tokens_b = _normalize_for_alignment(tokens_b) cost = 0 diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py deleted file mode 100644 index d6bbab04e..000000000 --- a/spacy/tests/test_align.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy._align import align, multi_align - - -@pytest.mark.parametrize( - "string1,string2,cost", - [ - ("hello", "hell", 1), - ("rat", "cat", 1), - ("rat", "rat", 0), - ("rat", "catsie", 4), - ("t", "catsie", 5), - ], -) -def test_align_costs(string1, string2, cost): - output_cost, i2j, j2i, matrix = align(string1, string2) - assert output_cost == cost - - -@pytest.mark.parametrize( - "string1,string2,i2j", - [ - ("hello", "hell", [0, 1, 2, 3, -1]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2]), - ("t", "catsie", [2]), - ], -) -def test_align_i2j(string1, string2, i2j): - output_cost, output_i2j, j2i, matrix = align(string1, string2) - assert list(output_i2j) == i2j - - -@pytest.mark.parametrize( - "string1,string2,j2i", - [ - ("hello", "hell", [0, 1, 2, 3]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2, -1, -1, -1]), - ("t", "catsie", [-1, -1, 0, -1, -1, -1]), - ], -) -def test_align_i2j_2(string1, string2, j2i): - output_cost, output_i2j, output_j2i, matrix = align(string1, string2) - assert list(output_j2i) == j2i - - -def test_align_strings(): - words1 = ["hello", "this", "is", "test!"] - words2 = ["hellothis", "is", "test", "!"] - cost, i2j, j2i, matrix = align(words1, words2) - assert cost == 4 - assert list(i2j) == [-1, -1, 1, -1] - assert list(j2i) == [-1, 2, -1, -1] - - -def test_align_many_to_one(): - words1 = ["a", "b", "c", "d", "e", "f", "g", "h"] - words2 = ["ab", "bc", "e", "fg", "h"] - cost, i2j, j2i, matrix = align(words1, words2) - assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4] - lengths1 = [len(w) for w in words1] - lengths2 = [len(w) for w in words2] - i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2) - assert i2j_multi[0] == 0 - assert i2j_multi[1] == 0 - assert i2j_multi[2] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[3] == 1 - 
assert i2j_multi[5] == 3 - assert i2j_multi[6] == 3 - - assert j2i_multi[0] == 1 - assert j2i_multi[1] == 3 diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index fbdb3155b..b546e079b 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -177,13 +177,12 @@ def test_roundtrip_docs_to_json(): assert cats["BAKING"] == goldparse.cats["BAKING"] -@pytest.mark.skip(reason="skip while we have backwards-compatible alignment") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), ( - ["a", "b", "``", "c"], + ["a", "b", '"', "c"], ['ab"', "c"], (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), ), From 481574cbc865211e35faf6e36f5ece203ee59e60 Mon Sep 17 00:00:00 2001 From: Mike <34043825+Mlawrence95@users.noreply.github.com> Date: Tue, 21 Apr 2020 11:35:12 -0700 Subject: [PATCH 079/105] [minor doc change] embedding vis. link is broken in `website/docs/usage/examples.md` (#5325) * The embedding vis. link is broken The first link seems to be reasonable for now unless someone has an updated embedding vis they want to share? * contributor agreement * Update Mlawrence95.md * Update website/docs/usage/examples.md Co-Authored-By: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- .github/contributors/Mlawrence95.md | 106 ++++++++++++++++++++++++++++ website/docs/usage/examples.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/Mlawrence95.md diff --git a/.github/contributors/Mlawrence95.md b/.github/contributors/Mlawrence95.md new file mode 100644 index 000000000..505d6c16f --- /dev/null +++ b/.github/contributors/Mlawrence95.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ x ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Mike Lawrence | +| Company name (if applicable) | NA | +| Title or role (if applicable) | NA | +| Date | April 17, 2020 | +| GitHub username | Mlawrence95 | +| Website (optional) | | diff --git a/website/docs/usage/examples.md b/website/docs/usage/examples.md index 180b02ff4..96dc7627d 100644 --- a/website/docs/usage/examples.md +++ b/website/docs/usage/examples.md @@ -162,7 +162,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.p This script lets you load any spaCy model containing word vectors into [TensorBoard](https://projector.tensorflow.org/) to create an -[embedding visualization](https://www.tensorflow.org/versions/r1.1/get_started/embedding_viz). +[embedding visualization](https://github.com/tensorflow/tensorboard/blob/master/docs/tensorboard_projector_plugin.ipynb). ```python https://github.com/explosion/spaCy/tree/master/examples/vectors_tensorboard.py From 84e06f9fb767910011ffeff69d5895ac6eeebf23 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 23 Apr 2020 16:58:23 +0200 Subject: [PATCH 080/105] Improve GoldParse NER alignment (#5335) Improve GoldParse NER alignment by including all cases where the start and end of the NER span can be aligned, regardless of internal tokenization differences. To do this, convert BILUO tags to character offsets, check start/end alignment with `doc.char_span()`, and assign the BILUO tags for the aligned spans. Alignment for `O/-` tags is handled through the one-to-one and multi alignments. --- spacy/errors.py | 2 + spacy/gold.pyx | 86 ++++++++++++++++++++++++++++------------ spacy/tests/test_gold.py | 70 ++++++++++++++++++++++++++++++++ spacy/util.py | 2 +- 4 files changed, 133 insertions(+), 27 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b1cdb89ec..e52241be1 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -110,6 +110,8 @@ class Warnings(object): W028 = ("Doc.from_array was called with a vector of type '{type}', " "but is expecting one of type 'uint64' instead. This may result " "in problems with the vocab further on in the pipeline.") + W029 = ("Unable to align tokens with entities from character offsets. " + "Discarding entity annotation for the text: {text}.") @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index a41f06898..8b61de683 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -648,6 +648,9 @@ cdef class GoldParse: # if self.lenght > 0, this is modified latter. self.orig_annot = [] + # temporary doc for aligning entity annotation + entdoc = None + # avoid allocating memory if the doc does not contain any tokens if self.length > 0: if words is None: @@ -670,7 +673,25 @@ cdef class GoldParse: entities = [(ent if ent is not None else "-") for ent in entities] if not isinstance(entities[0], basestring): # Assume we have entities specified by character offset. - entities = biluo_tags_from_offsets(doc, entities) + # Create a temporary Doc corresponding to provided words + # (to preserve gold tokenization) and text (to preserve + # character offsets). + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + entdoc_entities = biluo_tags_from_offsets(entdoc, entities) + # There may be some additional whitespace tokens in the + # temporary doc, so check that the annotations align with + # the provided words while building a list of BILUO labels. 
+ entities = [] + words_offset = 0 + for i in range(len(entdoc_words)): + if words[i + words_offset] == entdoc_words[i]: + entities.append(entdoc_entities[i]) + else: + words_offset -= 1 + if len(entities) != len(words): + user_warning(Warnings.W029.format(text=doc.text)) + entities = ["-" for _ in words] # These are filled by the tagger/parser/entity recogniser self.c.tags = self.mem.alloc(len(doc), sizeof(int)) @@ -697,7 +718,8 @@ cdef class GoldParse: # If we under-segment, we'll have one predicted word that covers a # sequence of gold words. # If we "mis-segment", we'll have a sequence of predicted words covering - # a sequence of gold words. That's many-to-many -- we don't do that. + # a sequence of gold words. That's many-to-many -- we don't do that + # except for NER spans where the start and end can be aligned. cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] @@ -720,7 +742,6 @@ cdef class GoldParse: self.tags[i] = tags[i2j_multi[i]] self.morphology[i] = morphology[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) - is_first = i2j_multi[i] != i2j_multi.get(i-1) # Set next word in multi-token span as head, until last if not is_last: self.heads[i] = i+1 @@ -730,30 +751,10 @@ cdef class GoldParse: if head_i: self.heads[i] = self.gold_to_cand[head_i] self.labels[i] = deps[i2j_multi[i]] - # Now set NER...This is annoying because if we've split - # got an entity word split into two, we need to adjust the - # BILUO tags. We can't have BB or LL etc. - # Case 1: O -- easy. ner_tag = entities[i2j_multi[i]] - if ner_tag == "O": - self.ner[i] = "O" - # Case 2: U. This has to become a B I* L sequence. - elif ner_tag.startswith("U-"): - if is_first: - self.ner[i] = ner_tag.replace("U-", "B-", 1) - elif is_last: - self.ner[i] = ner_tag.replace("U-", "L-", 1) - else: - self.ner[i] = ner_tag.replace("U-", "I-", 1) - # Case 3: L. If not last, change to I. - elif ner_tag.startswith("L-"): - if is_last: - self.ner[i] = ner_tag - else: - self.ner[i] = ner_tag.replace("L-", "I-", 1) - # Case 4: I. 
Stays correct - elif ner_tag.startswith("I-"): - self.ner[i] = ner_tag + # Assign O/- for many-to-one O/- NER tags + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] @@ -764,6 +765,39 @@ cdef class GoldParse: self.heads[i] = self.gold_to_cand[heads[gold_i]] self.labels[i] = deps[gold_i] self.ner[i] = entities[gold_i] + # Assign O/- for one-to-many O/- NER tags + for j, cand_j in enumerate(self.gold_to_cand): + if cand_j is None: + if j in j2i_multi: + i = j2i_multi[j] + ner_tag = entities[j] + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag + + # If there is entity annotation and some tokens remain unaligned, + # align all entities at the character level to account for all + # possible token misalignments within the entity spans + if any([e not in ("O", "-") for e in entities]) and None in self.ner: + # If the temporary entdoc wasn't created above, initialize it + if not entdoc: + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + # Get offsets based on gold words and BILUO entities + entdoc_offsets = offsets_from_biluo_tags(entdoc, entities) + aligned_offsets = [] + aligned_spans = [] + # Filter offsets to identify those that align with doc tokens + for offset in entdoc_offsets: + span = doc.char_span(offset[0], offset[1]) + if span and not span.text.isspace(): + aligned_offsets.append(offset) + aligned_spans.append(span) + # Convert back to BILUO for doc tokens and assign NER for all + # aligned spans + biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None) + for span in aligned_spans: + for i in range(span.start, span.end): + self.ner[i] = biluo_tags[i] # Prevent whitespace that isn't within entities from being tagged as # an entity. 
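
To make the effect of the two alignment patches above concrete, here is a minimal sketch (assuming spaCy 2.2.x with these patches applied); the expected values mirror the tests added below:

```python
from spacy.gold import GoldParse, align
from spacy.tokens import Doc
from spacy.vocab import Vocab

# The simplified align() normalizes only whitespace and case
cost, a2b, b2a, a2b_multi, b2a_multi = align(["a", "b", "c"], ["ab", "c"])
assert (cost, list(a2b), list(b2a)) == (3, [-1, -1, 1], [-1, 2])
assert a2b_multi == {0: 0, 1: 0}  # "a" and "b" both map into "ab"

# NER spans given as character offsets survive tokenization mismatches,
# as long as the span boundaries themselves align
doc = Doc(Vocab(), words=["I", "flew to", "San Francisco Valley", "."],
          spaces=[True, True, False, False])
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gold = GoldParse(doc, words=["I", "flew", "to", "San", "Francisco", "Valley", "."],
                 entities=entities)
assert gold.ner == ["O", "O", "U-LOC", "O"]
```
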
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index b546e079b..fc9e624eb 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -6,6 +6,7 @@ from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo from spacy.gold import GoldCorpus, docs_to_json, align from spacy.lang.en import English from spacy.tokens import Doc +from spacy.util import get_words_and_spaces from .util import make_tempdir import pytest import srsly @@ -59,6 +60,75 @@ def test_gold_biluo_misalign(en_vocab): assert tags == ["O", "O", "O", "-", "-", "-"] +def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): + # one-to-many + words = ["I", "flew to", "San Francisco Valley", "."] + spaces = [True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, + words=["I", "flew", "to", "San", "Francisco", "Valley", "."], + entities=entities, + ) + assert gp.ner == ["O", "O", "U-LOC", "O"] + + # many-to-one + words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + spaces = [True, True, True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities + ) + assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + + # misaligned + words = ["I flew", "to", "San Francisco", "Valley", "."] + spaces = [True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, words=["I", "flew to", "San", "Francisco Valley", "."], entities=entities, + ) + assert gp.ner == ["O", "O", "B-LOC", "L-LOC", "O"] + + # additional whitespace tokens in GoldParse words + words, spaces = get_words_and_spaces( + ["I", "flew", "to", "San Francisco", "Valley", "."], + "I flew to San Francisco Valley.", + ) + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] + gp = GoldParse( + doc, + words=["I", "flew", " ", "to", "San Francisco Valley", "."], + entities=entities, + ) + assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] + + # from issue #4791 + data = ( + "I'll return the ₹54 amount", + { + "words": ["I", "'ll", "return", "the", "₹", "54", "amount",], + "entities": [(16, 19, "MONEY")], + }, + ) + gp = GoldParse(en_tokenizer(data[0]), **data[1]) + assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"] + + data = ( + "I'll return the $54 amount", + { + "words": ["I", "'ll", "return", "the", "$", "54", "amount",], + "entities": [(16, 19, "MONEY")], + }, + ) + gp = GoldParse(en_tokenizer(data[0]), **data[1]) + assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"] + + def test_roundtrip_offsets_biluo_conversion(en_tokenizer): text = "I flew to Silicon Valley via London." 
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] diff --git a/spacy/util.py b/spacy/util.py index 1c627af46..a5e27a210 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -758,7 +758,7 @@ def get_serialization_exclude(serializers, exclude, kwargs): def get_words_and_spaces(words, text): - if "".join("".join(words).split())!= "".join(text.split()): + if "".join("".join(words).split()) != "".join(text.split()): raise ValueError(Errors.E194.format(text=text, words=words)) text_words = [] text_spaces = [] From fc91660aa289f4e8f2e809a8179e13aa55799afd Mon Sep 17 00:00:00 2001 From: sabiqueqb Date: Mon, 27 Apr 2020 13:15:08 +0530 Subject: [PATCH 081/105] Gh 5339 language class for malayalam (#5342) * Initialize Malayalam Language class * Add lex_attrs and examples for Malayalam * Add spaCy Contributor Agreement * Add test for ml tokenizer --- .github/contributors/sabiqueqb.md | 106 ++++++++++++++++++++++++++++++ spacy/lang/ml/__init__.py | 18 +++++ spacy/lang/ml/examples.py | 19 ++++++ spacy/lang/ml/lex_attrs.py | 80 ++++++++++++++++++++++ spacy/lang/ml/stop_words.py | 18 +++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/ml/test_text.py | 16 +++++ 7 files changed, 262 insertions(+) create mode 100644 .github/contributors/sabiqueqb.md create mode 100644 spacy/lang/ml/__init__.py create mode 100644 spacy/lang/ml/examples.py create mode 100644 spacy/lang/ml/lex_attrs.py create mode 100644 spacy/lang/ml/stop_words.py create mode 100644 spacy/tests/lang/ml/test_text.py diff --git a/.github/contributors/sabiqueqb.md b/.github/contributors/sabiqueqb.md new file mode 100644 index 000000000..da0f2f2a2 --- /dev/null +++ b/.github/contributors/sabiqueqb.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Sabique Ahammed Lava | +| Company name (if applicable) | QBurst | +| Title or role (if applicable) | Senior Engineer | +| Date | 24 Apr 2020 | +| GitHub username | sabiqueqb | +| Website (optional) | | diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py new file mode 100644 index 000000000..d052ded1b --- /dev/null +++ b/spacy/lang/ml/__init__.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + +from ...language import Language + + +class MalayalamDefaults(Language.Defaults): + stop_words = STOP_WORDS + + +class Malayalam(Language): + lang = "ml" + Defaults = MalayalamDefaults + + +__all__ = ["Malayalam"] diff --git a/spacy/lang/ml/examples.py b/spacy/lang/ml/examples.py new file mode 100644 index 000000000..a2a0ed10e --- /dev/null +++ b/spacy/lang/ml/examples.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ml.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക", + "പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി", + "എന്താണ്‌ കവാടങ്ങൾ?", + "ചുരുക്കത്തിൽ വിക്കിപീഡിയയുടെ ഉള്ളടക്കത്തിലേക്കുള്ള പടിപ്പുരകളാണ്‌‌ കവാടങ്ങൾ. അവ ലളിതവും വായനക്കാരനെ ആകർഷിക്കുന്നതുമായിരിക്കും", + "പതിനൊന്നുപേർ വീതമുള്ള രണ്ടു ടീമുകൾ കളിക്കുന്ന സംഘകായിക വിനോദമാണു ക്രിക്കറ്റ്", +] diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py new file mode 100644 index 000000000..345da8126 --- /dev/null +++ b/spacy/lang/ml/lex_attrs.py @@ -0,0 +1,80 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + + +# reference 2: https://www.omniglot.com/language/numbers/malayalam.htm + +_num_words = [ + "പൂജ്യം ", + "ഒന്ന് ", + "രണ്ട് ", + "മൂന്ന് ", + "നാല്‌ ", + "അഞ്ച് ", + "ആറ് ", + "ഏഴ് ", + "എട്ട് ", + "ഒന്‍പത് ", + "പത്ത് ", + "പതിനൊന്ന്", + "പന്ത്രണ്ട്", + "പതി മൂന്നു", + "പതിനാല്", + "പതിനഞ്ച്", + "പതിനാറ്", + "പതിനേഴ്", + "പതിനെട്ട്", + "പത്തൊമ്പതു", + "ഇരുപത്", + "ഇരുപത്തിഒന്ന്", + "ഇരുപത്തിരണ്ട്‌", + "ഇരുപത്തിമൂന്ന്", + "ഇരുപത്തിനാല്", + "ഇരുപത്തിഅഞ്ചു", + "ഇരുപത്തിആറ്", + "ഇരുപത്തിഏഴ്", + "ഇരുപത്തിഎട്ടു", + "ഇരുപത്തിഒന്‍പത്", + "മുപ്പത്", + "മുപ്പത്തിഒന്ന്", + "മുപ്പത്തിരണ്ട്", + "മുപ്പത്തിമൂന്ന്", + "മുപ്പത്തിനാല്", + "മുപ്പത്തിഅഞ്ചു", + "മുപ്പത്തിആറ്", + "മുപ്പത്തിഏഴ്", + "മുപ്പത്തിഎട്ട്", + "മുപ്പത്തിഒന്‍പതു", + "നാല്‍പത്‌ ", + "അന്‍പത് ", + "അറുപത് ", + "എഴുപത് ", + "എണ്‍പത് ", + "തൊണ്ണൂറ് ", + "നുറ് ", + "ആയിരം ", + "പത്തുലക്ഷം" +] + + +def like_num(text): + """ + Check if text resembles a number + """ + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py new file mode 100644 index 000000000..4012571bc --- /dev/null +++ b/spacy/lang/ml/stop_words.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set( + + """ +അത് +ഇത് +ആയിരുന്നു +ആകുന്നു +വരെ +അന്നേരം +അന്ന് +ഇന്ന് +ആണ് 
+""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 0f14f0a27..2ba759a29 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -150,6 +150,11 @@ def lt_tokenizer(): return get_lang_class("lt").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ml_tokenizer(): + return get_lang_class("ml").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def nb_tokenizer(): return get_lang_class("nb").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/ml/test_text.py b/spacy/tests/lang/ml/test_text.py new file mode 100644 index 000000000..92eca6b21 --- /dev/null +++ b/spacy/tests/lang/ml/test_text.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_ml_tokenizer_handles_long_text(ml_tokenizer): + text = """അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക""" + tokens = ml_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)]) +def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length): + tokens = ml_tokenizer(text) + assert len(tokens) == length From 90c754024f079e0b7842acb826cc253db17c3cb3 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 27 Apr 2020 10:53:05 +0200 Subject: [PATCH 082/105] Update nlp.vectors to nlp.vocab.vectors (#5357) --- website/docs/api/vectors.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 93e747c1e..a4c36f8cd 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -111,7 +111,7 @@ Check whether a key has been mapped to a vector entry in the table. > > ```python > cat_id = nlp.vocab.strings["cat"] -> nlp.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,))) +> nlp.vocab.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,))) > assert cat_id in vectors > ``` @@ -315,7 +315,7 @@ performed in chunks, to avoid consuming too much memory. 
You can set the > > ```python > queries = numpy.asarray([numpy.random.uniform(-1, 1, (300,))]) -> most_similar = nlp.vectors.most_similar(queries, n=10) +> most_similar = nlp.vocab.vectors.most_similar(queries, n=10) > ``` | Name | Type | Description | From b2b7e1f37a1c9e9312006b39bfd3051ba83e1750 Mon Sep 17 00:00:00 2001 From: Punitvara Date: Mon, 27 Apr 2020 14:37:37 +0530 Subject: [PATCH 083/105] This PR adds Gujarati Language class along with (#5355) * This PR adds Gujarati Language class along with - stop words * Add test for gu tokenizer --- .github/contributors/punitvara.md | 107 ++++++++++++++++++++++++++++++ spacy/lang/gu/__init__.py | 18 +++++ spacy/lang/gu/examples.py | 22 ++++++ spacy/lang/gu/stop_words.py | 91 +++++++++++++++++++++++++ spacy/tests/conftest.py | 4 ++ spacy/tests/lang/gu/test_text.py | 20 ++++++ 6 files changed, 262 insertions(+) create mode 100644 .github/contributors/punitvara.md create mode 100644 spacy/lang/gu/__init__.py create mode 100644 spacy/lang/gu/examples.py create mode 100644 spacy/lang/gu/stop_words.py create mode 100644 spacy/tests/lang/gu/test_text.py diff --git a/.github/contributors/punitvara.md b/.github/contributors/punitvara.md new file mode 100644 index 000000000..dde810453 --- /dev/null +++ b/.github/contributors/punitvara.md @@ -0,0 +1,107 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
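Beyond the contributor paperwork, the substance of this patch is the new language package below: a `Gujarati` subclass of `Language` with stop words, example sentences, and tokenizer tests. A quick sketch of how the new class is used once the patch is applied — the example sentence is taken from the `examples.py` file added below, and no trained models are involved:

```python
# Use the new blank Gujarati class: rule-based tokenization plus stop
# words only — this patch ships no statistical models.
from spacy.lang.gu import Gujarati

nlp = Gujarati()
doc = nlp("અહિયાં શું જોડાય છે?")
print([token.text for token in doc])
print([token.text for token in doc if token.is_stop])
```
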
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Punit Vara | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-04-26 | +| GitHub username | punitvara | +| Website (optional) | https://punitvara.com | + diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py new file mode 100644 index 000000000..1f080c7c2 --- /dev/null +++ b/spacy/lang/gu/__init__.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + +from ...language import Language + + +class GujaratiDefaults(Language.Defaults): + stop_words = STOP_WORDS + + +class Gujarati(Language): + lang = "gu" + Defaults = GujaratiDefaults + + +__all__ = ["Gujarati"] diff --git a/spacy/lang/gu/examples.py b/spacy/lang/gu/examples.py new file mode 100644 index 000000000..202a8d022 --- /dev/null +++ b/spacy/lang/gu/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.gu.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.", + "તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું", + "કર્ણદેવ પહેલો સોલંકી વંશનો રાજા હતો", + "તેજપાળને બે પત્ની હતી", + "ગુજરાતમાં ભારતીય જનતા પક્ષનો ઉદય આ સમયગાળા દરમિયાન થયો", + "આંદોલનકારીઓએ ચીમનભાઇ પટેલના રાજીનામાની માંગણી કરી.", + "અહિયાં શું જોડાય છે?", + "મંદિરનો પૂર્વાભિમુખ ભાગ નાના મંડપ સાથે થોડો લંબચોરસ આકારનો છે.", +] diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py new file mode 100644 index 000000000..f641b5720 --- /dev/null +++ b/spacy/lang/gu/stop_words.py @@ -0,0 +1,91 @@ +# coding: utf8 +from __future__ import unicode_literals + +STOP_WORDS = set( + """ +એમ +આ +એ +રહી +છે +છો +હતા +હતું +હતી +હોય +હતો +શકે +તે +તેના +તેનું +તેને +તેની +તેઓ +તેમને +તેમના +તેમણે +તેમનું +તેમાં +અને +અહીં +થી +થઈ +થાય +જે + ને +કે +ના +ની +નો +ને +નું +શું +માં +પણ +પર +જેવા +જેવું +જાય +જેમ +જેથી +માત્ર +માટે +પરથી +આવ્યું +એવી +આવી +રીતે +સુધી +થાય +થઈ +સાથે +લાગે +હોવા +છતાં +રહેલા +કરી +કરે +કેટલા +કોઈ +કેમ +કર્યો +કર્યુ +કરે +સૌથી +ત્યારબાદ +તથા +દ્વારા +જુઓ +જાઓ +જ્યારે +ત્યારે +શકો +નથી +હવે +અથવા +થતો +દર +એટલો +પરંતુ +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 2ba759a29..e52c5155f 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -103,6 +103,10 @@ def ga_tokenizer(): return get_lang_class("ga").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def gu_tokenizer(): + return get_lang_class("gu").Defaults.create_tokenizer() + @pytest.fixture(scope="session") def he_tokenizer(): return get_lang_class("he").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/gu/test_text.py b/spacy/tests/lang/gu/test_text.py new file mode 100644 index 000000000..9f3ae45a4 --- /dev/null +++ b/spacy/tests/lang/gu/test_text.py @@ -0,0 +1,20 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +def test_gu_tokenizer_handlers_long_text(gu_tokenizer): + text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે""" + tokens = gu_tokenizer(text) + assert len(tokens) == 9 + +@pytest.mark.parametrize( + "text,length", + [ + ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), + ("ખેતરની ખેડ કરવામાં આવે છે.", 5), + ], +) +def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length): + tokens = gu_tokenizer(text) + 
assert len(tokens) == length From 9203d821ae798b67d84e42b319a310b876f3dc93 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 27 Apr 2020 13:01:54 +0200 Subject: [PATCH 084/105] Add 2 ini files in tests/lang (#5359) --- spacy/tests/lang/gu/__init__.py | 0 spacy/tests/lang/ml/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 spacy/tests/lang/gu/__init__.py create mode 100644 spacy/tests/lang/ml/__init__.py diff --git a/spacy/tests/lang/gu/__init__.py b/spacy/tests/lang/gu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ml/__init__.py b/spacy/tests/lang/ml/__init__.py new file mode 100644 index 000000000..e69de29bb From f8ac5b9f563050472aedc719950b4888c65ca4cc Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 27 Apr 2020 16:51:27 +0200 Subject: [PATCH 085/105] bugfix in span similarity (#5155) (#5358) * bugfix in span similarity * also rewrite doc.pyx for clarity * formatting Co-authored-by: Sofie Van Landeghem --- spacy/tests/regression/test_issue5152.py | 18 ++++++++++++++++++ spacy/tokens/doc.pyx | 15 ++++++++------- spacy/tokens/span.pyx | 6 ++++-- 3 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 spacy/tests/regression/test_issue5152.py diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py new file mode 100644 index 000000000..a9a57746d --- /dev/null +++ b/spacy/tests/regression/test_issue5152.py @@ -0,0 +1,18 @@ +from spacy.lang.en import English + + +def test_issue5152(): + # Test that the comparison between a Span and a Token, goes well + # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) + nlp = English() + text = nlp("Talk about being boring!") + text_var = nlp("Talk of being boring!") + y = nlp("Let") + + span = text[0:3] # Talk about being + span_2 = text[0:3] # Talk about being + span_3 = text_var[0:3] # Talk of being + token = y[0] # Let + assert span.similarity(token) == 0.0 + assert span.similarity(span_2) == 1.0 + assert span_2.similarity(span_3) < 1.0 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ec0cd66b8..f27115e6f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -387,13 +387,14 @@ cdef class Doc: if isinstance(other, (Lexeme, Token)) and self.length == 1: if self.c[0].lex.orth == other.orth: return 1.0 - elif isinstance(other, (Span, Doc)): - if len(self) == len(other): - for i in range(self.length): - if self[i].orth != other[i].orth: - break - else: - return 1.0 + elif isinstance(other, (Span, Doc)) and len(self) == len(other): + similar = True + for i in range(self.length): + if self[i].orth != other[i].orth: + similar = False + break + if similar: + return 1.0 if self.vocab.vectors.n_keys == 0: models_warning(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 35c70f236..9269700b0 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -324,11 +324,13 @@ cdef class Span: if len(self) == 1 and hasattr(other, "orth"): if self[0].orth == other.orth: return 1.0 - elif hasattr(other, "__len__") and len(self) == len(other): + elif isinstance(other, (Doc, Span)) and len(self) == len(other): + similar = True for i in range(len(self)): if self[i].orth != getattr(other[i], "orth", None): + similar = False break - else: + if similar: return 1.0 if self.vocab.vectors.n_keys == 0: models_warning(Warnings.W007.format(obj="Span")) From 
792aa7b6ab48ad40254102e5730c420e36822a70 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 27 Apr 2020 18:01:12 +0200 Subject: [PATCH 086/105] Remove references to textcat spans (#5360) Remove references to unimplemented `TextCategorizer` span labels in `GoldParse` and `Doc`. --- website/docs/api/doc.md | 2 +- website/docs/api/goldparse.md | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index ab85c1deb..7decc2278 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -653,7 +653,7 @@ The L2 norm of the document's vector representation. | `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `vocab` | `Vocab` | The store of lexical types. | | `tensor` 2 | `ndarray` | Container for dense vector representations. | -| `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | +| `cats` 2 | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. | | `user_data` | - | A generic storage area, for user custom data. | | `lang` 2.1 | int | Language of the document's vocabulary. | | `lang_` 2.1 | unicode | Language of the document's vocabulary. | diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 1ef6f0362..443913311 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -7,12 +7,10 @@ source: spacy/gold.pyx ## GoldParse.\_\_init\_\_ {#init tag="method"} -Create a `GoldParse`. Unlike annotations in `entities`, label annotations in -`cats` can overlap, i.e. a single word can be covered by multiple labelled -spans. The [`TextCategorizer`](/api/textcategorizer) component expects true -examples of a label to have the value `1.0`, and negative examples of a label to -have the value `0.0`. Labels not in the dictionary are treated as missing – the -gradient for those labels will be zero. +Create a `GoldParse`. The [`TextCategorizer`](/api/textcategorizer) component +expects true examples of a label to have the value `1.0`, and negative examples +of a label to have the value `0.0`. Labels not in the dictionary are treated as +missing – the gradient for those labels will be zero. | Name | Type | Description | | ----------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -22,8 +20,8 @@ gradient for those labels will be zero. | `heads` | iterable | A sequence of integers, representing syntactic head offsets. | | `deps` | iterable | A sequence of strings, representing the syntactic relation types. | | `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | -| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). 
| -| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). | +| `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). | +| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | | **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} @@ -53,7 +51,7 @@ Whether the provided syntactic annotations form a projective dependency tree. | `ner` | list | The named entity annotations as BILUO tags. | | `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | | `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. | -| `cats` 2 | list | Entries in the list should be either a label, or a `(start, end, label)` triple. The tuple form is used for categories applied to spans of the document. | +| `cats` 2 | dict | Keys in the dictionary are string category labels with values `1.0` or `0.0`. | | `links` 2.2 | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. | ## Utilities {#util} From 5b5528ff2edb8aad2c133a1a2473a279a27e8b8a Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 27 Apr 2020 20:02:09 +0000 Subject: [PATCH 087/105] Add `!=3.4.*` to python_requires (#5344) Missed in 80d554f2e2813aea41b0889b39d8f30f648af1ad --- .github/contributors/michael-k.md | 106 ++++++++++++++++++++++++++++++ setup.cfg | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/michael-k.md diff --git a/.github/contributors/michael-k.md b/.github/contributors/michael-k.md new file mode 100644 index 000000000..4ecc5be85 --- /dev/null +++ b/.github/contributors/michael-k.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
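The `setup.cfg` change that follows adds `!=3.4.*` to the existing Python version exclusions. To make the effect concrete, here is an illustrative sketch of how a pip-style resolver evaluates the updated specifier, using the `packaging` library (which pip vendors; its use here is an assumption for demonstration, not part of the patch):

```python
from packaging.specifiers import SpecifierSet

# The python_requires string from the updated setup.cfg.
spec = SpecifierSet(">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*")
print("3.4.10" in spec)  # False: Python 3.4 is now excluded
print("2.7.18" in spec)  # True: Python 2.7 is still supported
print("3.5.0" in spec)   # True
```
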
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Michael Käufl | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-04-23 | +| GitHub username | michael-k | +| Website (optional) | | diff --git a/setup.cfg b/setup.cfg index 465367ff6..722adc0e2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ zip_safe = false include_package_data = true scripts = bin/spacy -python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* +python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.* setup_requires = wheel cython>=0.25 From bc39f97e11a150b77f54b36b0e862aee2555380e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 28 Apr 2020 13:37:37 +0200 Subject: [PATCH 088/105] Simplify warnings --- spacy/__init__.py | 4 +- spacy/_ml.py | 7 ++-- spacy/analysis.py | 6 ++- spacy/cli/init_model.py | 4 +- spacy/displacy/__init__.py | 8 ++-- spacy/errors.py | 69 +-------------------------------- spacy/gold.pyx | 7 ++-- spacy/kb.pyx | 12 +++--- spacy/language.py | 15 +++---- spacy/lexeme.pyx | 5 ++- spacy/matcher/matcher.pyx | 5 ++- spacy/matcher/phrasematcher.pyx | 12 +++--- spacy/pipeline/pipes.pyx | 5 ++- spacy/tests/doc/test_doc_api.py | 3 +- spacy/tests/doc/test_span.py | 3 +- spacy/tokenizer.pyx | 7 ++-- spacy/tokens/doc.pyx | 14 +++---- spacy/tokens/span.pyx | 10 ++--- spacy/tokens/token.pyx | 7 ++-- spacy/util.py | 4 +- 20 files changed, 76 insertions(+), 131 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 4a0d16a49..6aa7b7c16 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -13,7 +13,7 @@ from . import pipeline from .cli.info import info as cli_info from .glossary import explain from .about import __version__ -from .errors import Errors, Warnings, deprecation_warning +from .errors import Errors, Warnings from . import util from .util import registry from .language import component @@ -26,7 +26,7 @@ if sys.maxunicode == 65535: def load(name, **overrides): depr_path = overrides.get("path") if depr_path not in (True, False, None): - deprecation_warning(Warnings.W001.format(path=depr_path)) + warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning) return util.load_model(name, **overrides) diff --git a/spacy/_ml.py b/spacy/_ml.py index 2a758accc..5cccabac1 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import numpy +import warnings from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu from thinc.t2t import ExtractWindow, ParametricAttention from thinc.t2v import Pooling, sum_pool, mean_pool @@ -22,7 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed import thinc.extra.load_nlp from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE -from .errors import Errors, user_warning, Warnings +from .errors import Errors, Warnings from . import util from . import ml as new_ml from .ml import _legacy_tok2vec @@ -283,7 +284,7 @@ def link_vectors_to_models(vocab): if vectors.name is None: vectors.name = VECTORS_KEY if vectors.data.size != 0: - user_warning(Warnings.W020.format(shape=vectors.data.shape)) + warnings.warn(Warnings.W020.format(shape=vectors.data.shape)) ops = Model.ops for word in vocab: if word.orth in vectors.key2row: @@ -299,7 +300,7 @@ def link_vectors_to_models(vocab): # This is a hack to avoid the problem in #3853. 
old_name = vectors.name new_name = vectors.name + "_%d" % data.shape[0] - user_warning(Warnings.W019.format(old=old_name, new=new_name)) + warnings.warn(Warnings.W019.format(old=old_name, new=new_name)) vectors.name = new_name key = (ops.device, vectors.name) thinc.extra.load_nlp.VECTORS[key] = data diff --git a/spacy/analysis.py b/spacy/analysis.py index 761be3de9..960ce6c0f 100644 --- a/spacy/analysis.py +++ b/spacy/analysis.py @@ -1,11 +1,13 @@ # coding: utf8 from __future__ import unicode_literals +import warnings + from collections import OrderedDict from wasabi import Printer from .tokens import Doc, Token, Span -from .errors import Errors, Warnings, user_warning +from .errors import Errors, Warnings def analyze_pipes(pipeline, name, pipe, index, warn=True): @@ -34,7 +36,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True): if not fulfilled: problems.append(annot) if warn: - user_warning(Warnings.W025.format(name=name, attr=annot)) + warnings.warn(Warnings.W025.format(name=name, attr=annot)) return problems diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 0bdd4000e..32d2d974e 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -15,7 +15,7 @@ import srsly from wasabi import msg from ..vectors import Vectors -from ..errors import Errors, Warnings, user_warning +from ..errors import Errors, Warnings from ..util import ensure_path, get_lang_class, OOV_RANK try: @@ -246,7 +246,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): def read_clusters(clusters_loc): clusters = {} if ftfy is None: - user_warning(Warnings.W004) + warnings.warn(Warnings.W004) with clusters_loc.open() as f: for line in tqdm(f): try: diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 922d80e57..8a6ec2f53 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -10,7 +10,7 @@ from __future__ import unicode_literals from .render import DependencyRenderer, EntityRenderer from ..tokens import Doc, Span from ..compat import b_to_str -from ..errors import Errors, Warnings, user_warning +from ..errors import Errors, Warnings from ..util import is_in_jupyter @@ -89,7 +89,7 @@ def serve( from wsgiref import simple_server if is_in_jupyter(): - user_warning(Warnings.W011) + warnings.warn(Warnings.W011) render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server(host, port, app) @@ -119,7 +119,7 @@ def parse_deps(orig_doc, options={}): """ doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"])) if not doc.is_parsed: - user_warning(Warnings.W005) + warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): with doc.retokenize() as retokenizer: for np in list(doc.noun_chunks): @@ -184,7 +184,7 @@ def parse_ents(doc, options={}): for ent in doc.ents ] if not ents: - user_warning(Warnings.W006) + warnings.warn(Warnings.W006) title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None settings = get_doc_settings(doc) return {"text": doc.text, "ents": ents, "title": title, "settings": settings} diff --git a/spacy/errors.py b/spacy/errors.py index e52241be1..664c0a2fc 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,11 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import os -import warnings -import inspect - - def add_codes(err_cls): """Add error codes to string messages via class attribute names.""" @@ -93,8 +88,7 @@ class Warnings(object): W022 = ("Training a new part-of-speech 
tagger using a model with no " "lemmatization rules or data. This means that the trained model " "may not be able to lemmatize correctly. If this is intentional " - "or the language you're using doesn't have lemmatization data, " - "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. " + "or the language you're using doesn't have lemmatization data. " "If this is surprising, make sure you have the spacy-lookups-data " "package installed.") W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " @@ -593,64 +587,3 @@ class MatchPatternError(ValueError): class AlignmentError(ValueError): pass - - -class ModelsWarning(UserWarning): - pass - - -WARNINGS = { - "user": UserWarning, - "deprecation": DeprecationWarning, - "models": ModelsWarning, -} - - -def _get_warn_types(arg): - if arg == "": # don't show any warnings - return [] - if not arg or arg == "all": # show all available warnings - return WARNINGS.keys() - return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS] - - -def _get_warn_excl(arg): - if not arg: - return [] - return [w_id.strip() for w_id in arg.split(",")] - - -SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER") -SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES")) -SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE")) - - -def user_warning(message): - _warn(message, "user") - - -def deprecation_warning(message): - _warn(message, "deprecation") - - -def models_warning(message): - _warn(message, "models") - - -def _warn(message, warn_type="user"): - """ - message (unicode): The message to display. - category (Warning): The Warning to show. - """ - if message.startswith("["): - w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string - else: - w_id = None - ignore_warning = w_id and w_id in SPACY_WARNING_IGNORE - if warn_type in SPACY_WARNING_TYPES and not ignore_warning: - category = WARNINGS[warn_type] - stack = inspect.stack()[-1] - with warnings.catch_warnings(): - if SPACY_WARNING_FILTER: - warnings.simplefilter(SPACY_WARNING_FILTER, category) - warnings.warn_explicit(message, category, stack[1], stack[2]) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 8b61de683..e8274563f 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -10,10 +10,11 @@ import shutil import itertools from pathlib import Path import srsly +import warnings from .syntax import nonproj from .tokens import Doc, Span -from .errors import Errors, AlignmentError, user_warning, Warnings +from .errors import Errors, AlignmentError, Warnings from .compat import path2str from . 
import util from .util import minibatch, itershuffle @@ -508,7 +509,7 @@ def _json_iterate(loc): py_raw = file_.read() cdef long file_length = len(py_raw) if file_length > 2 ** 30: - user_warning(Warnings.W027.format(size=file_length)) + warnings.warn(Warnings.W027.format(size=file_length)) raw = py_raw cdef int square_depth = 0 @@ -690,7 +691,7 @@ cdef class GoldParse: else: words_offset -= 1 if len(entities) != len(words): - user_warning(Warnings.W029.format(text=doc.text)) + warnings.warn(Warnings.W029.format(text=doc.text)) entities = ["-" for _ in words] # These are filled by the tagger/parser/entity recogniser diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 63eb41b42..36a6dbd93 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,7 +1,9 @@ # cython: infer_types=True # cython: profile=True # coding: utf8 -from spacy.errors import Errors, Warnings, user_warning +import warnings + +from spacy.errors import Errors, Warnings from pathlib import Path from cymem.cymem cimport Pool @@ -115,7 +117,7 @@ cdef class KnowledgeBase: # Return if this entity was added before if entity_hash in self._entry_index: - user_warning(Warnings.W018.format(entity=entity)) + warnings.warn(Warnings.W018.format(entity=entity)) return # Raise an error if the provided entity vector is not of the correct length @@ -147,7 +149,7 @@ cdef class KnowledgeBase: # only process this entity if its unique ID hadn't been added before entity_hash = self.vocab.strings.add(entity_list[i]) if entity_hash in self._entry_index: - user_warning(Warnings.W018.format(entity=entity_list[i])) + warnings.warn(Warnings.W018.format(entity=entity_list[i])) else: entity_vector = vector_list[i] @@ -195,7 +197,7 @@ cdef class KnowledgeBase: # Check whether this alias was added before if alias_hash in self._alias_index: - user_warning(Warnings.W017.format(alias=alias)) + warnings.warn(Warnings.W017.format(alias=alias)) return cdef vector[int64_t] entry_indices @@ -252,7 +254,7 @@ cdef class KnowledgeBase: if is_present: if not ignore_warnings: - user_warning(Warnings.W024.format(entity=entity, alias=alias)) + warnings.warn(Warnings.W024.format(entity=entity, alias=alias)) else: entry_indices.push_back(int(entry_index)) alias_entry.entry_indices = entry_indices diff --git a/spacy/language.py b/spacy/language.py index f5eff2ae9..e89f80f08 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -3,6 +3,7 @@ from __future__ import absolute_import, unicode_literals import random import itertools +import warnings from thinc.extra import load_nlp @@ -34,7 +35,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop -from .errors import Errors, Warnings, deprecation_warning, user_warning +from .errors import Errors, Warnings from . import util from . 
import about @@ -758,10 +759,10 @@ class Language(object): DOCS: https://spacy.io/api/language#pipe """ if is_python2 and n_process != 1: - user_warning(Warnings.W023) + warnings.warn(Warnings.W023) n_process = 1 if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) if n_process == -1: n_process = mp.cpu_count() if as_tuples: @@ -896,7 +897,7 @@ class Language(object): DOCS: https://spacy.io/api/language#to_disk """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable path = util.ensure_path(path) serializers = OrderedDict() @@ -929,7 +930,7 @@ class Language(object): DOCS: https://spacy.io/api/language#from_disk """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable path = util.ensure_path(path) deserializers = OrderedDict() @@ -964,7 +965,7 @@ class Language(object): DOCS: https://spacy.io/api/language#to_bytes """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable serializers = OrderedDict() serializers["vocab"] = lambda: self.vocab.to_bytes() @@ -989,7 +990,7 @@ class Language(object): DOCS: https://spacy.io/api/language#from_bytes """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable deserializers = OrderedDict() deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 21644e37b..a081ffe42 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -9,6 +9,7 @@ cimport numpy as np np.import_array() import numpy +import warnings from thinc.neural.util import get_array_module from libc.stdint cimport UINT64_MAX @@ -19,7 +20,7 @@ from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT from .attrs cimport IS_CURRENCY, IS_OOV, PROB from .attrs import intify_attrs -from .errors import Errors, Warnings, user_warning +from .errors import Errors, Warnings OOV_RANK = UINT64_MAX @@ -130,7 +131,7 @@ cdef class Lexeme: if self.c.orth == other[0].orth: return 1.0 if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj="Lexeme")) + warnings.warn(Warnings.W008.format(obj="Lexeme")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 9e0fe2812..7f3c3488f 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -9,6 +9,7 @@ from murmurhash.mrmr cimport hash64 import re import srsly +import warnings from ..typedefs cimport attr_t from ..structs cimport TokenC @@ -20,7 +21,7 @@ from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA from ._schemas import TOKEN_PATTERN_SCHEMA from ..util import get_json_validator, validate_json -from ..errors import Errors, MatchPatternError, Warnings, deprecation_warning +from ..errors import Errors, MatchPatternError, Warnings from ..strings import get_string_id from ..attrs import IDS @@ -195,7 +196,7 @@ cdef class Matcher: YIELDS (Doc): Documents, in order. 
""" if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) if as_tuples: for doc, context in docs: diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 4de5782f9..b66ec35b8 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -6,13 +6,15 @@ from libc.stdint cimport uintptr_t from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter +import warnings + from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA from ..structs cimport TokenC from ..tokens.token cimport Token from ..typedefs cimport attr_t from ._schemas import TOKEN_PATTERN_SCHEMA -from ..errors import Errors, Warnings, deprecation_warning, user_warning +from ..errors import Errors, Warnings cdef class PhraseMatcher: @@ -39,7 +41,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#init """ if max_length != 0: - deprecation_warning(Warnings.W010) + warnings.warn(Warnings.W010, DeprecationWarning) self.vocab = vocab self._callbacks = {} self._docs = {} @@ -195,7 +197,7 @@ cdef class PhraseMatcher: if self._validate and (doc.is_tagged or doc.is_parsed) \ and self.attr not in (DEP, POS, TAG, LEMMA): string_attr = self.vocab.strings[self.attr] - user_warning(Warnings.W012.format(key=key, attr=string_attr)) + warnings.warn(Warnings.W012.format(key=key, attr=string_attr)) keyword = self._convert_to_array(doc) else: keyword = doc @@ -204,7 +206,7 @@ cdef class PhraseMatcher: current_node = self.c_map for token in keyword: if token == self._terminal_hash: - user_warning(Warnings.W021) + warnings.warn(Warnings.W021) break result = map_get(current_node, token) if not result: @@ -306,7 +308,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#pipe """ if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) if as_tuples: for doc, context in stream: matches = self(doc) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f2a86d56e..982c058b4 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -6,6 +6,7 @@ from __future__ import unicode_literals import numpy import srsly import random +import warnings from collections import OrderedDict from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax @@ -32,7 +33,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier from .._ml import build_bow_text_classifier, build_nel_encoder from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss -from ..errors import Errors, TempErrors, user_warning, Warnings +from ..errors import Errors, TempErrors, Warnings from .. 
import util @@ -514,7 +515,7 @@ class Tagger(Pipe): **kwargs): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): - user_warning(Warnings.W022) + warnings.warn(Warnings.W022) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for raw_text, annots_brackets in get_gold_tuples(): diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 19d908529..6801d7844 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -6,7 +6,6 @@ import pytest import numpy from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from spacy.errors import ModelsWarning from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP from ..util import get_doc @@ -216,7 +215,7 @@ def test_doc_api_similarity_match(): assert doc.similarity(doc[0]) == 1.0 assert doc.similarity(doc.vocab["a"]) == 1.0 doc2 = Doc(doc.vocab, words=["a", "b", "c"]) - with pytest.warns(ModelsWarning): + with pytest.warns(UserWarning): assert doc.similarity(doc2[:1]) == 1.0 assert doc.similarity(doc2) == 0.0 diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 917f22e9c..e76ca4697 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -5,7 +5,6 @@ import pytest from spacy.attrs import ORTH, LENGTH from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from spacy.errors import ModelsWarning from spacy.util import filter_spans from ..util import get_doc @@ -124,7 +123,7 @@ def test_span_similarity_match(): doc = Doc(Vocab(), words=["a", "b", "a", "b"]) span1 = doc[:2] span2 = doc[2:] - with pytest.warns(ModelsWarning): + with pytest.warns(UserWarning): assert span1.similarity(span2) == 1.0 assert span1.similarity(doc) == 0.0 assert span1[:1].similarity(doc.vocab["a"]) == 1.0 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 62b8bbf4a..69d6285e1 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -11,6 +11,7 @@ cimport cython from collections import OrderedDict import re +import warnings from .tokens.doc cimport Doc from .strings cimport hash_string @@ -18,7 +19,7 @@ from .compat import unescape_unicode, basestring_ from .attrs import intify_attrs from .symbols import ORTH -from .errors import Errors, Warnings, deprecation_warning +from .errors import Errors, Warnings from . 
import util @@ -115,7 +116,7 @@ cdef class Tokenizer: return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): - deprecation_warning(Warnings.W002) + warnings.warn(Warnings.W002, DeprecationWarning) return Doc(self.vocab, words=strings) @cython.boundscheck(False) @@ -181,7 +182,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#pipe """ if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) for text in texts: yield self(text) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f27115e6f..867c2bf6b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -16,6 +16,7 @@ import numpy.linalg import struct import srsly from thinc.neural.util import get_array_module, copy_array +import warnings from .span cimport Span from .token cimport Token @@ -29,7 +30,6 @@ from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..attrs import intify_attrs, IDS from ..util import normalize_slice from ..compat import is_config, copy_reg, pickle, basestring_ -from ..errors import deprecation_warning, models_warning, user_warning from ..errors import Errors, Warnings from .. import util from .underscore import Underscore, get_ext_args @@ -396,9 +396,9 @@ cdef class Doc: if similar: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj="Doc")) + warnings.warn(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj="Doc")) + warnings.warn(Warnings.W008.format(obj="Doc")) return 0.0 vector = self.vector xp = get_array_module(vector) @@ -787,7 +787,7 @@ cdef class Doc: attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] if array.dtype != numpy.uint64: - user_warning(Warnings.W028.format(type=array.dtype)) + warnings.warn(Warnings.W028.format(type=array.dtype)) if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) @@ -1040,10 +1040,10 @@ cdef class Doc: indices did not fall at token boundaries. """ cdef unicode tag, lemma, ent_type - deprecation_warning(Warnings.W013.format(obj="Doc")) + warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning) # TODO: ENT_KB_ID ? if len(args) == 3: - deprecation_warning(Warnings.W003) + warnings.warn(Warnings.W003, DeprecationWarning) tag, lemma, ent_type = args attributes[TAG] = tag attributes[LEMMA] = lemma @@ -1183,7 +1183,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: while not heads_within_sents: heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) if loop_count > 10: - user_warning(Warnings.W026) + warnings.warn(Warnings.W026) break loop_count += 1 # Set sentence starts diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 9269700b0..347916a0a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -6,6 +6,7 @@ from libc.math cimport sqrt import numpy import numpy.linalg +import warnings from thinc.neural.util import get_array_module from collections import defaultdict @@ -21,8 +22,7 @@ from ..symbols cimport dep from ..util import normalize_slice from ..compat import is_config, basestring_ -from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning -from ..errors import deprecation_warning +from ..errors import Errors, TempErrors, Warnings from .underscore import Underscore, get_ext_args @@ -292,7 +292,7 @@ cdef class Span: attributes are inherited from the syntactic root token of the span. 
RETURNS (Token): The newly merged token. """ - deprecation_warning(Warnings.W013.format(obj="Span")) + warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning) return self.doc.merge(self.start_char, self.end_char, *args, **attributes) @@ -333,9 +333,9 @@ cdef class Span: if similar: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj="Span")) + warnings.warn(Warnings.W007.format(obj="Span")) if self.vector_norm == 0.0 or other.vector_norm == 0.0: - user_warning(Warnings.W008.format(obj="Span")) + warnings.warn(Warnings.W008.format(obj="Span")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8019e3b4f..efd9aa10b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -10,6 +10,7 @@ cimport numpy as np np.import_array() import numpy +import warnings from thinc.neural.util import get_array_module from ..typedefs cimport hash_t @@ -24,7 +25,7 @@ from ..symbols cimport conj from .. import parts_of_speech from .. import util from ..compat import is_config -from ..errors import Errors, Warnings, user_warning, models_warning +from ..errors import Errors, Warnings from .underscore import Underscore, get_ext_args from .morphanalysis cimport MorphAnalysis @@ -211,9 +212,9 @@ cdef class Token: if self.c.lex.orth == other.orth: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj="Token")) + warnings.warn(Warnings.W007.format(obj="Token")) if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj="Token")) + warnings.warn(Warnings.W008.format(obj="Token")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/util.py b/spacy/util.py index a5e27a210..7f2e0058f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -30,7 +30,7 @@ except ImportError: from .symbols import ORTH from .compat import cupy, CudaStream, path2str, basestring_, unicode_ from .compat import import_file -from .errors import Errors, Warnings, deprecation_warning +from .errors import Errors, Warnings _data_path = Path(__file__).parent / "data" @@ -749,7 +749,7 @@ def get_serialization_exclude(serializers, exclude, kwargs): options = [name.split(".")[0] for name in serializers] for key, value in kwargs.items(): if key in ("vocab",) and value is False: - deprecation_warning(Warnings.W015.format(arg=key)) + warnings.warn(Warnings.W015.format(arg=key), DeprecationWarning) exclude.append(key) elif key.split(".")[0] in options: raise ValueError(Errors.E128.format(arg=key)) From 3a045572ed1608daa90dc92229c2da0524fa7f20 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 28 Apr 2020 13:48:37 +0200 Subject: [PATCH 089/105] Add missing import --- spacy/displacy/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 8a6ec2f53..a0cccbbde 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -7,6 +7,8 @@ USAGE: https://spacy.io/usage/visualizers """ from __future__ import unicode_literals +import warnings + from .render import DependencyRenderer, EntityRenderer from ..tokens import Doc, Span from ..compat import b_to_str From ac40a8f7a53a29865707a4732e35c8675f1b1abb Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 28 Apr 2020 14:00:11 +0200 Subject: [PATCH 090/105] Add missing import --- spacy/cli/init_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 
32d2d974e..2e0aeb239 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -12,6 +12,7 @@ import tarfile import gzip import zipfile import srsly +import warnings from wasabi import msg from ..vectors import Vectors From d5f18f83077487011f794444bbdf873b3bca7271 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 28 Apr 2020 14:01:29 +0200 Subject: [PATCH 091/105] Add missing import --- spacy/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/util.py b/spacy/util.py index 7f2e0058f..609c0b572 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -16,6 +16,7 @@ import numpy import srsly import catalogue import sys +import warnings try: import jsonschema From a27c4014f557814854bd0324e0355603de29b8b3 Mon Sep 17 00:00:00 2001 From: Louis Guitton Date: Wed, 29 Apr 2020 10:18:03 +0200 Subject: [PATCH 092/105] Add mlflow to spaCy universe (#5352) * Add mlflow to universe * Use mlflow black logo --- .github/contributors/louisguitton.md | 106 +++++++++++++++++++++++++++ website/meta/universe.json | 35 +++++++++ 2 files changed, 141 insertions(+) create mode 100644 .github/contributors/louisguitton.md diff --git a/.github/contributors/louisguitton.md b/.github/contributors/louisguitton.md new file mode 100644 index 000000000..8c5f30df6 --- /dev/null +++ b/.github/contributors/louisguitton.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
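Looking back at the warnings refactor earlier in this series (commit bc39f97e): with the custom `user_warning`/`deprecation_warning` helpers and the `SPACY_WARNING_IGNORE` environment variable removed, warnings are now plain `warnings.warn` calls, so suppressing them goes through the standard library. A sketch — the `W008` code is used purely as an example of a message prefix to match:

```python
# spaCy warning messages keep their "[Wxxx]" code prefix, so the standard
# warnings filter can match on it (replaces SPACY_WARNING_IGNORE=W008).
import warnings

warnings.filterwarnings("ignore", message=r"\[W008\]")
# Deprecations are now real DeprecationWarning instances as well:
warnings.simplefilter("error", DeprecationWarning)  # surface them in tests
```
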
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Louis Guitton | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-04-25 | +| GitHub username | louisguitton | +| Website (optional) | https://guitton.co/ | diff --git a/website/meta/universe.json b/website/meta/universe.json index 8da96a026..bd3191492 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2117,6 +2117,41 @@ "github": "thomasthiebaud" }, "category": ["pipeline"] + }, + { + "id": "mlflow", + "title": "MLflow", + "slogan": "An open source platform for the machine learning lifecycle", + "description": "MLflow is an open source platform to manage the ML lifecycle, including experimentation, reproducibility, deployment, and a central model registry. MLflow currently offers four components: Tracking, Projects, Models and Registry.", + "github": "mlflow/mlflow", + "pip": "mlflow", + "thumb": "https://www.mlflow.org/docs/latest/_static/MLflow-logo-final-black.png", + "image": "", + "url": "https://mlflow.org/", + "author": "Databricks", + "author_links": { + "github": "databricks", + "twitter": "databricks", + "website": "https://databricks.com/" + }, + "category": ["standalone", "apis"], + "code_example": [ + "import mlflow", + "import mlflow.spacy", + "", + "# MLflow Tracking", + "nlp = spacy.load('my_best_model_path/output/model-best')", + "with mlflow.start_run(run_name='Spacy'):", + " mlflow.set_tag('model_flavor', 'spacy')", + " mlflow.spacy.log_model(spacy_model=nlp, artifact_path='model')", + " mlflow.log_metric(('accuracy', 0.72))", + " my_run_id = mlflow.active_run().info.run_id", + "", + "", + "# MLflow Models", + "model_uri = f'runs:/{my_run_id}/model'", + "nlp2 = mlflow.spacy.load_model(model_uri=model_uri)" + ] } ], From 90ce34db42bedac8fa6b3d614d8bc568b883d6da Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 12:51:12 +0200 Subject: [PATCH 093/105] Add cuda101 and cuda102 options to setup (#5377) * Add cuda101 and cuda102 options to setup * Update cudaNNN options in docs --- setup.cfg | 4 ++++ website/docs/usage/index.md | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/setup.cfg b/setup.cfg index 722adc0e2..3e0acd12f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,6 +72,10 @@ cuda92 = cupy-cuda92>=5.0.0b4,<9.0.0 cuda100 = cupy-cuda100>=5.0.0b4,<9.0.0 +cuda101 = + cupy-cuda101>=5.0.0b4,<9.0.0 +cuda102 = + cupy-cuda102>=5.0.0b4,<9.0.0 # Language tokenizers with external dependencies ja = fugashi>=0.1.3 diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 17fd8fa7b..d0172104b 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -122,10 +122,10 @@ support, we've been grateful to use the work of Chainer's interface for GPU arrays. spaCy can be installed on GPU by specifying `spacy[cuda]`, `spacy[cuda90]`, -`spacy[cuda91]`, `spacy[cuda92]` or `spacy[cuda100]`. If you know your cuda -version, using the more explicit specifier allows cupy to be installed via -wheel, saving some compilation time. The specifiers should install -[`cupy`](https://cupy.chainer.org). +`spacy[cuda91]`, `spacy[cuda92]`, `spacy[cuda100]`, `spacy[cuda101]` or +`spacy[cuda102]`. If you know your cuda version, using the more explicit +specifier allows cupy to be installed via wheel, saving some compilation time. +The specifiers should install [`cupy`](https://cupy.chainer.org). 
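The docs hunk above now lists the full set of CUDA-specific extras. After installing one of them, it is worth confirming that spaCy can actually see the GPU before loading any models; `spacy.prefer_gpu()` is the standard check in v2:

```python
# Verify GPU availability after installing a CUDA extra such as
# spacy[cuda92]. Call this before loading any models.
import spacy

is_using_gpu = spacy.prefer_gpu()  # True only if cupy and a device work
print("GPU active:", is_using_gpu)
```
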
```bash $ pip install -U spacy[cuda92] From 732629b0dd8ab4db2b5446aa246ebe65f30ae2c2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Apr 2020 12:51:37 +0200 Subject: [PATCH 094/105] Update website/meta/universe.json --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 6c9fc0340..139f1e8e8 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1623,7 +1623,7 @@ "id": "pic2phrase_bot", "title": "pic2phrase_bot: Photo Description Generator", "slogan": "A bot that generates descriptions to submitted photos, in a human-like manner.", - "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy." + "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy.", "thumb": "https://i.imgur.com/ggVI02O.jpg", "image": "https://i.imgur.com/z1yhWQR.jpg", "url": "https://telegram.me/pic2phrase_bot", From 1cbb272a6b468f1704f00f00d126104eb4ddec12 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Apr 2020 12:51:44 +0200 Subject: [PATCH 095/105] Update website/meta/universe.json --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 139f1e8e8..8c8274700 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1631,7 +1631,7 @@ "author_links": { "twitter": "VasilievYuli", }, - "category": ["standalone", "research"] + "category": ["standalone", "conversational"] }, { "id": "gracyql", From a6e521cd7919ed16b6bcc089aadbac8b5d160fd1 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 12:53:16 +0200 Subject: [PATCH 096/105] Add is_sent_end token property (#5375) Reconstruction of the original PR #4697 by @MiniLau. Removes unused `SENT_END` symbol and `IS_SENT_END` from `Matcher` schema because the Matcher is only going to be able to support `IS_SENT_START`. --- .github/contributors/MiniLau.md | 106 +++++++++++++++++++++++ spacy/attrs.pxd | 1 + spacy/attrs.pyx | 1 + spacy/errors.py | 2 + spacy/structs.pxd | 2 +- spacy/symbols.pxd | 2 +- spacy/tests/doc/test_token_api.py | 14 +++ spacy/tests/pipeline/test_sentencizer.py | 17 +++- spacy/tokens/token.pyx | 22 +++++ website/docs/api/token.md | 17 +++- 10 files changed, 177 insertions(+), 7 deletions(-) create mode 100644 .github/contributors/MiniLau.md diff --git a/.github/contributors/MiniLau.md b/.github/contributors/MiniLau.md new file mode 100644 index 000000000..14d6fe328 --- /dev/null +++ b/.github/contributors/MiniLau.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. 
You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Desausoi Laurent     |
+| Company name (if applicable)   | /                    |
+| Title or role (if applicable)  | /                    |
+| Date                           | 22 November 2019     |
+| GitHub username                | MiniLau              |
+| Website (optional)             | /                    |
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index 4638fcb82..8f583b3a3 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -94,3 +94,4 @@ cdef enum attr_id_t:
     ENT_ID = symbols.ENT_ID
 
     IDX
+    SENT_END
\ No newline at end of file
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index f14cd6ddc..2187f3c65 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -88,6 +88,7 @@ IDS = {
     "ENT_KB_ID": ENT_KB_ID,
     "HEAD": HEAD,
     "SENT_START": SENT_START,
+    "SENT_END": SENT_END,
     "SPACY": SPACY,
     "PROB": PROB,
     "LANG": LANG,
diff --git a/spacy/errors.py b/spacy/errors.py
index e52241be1..6191570ee 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -559,6 +559,8 @@ class Errors(object):
             "({curr_dim}).")
     E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.")
     E195 = ("Matcher can be called on {good} only, got {got}.")
+    E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
+            "only be fixed with token.is_sent_start.")
 
 
 @add_codes
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index b3878db3f..b8e63a725 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -84,7 +84,7 @@ cdef struct TokenC:
 cdef struct MorphAnalysisC:
     univ_pos_t pos
     int length
-    
+
     attr_t abbr
     attr_t adp_type
     attr_t adv_type
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index b24891fdd..9229c9970 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -464,4 +464,4 @@ cdef enum symbol_t:
     ENT_KB_ID
     ENT_ID
 
-    IDX
\ No newline at end of file
+    IDX
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 8c749b26d..1c2253dfa 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -181,6 +181,14 @@ def test_is_sent_start(en_tokenizer):
     doc.is_parsed = True
     assert len(list(doc.sents)) == 2
 
+
+def test_is_sent_end(en_tokenizer):
+    doc = en_tokenizer("This is a sentence. 
This is another.") + assert doc[4].is_sent_end is None + doc[5].is_sent_start = True + assert doc[4].is_sent_end is True + doc.is_parsed = True + assert len(list(doc.sents)) == 2 + def test_set_pos(): doc = Doc(Vocab(), words=["hello", "world"]) @@ -205,6 +213,12 @@ def test_token0_has_sent_start_true(): assert doc[1].is_sent_start is None assert not doc.is_sentenced +def test_tokenlast_has_sent_end_true(): + doc = Doc(Vocab(), words=["hello", "world"]) + assert doc[0].is_sent_end is None + assert doc[1].is_sent_end is True + assert not doc.is_sentenced + def test_token_api_conjuncts_chain(en_vocab): words = "The boy and the girl and the man went .".split() diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index d690958cc..7e58b3e98 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -14,7 +14,9 @@ def test_sentencizer(en_vocab): doc = sentencizer(doc) assert doc.is_sentenced sent_starts = [t.is_sent_start for t in doc] + sent_ends = [t.is_sent_end for t in doc] assert sent_starts == [True, False, True, False, False, False, False] + assert sent_ends == [False, True, False, False, False, False, True] assert len(list(doc.sents)) == 2 @@ -46,13 +48,14 @@ def test_sentencizer_empty_docs(): @pytest.mark.parametrize( - "words,sent_starts,n_sents", + "words,sent_starts,sent_ends,n_sents", [ # The expected result here is that the duplicate punctuation gets merged # onto the same sentence and no one-token sentence is created for them. ( ["Hello", "!", ".", "Test", ".", ".", "ok"], [True, False, False, True, False, False, True], + [False, False, True, False, False, True, True], 3, ), # We also want to make sure ¡ and ¿ aren't treated as sentence end @@ -60,32 +63,36 @@ def test_sentencizer_empty_docs(): ( ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"], [True, False, False, False, True, False, False, False, False, False], + [False, False, False, True, False, False, False, False, False, True], 2, ), # The Token.is_punct check ensures that quotes are handled as well ( ['"', "Nice", "!", '"', "I", "am", "happy", "."], [True, False, False, False, True, False, False, False], + [False, False, False, True, False, False, False, True], 2, ), ], ) -def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents): +def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer() doc = sentencizer(doc) assert doc.is_sentenced assert [t.is_sent_start for t in doc] == sent_starts + assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents @pytest.mark.parametrize( - "punct_chars,words,sent_starts,n_sents", + "punct_chars,words,sent_starts,sent_ends,n_sents", [ ( ["~", "?"], ["Hello", "world", "~", "A", ".", "B", "."], [True, False, False, True, False, False, False], + [False, False, True, False, False, False, True], 2, ), # Even thought it's not common, the punct_chars should be able to @@ -94,16 +101,18 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents): [".", "ö"], ["Hello", ".", "Test", "ö", "Ok", "."], [True, False, True, False, True, False], + [False, True, False, True, False, True], 3, ), ], ) -def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents): +def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=punct_chars) doc = 
sentencizer(doc) assert doc.is_sentenced assert [t.is_sent_start for t in doc] == sent_starts + assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8019e3b4f..194f16c5a 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -493,6 +493,28 @@ cdef class Token: else: raise ValueError(Errors.E044.format(value=value)) + property is_sent_end: + """A boolean value indicating whether the token ends a sentence. + `None` if unknown. Defaults to `True` for the last token in the `Doc`. + + RETURNS (bool / None): Whether the token ends a sentence. + None if unknown. + + DOCS: https://spacy.io/api/token#is_sent_end + """ + def __get__(self): + if self.i + 1 == len(self.doc): + return True + elif self.doc[self.i+1].is_sent_start == None: + return None + elif self.doc[self.i+1].is_sent_start == True: + return True + else: + return False + + def __set__(self, value): + raise ValueError(Errors.E196) + @property def lefts(self): """The leftward immediate children of the word, in the syntactic diff --git a/website/docs/api/token.md b/website/docs/api/token.md index c30c01c20..7280ac796 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -352,7 +352,22 @@ property to `0` for the first word of the document. + assert doc[4].is_sent_start == True ``` -
+## Token.is_sent_end {#is_sent_end tag="property" new="2"} + +A boolean value indicating whether the token ends a sentence. `None` if +unknown. Defaults to `True` for the last token in the `Doc`. + +> #### Example +> +> ```python +> doc = nlp("Give it back! He pleaded.") +> assert doc[3].is_sent_end +> assert not doc[4].is_sent_end +> ``` + +| Name | Type | Description | +| ----------- | ---- | ------------------------------------ | +| **RETURNS** | bool | Whether the token ends a sentence. | ## Token.has_vector {#has_vector tag="property" model="vectors"} From f67343295de38be3f88360f009e99de7eb2e199c Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 29 Apr 2020 12:53:53 +0200 Subject: [PATCH 097/105] Update NEL examples and documentation (#5370) * simplify creation of KB by skipping dim reduction * small fixes to train EL example script * add KB creation and NEL training example scripts to example section * update descriptions of example scripts in the documentation * moving wiki_entity_linking folder from bin to projects * remove test for wiki NEL functionality that is being moved --- bin/wiki_entity_linking/README.md | 37 -- bin/wiki_entity_linking/__init__.py | 12 - .../entity_linker_evaluation.py | 204 ------- bin/wiki_entity_linking/kb_creator.py | 161 ----- bin/wiki_entity_linking/train_descriptions.py | 152 ----- bin/wiki_entity_linking/wiki_io.py | 127 ---- bin/wiki_entity_linking/wiki_namespaces.py | 128 ---- .../wikidata_pretrain_kb.py | 179 ------ bin/wiki_entity_linking/wikidata_processor.py | 154 ----- .../wikidata_train_entity_linker.py | 172 ------ .../wikipedia_processor.py | 565 ------------------ .../training/{pretrain_kb.py => create_kb.py} | 43 +- examples/training/train_entity_linker.py | 10 +- spacy/tests/regression/test_issue5314.py | 18 - website/docs/usage/examples.md | 21 + website/docs/usage/linguistic-features.md | 4 +- website/docs/usage/training.md | 22 +- 17 files changed, 50 insertions(+), 1959 deletions(-) delete mode 100644 bin/wiki_entity_linking/README.md delete mode 100644 bin/wiki_entity_linking/__init__.py delete mode 100644 bin/wiki_entity_linking/entity_linker_evaluation.py delete mode 100644 bin/wiki_entity_linking/kb_creator.py delete mode 100644 bin/wiki_entity_linking/train_descriptions.py delete mode 100644 bin/wiki_entity_linking/wiki_io.py delete mode 100644 bin/wiki_entity_linking/wiki_namespaces.py delete mode 100644 bin/wiki_entity_linking/wikidata_pretrain_kb.py delete mode 100644 bin/wiki_entity_linking/wikidata_processor.py delete mode 100644 bin/wiki_entity_linking/wikidata_train_entity_linker.py delete mode 100644 bin/wiki_entity_linking/wikipedia_processor.py rename examples/training/{pretrain_kb.py => create_kb.py} (75%) delete mode 100644 spacy/tests/regression/test_issue5314.py diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md deleted file mode 100644 index 4e4af5c21..000000000 --- a/bin/wiki_entity_linking/README.md +++ /dev/null @@ -1,37 +0,0 @@ -## Entity Linking with Wikipedia and Wikidata - -### Step 1: Create a Knowledge Base (KB) and training data - -Run `wikidata_pretrain_kb.py` -* This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file** - * WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/ - * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language) -* You can set the filtering 
parameters for KB construction: - * `max_per_alias` (`-a`): (max) number of candidate entities in the KB per alias/synonym - * `min_freq` (`-f`): threshold of number of times an entity should occur in the corpus to be included in the KB - * `min_pair` (`-c`): threshold of number of times an entity+alias combination should occur in the corpus to be included in the KB -* Further parameters to set: - * `descriptions_from_wikipedia` (`-wp`): whether to parse descriptions from Wikipedia (`True`) or Wikidata (`False`) - * `entity_vector_length` (`-v`): length of the pre-trained entity description vectors - * `lang` (`-la`): language for which to fetch Wikidata information (as the dump contains all languages) - -Quick testing and rerunning: -* When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything. - * e.g. set `-lt 20000 -lp 2000 -lw 3000 -f 1` -* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed. - - -### Step 2: Train an Entity Linking model - -Run `wikidata_train_entity_linker.py` -* This takes the **KB directory** produced by Step 1, and trains an **Entity Linking model** -* Specify the output directory (`-o`) in which the final, trained model will be saved -* You can set the learning parameters for the EL training: - * `epochs` (`-e`): number of training iterations - * `dropout` (`-p`): dropout rate - * `lr` (`-n`): learning rate - * `l2` (`-r`): L2 regularization -* Specify the number of training and dev testing articles with `train_articles` (`-t`) and `dev_articles` (`-d`) respectively - * If not specified, the full dataset will be processed - this may take a LONG time ! -* Further parameters to set: - * `labels_discard` (`-l`): NER label types to discard during training diff --git a/bin/wiki_entity_linking/__init__.py b/bin/wiki_entity_linking/__init__.py deleted file mode 100644 index de486bbcf..000000000 --- a/bin/wiki_entity_linking/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -TRAINING_DATA_FILE = "gold_entities.jsonl" -KB_FILE = "kb" -KB_MODEL_DIR = "nlp_kb" -OUTPUT_MODEL_DIR = "nlp" - -PRIOR_PROB_PATH = "prior_prob.csv" -ENTITY_DEFS_PATH = "entity_defs.csv" -ENTITY_FREQ_PATH = "entity_freq.csv" -ENTITY_ALIAS_PATH = "entity_alias.csv" -ENTITY_DESCR_PATH = "entity_descriptions.csv" - -LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' diff --git a/bin/wiki_entity_linking/entity_linker_evaluation.py b/bin/wiki_entity_linking/entity_linker_evaluation.py deleted file mode 100644 index 2aeffbfc2..000000000 --- a/bin/wiki_entity_linking/entity_linker_evaluation.py +++ /dev/null @@ -1,204 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import logging -import random -from tqdm import tqdm -from collections import defaultdict - -logger = logging.getLogger(__name__) - - -class Metrics(object): - true_pos = 0 - false_pos = 0 - false_neg = 0 - - def update_results(self, true_entity, candidate): - candidate_is_correct = true_entity == candidate - - # Assume that we have no labeled negatives in the data (i.e. cases where true_entity is "NIL") - # Therefore, if candidate_is_correct then we have a true positive and never a true negative. - self.true_pos += candidate_is_correct - self.false_neg += not candidate_is_correct - if candidate and candidate not in {"", "NIL"}: - # A wrong prediction (e.g. Q42 != Q3) counts both as a FP as well as a FN. 
- self.false_pos += not candidate_is_correct - - def calculate_precision(self): - if self.true_pos == 0: - return 0.0 - else: - return self.true_pos / (self.true_pos + self.false_pos) - - def calculate_recall(self): - if self.true_pos == 0: - return 0.0 - else: - return self.true_pos / (self.true_pos + self.false_neg) - - def calculate_fscore(self): - p = self.calculate_precision() - r = self.calculate_recall() - if p + r == 0: - return 0.0 - else: - return 2 * p * r / (p + r) - - -class EvaluationResults(object): - def __init__(self): - self.metrics = Metrics() - self.metrics_by_label = defaultdict(Metrics) - - def update_metrics(self, ent_label, true_entity, candidate): - self.metrics.update_results(true_entity, candidate) - self.metrics_by_label[ent_label].update_results(true_entity, candidate) - - def report_metrics(self, model_name): - model_str = model_name.title() - recall = self.metrics.calculate_recall() - precision = self.metrics.calculate_precision() - fscore = self.metrics.calculate_fscore() - return ( - "{}: ".format(model_str) - + "F-score = {} | ".format(round(fscore, 3)) - + "Recall = {} | ".format(round(recall, 3)) - + "Precision = {} | ".format(round(precision, 3)) - + "F-score by label = {}".format( - {k: v.calculate_fscore() for k, v in sorted(self.metrics_by_label.items())} - ) - ) - - -class BaselineResults(object): - def __init__(self): - self.random = EvaluationResults() - self.prior = EvaluationResults() - self.oracle = EvaluationResults() - - def report_performance(self, model): - results = getattr(self, model) - return results.report_metrics(model) - - def update_baselines( - self, - true_entity, - ent_label, - random_candidate, - prior_candidate, - oracle_candidate, - ): - self.oracle.update_metrics(ent_label, true_entity, oracle_candidate) - self.prior.update_metrics(ent_label, true_entity, prior_candidate) - self.random.update_metrics(ent_label, true_entity, random_candidate) - - -def measure_performance(dev_data, kb, el_pipe, baseline=True, context=True, dev_limit=None): - counts = dict() - baseline_results = BaselineResults() - context_results = EvaluationResults() - combo_results = EvaluationResults() - - for doc, gold in tqdm(dev_data, total=dev_limit, leave=False, desc='Processing dev data'): - if len(doc) > 0: - correct_ents = dict() - for entity, kb_dict in gold.links.items(): - start, end = entity - for gold_kb, value in kb_dict.items(): - if value: - # only evaluating on positive examples - offset = _offset(start, end) - correct_ents[offset] = gold_kb - - if baseline: - _add_baseline(baseline_results, counts, doc, correct_ents, kb) - - if context: - # using only context - el_pipe.cfg["incl_context"] = True - el_pipe.cfg["incl_prior"] = False - _add_eval_result(context_results, doc, correct_ents, el_pipe) - - # measuring combined accuracy (prior + context) - el_pipe.cfg["incl_context"] = True - el_pipe.cfg["incl_prior"] = True - _add_eval_result(combo_results, doc, correct_ents, el_pipe) - - if baseline: - logger.info("Counts: {}".format({k: v for k, v in sorted(counts.items())})) - logger.info(baseline_results.report_performance("random")) - logger.info(baseline_results.report_performance("prior")) - logger.info(baseline_results.report_performance("oracle")) - - if context: - logger.info(context_results.report_metrics("context only")) - logger.info(combo_results.report_metrics("context and prior")) - - -def _add_eval_result(results, doc, correct_ents, el_pipe): - """ - Evaluate the ent.kb_id_ annotations against the gold standard. 
- Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL. - """ - try: - doc = el_pipe(doc) - for ent in doc.ents: - ent_label = ent.label_ - start = ent.start_char - end = ent.end_char - offset = _offset(start, end) - gold_entity = correct_ents.get(offset, None) - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - pred_entity = ent.kb_id_ - results.update_metrics(ent_label, gold_entity, pred_entity) - - except Exception as e: - logging.error("Error assessing accuracy " + str(e)) - - -def _add_baseline(baseline_results, counts, doc, correct_ents, kb): - """ - Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound. - Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL. - """ - for ent in doc.ents: - ent_label = ent.label_ - start = ent.start_char - end = ent.end_char - offset = _offset(start, end) - gold_entity = correct_ents.get(offset, None) - - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - candidates = kb.get_candidates(ent.text) - oracle_candidate = "" - prior_candidate = "" - random_candidate = "" - if candidates: - scores = [] - - for c in candidates: - scores.append(c.prior_prob) - if c.entity_ == gold_entity: - oracle_candidate = c.entity_ - - best_index = scores.index(max(scores)) - prior_candidate = candidates[best_index].entity_ - random_candidate = random.choice(candidates).entity_ - - current_count = counts.get(ent_label, 0) - counts[ent_label] = current_count+1 - - baseline_results.update_baselines( - gold_entity, - ent_label, - random_candidate, - prior_candidate, - oracle_candidate, - ) - - -def _offset(start, end): - return "{}_{}".format(start, end) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py deleted file mode 100644 index 7778fc701..000000000 --- a/bin/wiki_entity_linking/kb_creator.py +++ /dev/null @@ -1,161 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import logging - -from spacy.kb import KnowledgeBase - -from bin.wiki_entity_linking.train_descriptions import EntityEncoder -from bin.wiki_entity_linking import wiki_io as io - - -logger = logging.getLogger(__name__) - - -def create_kb( - nlp, - max_entities_per_alias, - min_entity_freq, - min_occ, - entity_def_path, - entity_descr_path, - entity_alias_path, - entity_freq_path, - prior_prob_path, - entity_vector_length, -): - # Create the knowledge base from Wikidata entries - kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length) - entity_list, filtered_title_to_id = _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length) - _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path) - return kb - - -def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length): - # read the mappings from file - title_to_id = io.read_title_to_id(entity_def_path) - id_to_descr = io.read_id_to_descr(entity_descr_path) - - # check the length of the nlp vectors - if "vectors" in nlp.meta and nlp.vocab.vectors.size: - input_dim = nlp.vocab.vectors_length - logger.info("Loaded pretrained vectors of size %s" % input_dim) - else: - raise ValueError( - "The `nlp` object should have 
access to pretrained word vectors, " - " cf. https://spacy.io/usage/models#languages." - ) - - logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq)) - entity_frequencies = io.read_entity_to_count(entity_freq_path) - # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise - filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities( - title_to_id, - id_to_descr, - entity_frequencies, - min_entity_freq - ) - logger.info("Kept {} entities from the set of {}".format(len(description_list), len(title_to_id.keys()))) - - logger.info("Training entity encoder") - encoder = EntityEncoder(nlp, input_dim, entity_vector_length) - encoder.train(description_list=description_list, to_print=True) - - logger.info("Getting entity embeddings") - embeddings = encoder.apply_encoder(description_list) - - logger.info("Adding {} entities".format(len(entity_list))) - kb.set_entities( - entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings - ) - return entity_list, filtered_title_to_id - - -def _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path): - logger.info("Adding aliases from Wikipedia and Wikidata") - _add_aliases( - kb, - entity_list=entity_list, - title_to_id=filtered_title_to_id, - max_entities_per_alias=max_entities_per_alias, - min_occ=min_occ, - prior_prob_path=prior_prob_path, - ) - - -def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies, - min_entity_freq: int = 10): - filtered_title_to_id = dict() - entity_list = [] - description_list = [] - frequency_list = [] - for title, entity in title_to_id.items(): - freq = entity_frequencies.get(title, 0) - desc = id_to_descr.get(entity, None) - if desc and freq > min_entity_freq: - entity_list.append(entity) - description_list.append(desc) - frequency_list.append(freq) - filtered_title_to_id[title] = entity - return filtered_title_to_id, entity_list, description_list, frequency_list - - -def _add_aliases(kb, entity_list, title_to_id, max_entities_per_alias, min_occ, prior_prob_path): - wp_titles = title_to_id.keys() - - # adding aliases with prior probabilities - # we can read this file sequentially, it's sorted by alias, and then by count - logger.info("Adding WP aliases") - with prior_prob_path.open("r", encoding="utf8") as prior_file: - # skip header - prior_file.readline() - line = prior_file.readline() - previous_alias = None - total_count = 0 - counts = [] - entities = [] - while line: - splits = line.replace("\n", "").split(sep="|") - new_alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - if new_alias != previous_alias and previous_alias: - # done reading the previous alias --> output - if len(entities) > 0: - selected_entities = [] - prior_probs = [] - for ent_count, ent_string in zip(counts, entities): - if ent_string in wp_titles: - wd_id = title_to_id[ent_string] - p_entity_givenalias = ent_count / total_count - selected_entities.append(wd_id) - prior_probs.append(p_entity_givenalias) - - if selected_entities: - try: - kb.add_alias( - alias=previous_alias, - entities=selected_entities, - probabilities=prior_probs, - ) - except ValueError as e: - logger.error(e) - total_count = 0 - counts = [] - entities = [] - - total_count += count - - if len(entities) < max_entities_per_alias and count >= min_occ: - counts.append(count) - entities.append(entity) - previous_alias = new_alias - - line = prior_file.readline() 
- - -def read_kb(nlp, kb_file): - kb = KnowledgeBase(vocab=nlp.vocab) - kb.load_bulk(kb_file) - return kb diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py deleted file mode 100644 index af08d6b8f..000000000 --- a/bin/wiki_entity_linking/train_descriptions.py +++ /dev/null @@ -1,152 +0,0 @@ -# coding: utf-8 -from random import shuffle - -import logging -import numpy as np - -from spacy._ml import zero_init, create_default_optimizer -from spacy.cli.pretrain import get_cossim_loss - -from thinc.v2v import Model -from thinc.api import chain -from thinc.neural._classes.affine import Affine - -logger = logging.getLogger(__name__) - - -class EntityEncoder: - """ - Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D). - This entity vector will be stored in the KB, for further downstream use in the entity model. - """ - - DROP = 0 - BATCH_SIZE = 1000 - - # Set min. acceptable loss to avoid a 'mean of empty slice' warning by numpy - MIN_LOSS = 0.01 - - # Reasonable default to stop training when things are not improving - MAX_NO_IMPROVEMENT = 20 - - def __init__(self, nlp, input_dim, desc_width, epochs=5): - self.nlp = nlp - self.input_dim = input_dim - self.desc_width = desc_width - self.epochs = epochs - - def apply_encoder(self, description_list): - if self.encoder is None: - raise ValueError("Can not apply encoder before training it") - - batch_size = 100000 - - start = 0 - stop = min(batch_size, len(description_list)) - encodings = [] - - while start < len(description_list): - docs = list(self.nlp.pipe(description_list[start:stop])) - doc_embeddings = [self._get_doc_embedding(doc) for doc in docs] - enc = self.encoder(np.asarray(doc_embeddings)) - encodings.extend(enc.tolist()) - - start = start + batch_size - stop = min(stop + batch_size, len(description_list)) - logger.info("Encoded: {} entities".format(stop)) - - return encodings - - def train(self, description_list, to_print=False): - processed, loss = self._train_model(description_list) - if to_print: - logger.info( - "Trained entity descriptions on {} ".format(processed) + - "(non-unique) descriptions across {} ".format(self.epochs) + - "epochs" - ) - logger.info("Final loss: {}".format(loss)) - - def _train_model(self, description_list): - best_loss = 1.0 - iter_since_best = 0 - self._build_network(self.input_dim, self.desc_width) - - processed = 0 - loss = 1 - # copy this list so that shuffling does not affect other functions - descriptions = description_list.copy() - to_continue = True - - for i in range(self.epochs): - shuffle(descriptions) - - batch_nr = 0 - start = 0 - stop = min(self.BATCH_SIZE, len(descriptions)) - - while to_continue and start < len(descriptions): - batch = [] - for descr in descriptions[start:stop]: - doc = self.nlp(descr) - doc_vector = self._get_doc_embedding(doc) - batch.append(doc_vector) - - loss = self._update(batch) - if batch_nr % 25 == 0: - logger.info("loss: {} ".format(loss)) - processed += len(batch) - - # in general, continue training if we haven't reached our ideal min yet - to_continue = loss > self.MIN_LOSS - - # store the best loss and track how long it's been - if loss < best_loss: - best_loss = loss - iter_since_best = 0 - else: - iter_since_best += 1 - - # stop learning if we haven't seen improvement since the last few iterations - if iter_since_best > self.MAX_NO_IMPROVEMENT: - to_continue = False - - batch_nr += 1 - start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, 
len(descriptions)) - - return processed, loss - - @staticmethod - def _get_doc_embedding(doc): - indices = np.zeros((len(doc),), dtype="i") - for i, word in enumerate(doc): - if word.orth in doc.vocab.vectors.key2row: - indices[i] = doc.vocab.vectors.key2row[word.orth] - else: - indices[i] = 0 - word_vectors = doc.vocab.vectors.data[indices] - doc_vector = np.mean(word_vectors, axis=0) - return doc_vector - - def _build_network(self, orig_width, hidden_with): - with Model.define_operators({">>": chain}): - # very simple encoder-decoder model - self.encoder = Affine(hidden_with, orig_width) - self.model = self.encoder >> zero_init( - Affine(orig_width, hidden_with, drop_factor=0.0) - ) - self.sgd = create_default_optimizer(self.model.ops) - - def _update(self, vectors): - predictions, bp_model = self.model.begin_update( - np.asarray(vectors), drop=self.DROP - ) - loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors)) - bp_model(d_scores, sgd=self.sgd) - return loss / len(vectors) - - @staticmethod - def _get_loss(golds, scores): - loss, gradients = get_cossim_loss(scores, golds) - return loss, gradients diff --git a/bin/wiki_entity_linking/wiki_io.py b/bin/wiki_entity_linking/wiki_io.py deleted file mode 100644 index 43ae87f0f..000000000 --- a/bin/wiki_entity_linking/wiki_io.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import sys -import csv - -# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/ -csv.field_size_limit(min(sys.maxsize, 2147483646)) - -""" This class provides reading/writing methods for temp files """ - - -# Entity definition: WP title -> WD ID # -def write_title_to_id(entity_def_output, title_to_id): - with entity_def_output.open("w", encoding="utf8") as id_file: - id_file.write("WP_title" + "|" + "WD_id" + "\n") - for title, qid in title_to_id.items(): - id_file.write(title + "|" + str(qid) + "\n") - - -def read_title_to_id(entity_def_output): - title_to_id = dict() - with entity_def_output.open("r", encoding="utf8") as id_file: - csvreader = csv.reader(id_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - title_to_id[row[0]] = row[1] - return title_to_id - - -# Entity aliases from WD: WD ID -> WD alias # -def write_id_to_alias(entity_alias_path, id_to_alias): - with entity_alias_path.open("w", encoding="utf8") as alias_file: - alias_file.write("WD_id" + "|" + "alias" + "\n") - for qid, alias_list in id_to_alias.items(): - for alias in alias_list: - alias_file.write(str(qid) + "|" + alias + "\n") - - -def read_id_to_alias(entity_alias_path): - id_to_alias = dict() - with entity_alias_path.open("r", encoding="utf8") as alias_file: - csvreader = csv.reader(alias_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - qid = row[0] - alias = row[1] - alias_list = id_to_alias.get(qid, []) - alias_list.append(alias) - id_to_alias[qid] = alias_list - return id_to_alias - - -def read_alias_to_id_generator(entity_alias_path): - """ Read (aliases, qid) tuples """ - - with entity_alias_path.open("r", encoding="utf8") as alias_file: - csvreader = csv.reader(alias_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - qid = row[0] - alias = row[1] - yield alias, qid - - -# Entity descriptions from WD: WD ID -> WD alias # -def write_id_to_descr(entity_descr_output, id_to_descr): - with entity_descr_output.open("w", encoding="utf8") as descr_file: - descr_file.write("WD_id" + "|" + "description" 
+ "\n") - for qid, descr in id_to_descr.items(): - descr_file.write(str(qid) + "|" + descr + "\n") - - -def read_id_to_descr(entity_desc_path): - id_to_desc = dict() - with entity_desc_path.open("r", encoding="utf8") as descr_file: - csvreader = csv.reader(descr_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - id_to_desc[row[0]] = row[1] - return id_to_desc - - -# Entity counts from WP: WP title -> count # -def write_entity_to_count(prior_prob_input, count_output): - # Write entity counts for quick access later - entity_to_count = dict() - total_count = 0 - - with prior_prob_input.open("r", encoding="utf8") as prior_file: - # skip header - prior_file.readline() - line = prior_file.readline() - - while line: - splits = line.replace("\n", "").split(sep="|") - # alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - current_count = entity_to_count.get(entity, 0) - entity_to_count[entity] = current_count + count - - total_count += count - - line = prior_file.readline() - - with count_output.open("w", encoding="utf8") as entity_file: - entity_file.write("entity" + "|" + "count" + "\n") - for entity, count in entity_to_count.items(): - entity_file.write(entity + "|" + str(count) + "\n") - - -def read_entity_to_count(count_input): - entity_to_count = dict() - with count_input.open("r", encoding="utf8") as csvfile: - csvreader = csv.reader(csvfile, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - entity_to_count[row[0]] = int(row[1]) - - return entity_to_count diff --git a/bin/wiki_entity_linking/wiki_namespaces.py b/bin/wiki_entity_linking/wiki_namespaces.py deleted file mode 100644 index e8f099ccd..000000000 --- a/bin/wiki_entity_linking/wiki_namespaces.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# List of meta pages in Wikidata, should be kept out of the Knowledge base -WD_META_ITEMS = [ - "Q163875", - "Q191780", - "Q224414", - "Q4167836", - "Q4167410", - "Q4663903", - "Q11266439", - "Q13406463", - "Q15407973", - "Q18616576", - "Q19887878", - "Q22808320", - "Q23894233", - "Q33120876", - "Q42104522", - "Q47460393", - "Q64875536", - "Q66480449", -] - - -# TODO: add more cases from non-English WP's - -# List of prefixes that refer to Wikipedia "file" pages -WP_FILE_NAMESPACE = ["Bestand", "File"] - -# List of prefixes that refer to Wikipedia "category" pages -WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"] - -# List of prefixes that refer to Wikipedia "meta" pages -# these will/should be matched ignoring case -WP_META_NAMESPACE = ( - WP_FILE_NAMESPACE - + WP_CATEGORY_NAMESPACE - + [ - "b", - "betawikiversity", - "Book", - "c", - "Commons", - "d", - "dbdump", - "download", - "Draft", - "Education", - "Foundation", - "Gadget", - "Gadget definition", - "Gebruiker", - "gerrit", - "Help", - "Image", - "Incubator", - "m", - "mail", - "mailarchive", - "media", - "MediaWiki", - "MediaWiki talk", - "Mediawikiwiki", - "MediaZilla", - "Meta", - "Metawikipedia", - "Module", - "mw", - "n", - "nost", - "oldwikisource", - "otrs", - "OTRSwiki", - "Overleg gebruiker", - "outreach", - "outreachwiki", - "Portal", - "phab", - "Phabricator", - "Project", - "q", - "quality", - "rev", - "s", - "spcom", - "Special", - "species", - "Strategy", - "sulutil", - "svn", - "Talk", - "Template", - "Template talk", - "Testwiki", - "ticket", - "TimedText", - "Toollabs", - "tools", - "tswiki", - "User", - "User talk", - "v", - "voy", - "w", - "Wikibooks", - "Wikidata", - "wikiHow", - "Wikinvest", - 
"wikilivres", - "Wikimedia", - "Wikinews", - "Wikipedia", - "Wikipedia talk", - "Wikiquote", - "Wikisource", - "Wikispecies", - "Wikitech", - "Wikiversity", - "Wikivoyage", - "wikt", - "wiktionary", - "wmf", - "wmania", - "WP", - ] -) diff --git a/bin/wiki_entity_linking/wikidata_pretrain_kb.py b/bin/wiki_entity_linking/wikidata_pretrain_kb.py deleted file mode 100644 index 003074feb..000000000 --- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py +++ /dev/null @@ -1,179 +0,0 @@ -# coding: utf-8 -"""Script to process Wikipedia and Wikidata dumps and create a knowledge base (KB) -with specific parameters. Intermediate files are written to disk. - -Running the full pipeline on a standard laptop, may take up to 13 hours of processing. -Use the -p, -d and -s options to speed up processing using the intermediate files -from a previous run. - -For the Wikidata dump: get the latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ -For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2 -from https://dumps.wikimedia.org/enwiki/latest/ - -""" -from __future__ import unicode_literals - -import logging -from pathlib import Path -import plac - -from bin.wiki_entity_linking import wikipedia_processor as wp, wikidata_processor as wd -from bin.wiki_entity_linking import wiki_io as io -from bin.wiki_entity_linking import kb_creator -from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_FILE, ENTITY_DESCR_PATH, KB_MODEL_DIR, LOG_FORMAT -from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH, ENTITY_ALIAS_PATH -import spacy -from bin.wiki_entity_linking.kb_creator import read_kb - -logger = logging.getLogger(__name__) - - -@plac.annotations( - wd_json=("Path to the downloaded WikiData JSON dump.", "positional", None, Path), - wp_xml=("Path to the downloaded Wikipedia XML dump.", "positional", None, Path), - output_dir=("Output directory", "positional", None, Path), - model=("Model name or path, should include pretrained vectors.", "positional", None, str), - max_per_alias=("Max. # entities per alias (default 10)", "option", "a", int), - min_freq=("Min. count of an entity in the corpus (default 20)", "option", "f", int), - min_pair=("Min. count of entity-alias pairs (default 5)", "option", "c", int), - entity_vector_length=("Length of entity vectors (default 64)", "option", "v", int), - loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path), - loc_entity_defs=("Location to file with entity definitions", "option", "d", Path), - loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path), - descr_from_wp=("Flag for using descriptions from WP instead of WD (default False)", "flag", "wp"), - limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int), - limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int), - limit_wd=("Threshold to limit lines read from WD", "option", "lw", int), - lang=("Optional language for which to get Wikidata titles. 
Defaults to 'en'", "option", "la", str), -) -def main( - wd_json, - wp_xml, - output_dir, - model, - max_per_alias=10, - min_freq=20, - min_pair=5, - entity_vector_length=64, - loc_prior_prob=None, - loc_entity_defs=None, - loc_entity_alias=None, - loc_entity_desc=None, - descr_from_wp=False, - limit_prior=None, - limit_train=None, - limit_wd=None, - lang="en", -): - entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH - entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH - entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH - entity_freq_path = output_dir / ENTITY_FREQ_PATH - prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH - training_entities_path = output_dir / TRAINING_DATA_FILE - kb_path = output_dir / KB_FILE - - logger.info("Creating KB with Wikipedia and WikiData") - - # STEP 0: set up IO - if not output_dir.exists(): - output_dir.mkdir(parents=True) - - # STEP 1: Load the NLP object - logger.info("STEP 1: Loading NLP model {}".format(model)) - nlp = spacy.load(model) - - # check the length of the nlp vectors - if "vectors" not in nlp.meta or not nlp.vocab.vectors.size: - raise ValueError( - "The `nlp` object should have access to pretrained word vectors, " - " cf. https://spacy.io/usage/models#languages." - ) - - # STEP 2: create prior probabilities from WP - if not prior_prob_path.exists(): - # It takes about 2h to process 1000M lines of Wikipedia XML dump - logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path)) - if limit_prior is not None: - logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior)) - wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior) - else: - logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path)) - - # STEP 3: calculate entity frequencies - if not entity_freq_path.exists(): - logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path)) - io.write_entity_to_count(prior_prob_path, entity_freq_path) - else: - logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path)) - - # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file - if (not entity_defs_path.exists()) or (not descr_from_wp and not entity_descr_path.exists()): - # It takes about 10h to process 55M lines of Wikidata JSON dump - logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path)) - if limit_wd is not None: - logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd)) - title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json( - wd_json, - limit_wd, - to_print=False, - lang=lang, - parse_descr=(not descr_from_wp), - ) - io.write_title_to_id(entity_defs_path, title_to_id) - - logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path)) - io.write_id_to_alias(entity_alias_path, id_to_alias) - - if not descr_from_wp: - logger.info("STEP 4c: Writing Wikidata entity descriptions to {}".format(entity_descr_path)) - io.write_id_to_descr(entity_descr_path, id_to_descr) - else: - logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path)) - logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path)) - if not descr_from_wp: - logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path)) - - # STEP 5: 
Getting gold entities from Wikipedia - if (not training_entities_path.exists()) or (descr_from_wp and not entity_descr_path.exists()): - logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path)) - if limit_train is not None: - logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train)) - wp.create_training_and_desc(wp_xml, entity_defs_path, entity_descr_path, - training_entities_path, descr_from_wp, limit_train) - if descr_from_wp: - logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path)) - else: - logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path)) - if descr_from_wp: - logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path)) - - # STEP 6: creating the actual KB - # It takes ca. 30 minutes to pretrain the entity embeddings - if not kb_path.exists(): - logger.info("STEP 6: Creating the KB at {}".format(kb_path)) - kb = kb_creator.create_kb( - nlp=nlp, - max_entities_per_alias=max_per_alias, - min_entity_freq=min_freq, - min_occ=min_pair, - entity_def_path=entity_defs_path, - entity_descr_path=entity_descr_path, - entity_alias_path=entity_alias_path, - entity_freq_path=entity_freq_path, - prior_prob_path=prior_prob_path, - entity_vector_length=entity_vector_length, - ) - kb.dump(kb_path) - logger.info("kb entities: {}".format(kb.get_size_entities())) - logger.info("kb aliases: {}".format(kb.get_size_aliases())) - nlp.to_disk(output_dir / KB_MODEL_DIR) - else: - logger.info("STEP 6: KB already exists at {}".format(kb_path)) - - logger.info("Done!") - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) - plac.call(main) diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py deleted file mode 100644 index 8a070f567..000000000 --- a/bin/wiki_entity_linking/wikidata_processor.py +++ /dev/null @@ -1,154 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import bz2 -import json -import logging - -from bin.wiki_entity_linking.wiki_namespaces import WD_META_ITEMS - -logger = logging.getLogger(__name__) - - -def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descr=True): - # Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines. 
- # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ - - site_filter = '{}wiki'.format(lang) - - # filter: currently defined as OR: one hit suffices to be removed from further processing - exclude_list = WD_META_ITEMS - - # punctuation - exclude_list.extend(["Q1383557", "Q10617810"]) - - # letters etc - exclude_list.extend(["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"]) - - neg_prop_filter = { - 'P31': exclude_list, # instance of - 'P279': exclude_list # subclass - } - - title_to_id = dict() - id_to_descr = dict() - id_to_alias = dict() - - # parse appropriate fields - depending on what we need in the KB - parse_properties = False - parse_sitelinks = True - parse_labels = False - parse_aliases = True - parse_claims = True - - with bz2.open(wikidata_file, mode='rb') as file: - for cnt, line in enumerate(file): - if limit and cnt >= limit: - break - if cnt % 500000 == 0 and cnt > 0: - logger.info("processed {} lines of WikiData JSON dump".format(cnt)) - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - obj = json.loads(clean_line) - entry_type = obj["type"] - - if entry_type == "item": - keep = True - - claims = obj["claims"] - if parse_claims: - for prop, value_set in neg_prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - for cp in claim_property: - cp_id = ( - cp["mainsnak"] - .get("datavalue", {}) - .get("value", {}) - .get("id") - ) - cp_rank = cp["rank"] - if cp_rank != "deprecated" and cp_id in value_set: - keep = False - - if keep: - unique_id = obj["id"] - - if to_print: - print("ID:", unique_id) - print("type:", entry_type) - - # parsing all properties that refer to other entities - if parse_properties: - for prop, claim_property in claims.items(): - cp_dicts = [ - cp["mainsnak"]["datavalue"].get("value") - for cp in claim_property - if cp["mainsnak"].get("datavalue") - ] - cp_values = [ - cp_dict.get("id") - for cp_dict in cp_dicts - if isinstance(cp_dict, dict) - if cp_dict.get("id") is not None - ] - if cp_values: - if to_print: - print("prop:", prop, cp_values) - - found_link = False - if parse_sitelinks: - site_value = obj["sitelinks"].get(site_filter, None) - if site_value: - site = site_value["title"] - if to_print: - print(site_filter, ":", site) - title_to_id[site] = unique_id - found_link = True - - if parse_labels: - labels = obj["labels"] - if labels: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print( - "label (" + lang + "):", lang_label["value"] - ) - - if found_link and parse_descr: - descriptions = obj["descriptions"] - if descriptions: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print( - "description (" + lang + "):", - lang_descr["value"], - ) - id_to_descr[unique_id] = lang_descr["value"] - - if parse_aliases: - aliases = obj["aliases"] - if aliases: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print( - "alias (" + lang + "):", item["value"] - ) - alias_list = id_to_alias.get(unique_id, []) - alias_list.append(item["value"]) - id_to_alias[unique_id] = alias_list - - if to_print: - print() - - # log final number of lines processed - logger.info("Finished. 
Processed {} lines of WikiData JSON dump".format(cnt)) - return title_to_id, id_to_descr, id_to_alias - - diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py deleted file mode 100644 index 54f00fc6f..000000000 --- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py +++ /dev/null @@ -1,172 +0,0 @@ -# coding: utf-8 -"""Script that takes a previously created Knowledge Base and trains an entity linking -pipeline. The provided KB directory should hold the kb, the original nlp object and -its vocab used to create the KB, and a few auxiliary files such as the entity definitions, -as created by the script `wikidata_create_kb`. - -For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2 -from https://dumps.wikimedia.org/enwiki/latest/ -""" -from __future__ import unicode_literals - -import random -import logging -import spacy -from pathlib import Path -import plac -from tqdm import tqdm - -from bin.wiki_entity_linking import wikipedia_processor -from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_MODEL_DIR, KB_FILE, LOG_FORMAT, OUTPUT_MODEL_DIR -from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance -from bin.wiki_entity_linking.kb_creator import read_kb - -from spacy.util import minibatch, compounding - -logger = logging.getLogger(__name__) - - -@plac.annotations( - dir_kb=("Directory with KB, NLP and related files", "positional", None, Path), - output_dir=("Output directory", "option", "o", Path), - loc_training=("Location to training data", "option", "k", Path), - epochs=("Number of training iterations (default 10)", "option", "e", int), - dropout=("Dropout to prevent overfitting (default 0.5)", "option", "p", float), - lr=("Learning rate (default 0.005)", "option", "n", float), - l2=("L2 regularization", "option", "r", float), - train_articles=("# training articles (default 90% of all)", "option", "t", int), - dev_articles=("# dev test articles (default 10% of all)", "option", "d", int), - labels_discard=("NER labels to discard (default None)", "option", "l", str), -) -def main( - dir_kb, - output_dir=None, - loc_training=None, - epochs=10, - dropout=0.5, - lr=0.005, - l2=1e-6, - train_articles=None, - dev_articles=None, - labels_discard=None -): - if not output_dir: - logger.warning("No output dir specified so no results will be written, are you sure about this ?") - - logger.info("Creating Entity Linker with Wikipedia and WikiData") - - output_dir = Path(output_dir) if output_dir else dir_kb - training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE - nlp_dir = dir_kb / KB_MODEL_DIR - kb_path = dir_kb / KB_FILE - nlp_output_dir = output_dir / OUTPUT_MODEL_DIR - - # STEP 0: set up IO - if not output_dir.exists(): - output_dir.mkdir() - - # STEP 1 : load the NLP object - logger.info("STEP 1a: Loading model from {}".format(nlp_dir)) - nlp = spacy.load(nlp_dir) - logger.info("Original NLP pipeline has following pipeline components: {}".format(nlp.pipe_names)) - - # check that there is a NER component in the pipeline - if "ner" not in nlp.pipe_names: - raise ValueError("The `nlp` object should have a pretrained `ner` component.") - - logger.info("STEP 1b: Loading KB from {}".format(kb_path)) - kb = read_kb(nlp, kb_path) - - # STEP 2: read the training dataset previously created from WP - logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path)) - train_indices, dev_indices = 
wikipedia_processor.read_training_indices(training_path) - logger.info("Training set has {} articles, limit set to roughly {} articles per epoch" - .format(len(train_indices), train_articles if train_articles else "all")) - logger.info("Dev set has {} articles, limit set to rougly {} articles for evaluation" - .format(len(dev_indices), dev_articles if dev_articles else "all")) - if dev_articles: - dev_indices = dev_indices[0:dev_articles] - - # STEP 3: create and train an entity linking pipe - logger.info("STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(epochs)) - if labels_discard: - labels_discard = [x.strip() for x in labels_discard.split(",")] - logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard)) - else: - labels_discard = [] - - el_pipe = nlp.create_pipe( - name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name, - "labels_discard": labels_discard} - ) - el_pipe.set_kb(kb) - nlp.add_pipe(el_pipe, last=True) - - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] - with nlp.disable_pipes(*other_pipes): # only train Entity Linking - optimizer = nlp.begin_training() - optimizer.learn_rate = lr - optimizer.L2 = l2 - - logger.info("Dev Baseline Accuracies:") - dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path, - dev=True, line_ids=dev_indices, - kb=kb, labels_discard=labels_discard) - - measure_performance(dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices)) - - for itn in range(epochs): - random.shuffle(train_indices) - losses = {} - batches = minibatch(train_indices, size=compounding(8.0, 128.0, 1.001)) - batchnr = 0 - articles_processed = 0 - - # we either process the whole training file, or just a part each epoch - bar_total = len(train_indices) - if train_articles: - bar_total = train_articles - - with tqdm(total=bar_total, leave=False, desc='Epoch ' + str(itn)) as pbar: - for batch in batches: - if not train_articles or articles_processed < train_articles: - with nlp.disable_pipes("entity_linker"): - train_batch = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path, - dev=False, line_ids=batch, - kb=kb, labels_discard=labels_discard) - docs, golds = zip(*train_batch) - try: - with nlp.disable_pipes(*other_pipes): - nlp.update( - docs=docs, - golds=golds, - sgd=optimizer, - drop=dropout, - losses=losses, - ) - batchnr += 1 - articles_processed += len(docs) - pbar.update(len(docs)) - except Exception as e: - logger.error("Error updating batch:" + str(e)) - if batchnr > 0: - logging.info("Epoch {} trained on {} articles, train loss {}" - .format(itn, articles_processed, round(losses["entity_linker"] / batchnr, 2))) - # re-read the dev_data (data is returned as a generator) - dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path, - dev=True, line_ids=dev_indices, - kb=kb, labels_discard=labels_discard) - measure_performance(dev_data, kb, el_pipe, baseline=False, context=True, dev_limit=len(dev_indices)) - - if output_dir: - # STEP 4: write the NLP pipeline (now including an EL model) to file - logger.info("Final NLP pipeline has following pipeline components: {}".format(nlp.pipe_names)) - logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir)) - nlp.to_disk(nlp_output_dir) - - logger.info("Done!") - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) - plac.call(main) diff --git 
a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py
deleted file mode 100644
index 649d48fe5..000000000
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ /dev/null
@@ -1,565 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import bz2
-import logging
-import random
-import json
-
-from spacy.gold import GoldParse
-from bin.wiki_entity_linking import wiki_io as io
-from bin.wiki_entity_linking.wiki_namespaces import (
-    WP_META_NAMESPACE,
-    WP_FILE_NAMESPACE,
-    WP_CATEGORY_NAMESPACE,
-)
-
-"""
-Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
-Write these results to file for downstream KB and training data generation.
-
-Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
-"""
-
-ENTITY_FILE = "gold_entities.csv"
-
-map_alias_to_link = dict()
-
-logger = logging.getLogger(__name__)
-
-title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
-id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
-text_tag_regex = re.compile(r"(?<=<text xml:space=\"preserve\")")
-text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
-info_regex = re.compile(r"{[^{]*?}")
-html_regex = re.compile(r"&lt;!--[^-]*--&gt;")
-ref_regex = re.compile(r"&lt;ref.*?&gt;")  # non-greedy
-ref_2_regex = re.compile(r"&lt;/ref.*?&gt;")  # non-greedy
-
-# find the links
-link_regex = re.compile(r"\[\[[^\[\]]*\]\]")
-
-# match on interwiki links, e.g. `en:` or `:fr:`
-ns_regex = r":?" + "[a-z][a-z]" + ":"
-# match on Namespace: optionally preceded by a language code, e.g. `en:Category:`
-for ns in WP_META_NAMESPACE:
-    ns_regex += "|" + ":?" + ns + ":"
-ns_regex = re.compile(ns_regex, re.IGNORECASE)
-
-files = r""
-for f in WP_FILE_NAMESPACE:
-    files += "\[\[" + f + ":[^[\]]+]]" + "|"
-files = files[0 : len(files) - 1]
-file_regex = re.compile(files)
-
-cats = r""
-for c in WP_CATEGORY_NAMESPACE:
-    cats += "\[\[" + c + ":[^\[]*]]" + "|"
-cats = cats[0 : len(cats) - 1]
-category_regex = re.compile(cats)
-
-
-def read_prior_probs(wikipedia_input, prior_prob_output, limit=None):
-    """
-    Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities.
-    The full file takes about 2-3h to parse 1100M lines.
-    It works relatively fast because it runs on the raw text.
-    """
-    cnt = 0
-    read_id = False
-    current_article_id = None
-    with bz2.open(wikipedia_input, mode="rb") as file:
-        line = file.readline()
-        while line and (not limit or cnt < limit):
-            if cnt % 25000000 == 0 and cnt > 0:
-                logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
-            clean_line = line.strip().decode("utf-8")
-
-            # we attempt at reading the article's ID (but not the revision or contributor ID)
-            if "<revision>" in clean_line or "<contributor>" in clean_line:
-                read_id = False
-            if "<page>" in clean_line:
-                read_id = True
-
-            if read_id:
-                ids = id_regex.search(clean_line)
-                if ids:
-                    current_article_id = ids[0]
-
-                # only processing prior probabilities from true training (non-dev) articles
-                if not is_dev(current_article_id):
-                    aliases, entities, normalizations = get_wp_links(clean_line)
-                    for alias, entity, norm in zip(aliases, entities, normalizations):
-                        _store_alias(
-                            alias, entity, normalize_alias=norm, normalize_entity=True
-                        )
-
-            line = file.readline()
-            cnt += 1
-        logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
-    logger.info("Finished. processed {} lines of Wikipedia XML dump".format(cnt))
-
-    # write all aliases and their entities and count occurrences to file
-    with prior_prob_output.open("w", encoding="utf8") as outputfile:
-        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
-        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
-            s_dict = sorted(alias_dict.items(), key=lambda x: x[1], reverse=True)
-            for entity, count in s_dict:
-                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
-
-
-def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
-    alias = alias.strip()
-    entity = entity.strip()
-
-    # remove everything after # as this is not part of the title but refers to a specific paragraph
-    if normalize_entity:
-        # wikipedia titles are always capitalized
-        entity = _capitalize_first(entity.split("#")[0])
-    if normalize_alias:
-        alias = alias.split("#")[0]
-
-    if alias and entity:
-        alias_dict = map_alias_to_link.get(alias, dict())
-        entity_count = alias_dict.get(entity, 0)
-        alias_dict[entity] = entity_count + 1
-        map_alias_to_link[alias] = alias_dict
-
-
-def get_wp_links(text):
-    aliases = []
-    entities = []
-    normalizations = []
-
-    matches = link_regex.findall(text)
-    for match in matches:
-        match = match[2:][:-2].replace("_", " ").strip()
-
-        if ns_regex.match(match):
-            pass  # ignore the entity if it points to a "meta" page
-
-        # this is a simple [[link]], with the alias the same as the mention
-        elif "|" not in match:
-            aliases.append(match)
-            entities.append(match)
-            normalizations.append(True)
-
-        # in wiki format, the link is written as [[entity|alias]]
-        else:
-            splits = match.split("|")
-            entity = splits[0].strip()
-            alias = splits[1].strip()
-            # specific wiki format  [[alias (specification)|]]
-            if len(alias) == 0 and "(" in entity:
-                alias = entity.split("(")[0]
-                aliases.append(alias)
-                entities.append(entity)
-                normalizations.append(False)
-            else:
-                aliases.append(alias)
-                entities.append(entity)
-                normalizations.append(False)
-
-    return aliases, entities, normalizations
-
-
-def _capitalize_first(text):
-    if not text:
-        return None
-    result = text[0].capitalize()
-    if len(result) > 0:
-        result += text[1:]
-    return result
-
-
-def create_training_and_desc(
-    wp_input, def_input, desc_output, training_output, parse_desc, limit=None
-):
-    wp_to_id = io.read_title_to_id(def_input)
-    _process_wikipedia_texts(
-        wp_input, wp_to_id, desc_output, training_output, parse_desc, limit
-    )
-
-
-def _process_wikipedia_texts(
-    wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None
-):
-    """
-    Read the XML wikipedia data to parse out training data:
-    raw text data + positive instances
-    """
-
-    read_ids = set()
-
-    with output.open("a", encoding="utf8") as descr_file, training_output.open(
-        "w", encoding="utf8"
-    ) as entity_file:
-        if parse_descriptions:
-            _write_training_description(descr_file, "WD_id", "description")
-        with bz2.open(wikipedia_input, mode="rb") as file:
-            article_count = 0
-            article_text = ""
-            article_title = None
-            article_id = None
-            reading_text = False
-            reading_revision = False
-
-            for line in file:
-                clean_line = line.strip().decode("utf-8")
-
-                if clean_line == "<revision>":
-                    reading_revision = True
-                elif clean_line == "</revision>":
-                    reading_revision = False
-
-                # Start reading new page
-                if clean_line == "<page>":
-                    article_text = ""
-                    article_title = None
-                    article_id = None
-                # finished reading this page
-                elif clean_line == "</page>":
-                    if article_id:
-                        clean_text, entities = _process_wp_text(
-                            article_title, article_text, wp_to_id
-                        )
-                        if clean_text is not None and entities is not None:
-                            _write_training_entities(
-                                entity_file, article_id, clean_text, entities
-                            )
-
-                            if article_title in wp_to_id and parse_descriptions:
-                                description = " ".join(
-                                    clean_text[:1000].split(" ")[:-1]
-                                )
-                                _write_training_description(
-                                    descr_file, wp_to_id[article_title], description
-                                )
-                            article_count += 1
-                            if article_count % 10000 == 0 and article_count > 0:
-                                logger.info(
-                                    "Processed {} articles".format(article_count)
-                                )
-                            if limit and article_count >= limit:
-                                break
-                    article_text = ""
-                    article_title = None
-                    article_id = None
-                    reading_text = False
-                    reading_revision = False
-
-                # start reading text within a page
-                if "<text" in clean_line:
-                    reading_text = True
-
-                if reading_text:
-                    article_text += " " + clean_line
-
-                # stop reading text within a page (we assume a new line is present)
-                if "</text" in clean_line:
-                    reading_text = False
-
-                # read the ID of this article (outside the revision portion of the document)
-                if not reading_revision:
-                    ids = id_regex.search(clean_line)
-                    if ids:
-                        article_id = ids[0]
-                        if article_id in read_ids:
-                            logger.info(
-                                "Found duplicate article ID", article_id, clean_line
-                            )  # This should never happen ...
-                        read_ids.add(article_id)
-
-                # read the title of this article (outside the revision portion of the document)
-                if not reading_revision:
-                    titles = title_regex.search(clean_line)
-                    if titles:
-                        article_title = titles[0].strip()
-
-
-def _process_wp_text(article_title, article_text, wp_to_id):
-    # ignore meta Wikipedia pages
-    if ns_regex.match(article_title):
-        return None, None
-
-    # remove the text tags
-    text_search = text_tag_regex.sub("", article_text)
-    text_search = text_regex.search(text_search)
-    if text_search is None:
-        return None, None
-    text = text_search.group(0)
-
-    # stop processing if this is a redirect page
-    if text.startswith("#REDIRECT"):
-        return None, None
-
-    # get the raw text without markup etc, keeping only interwiki links
-    clean_text, entities = _remove_links(_get_clean_wp_text(text), wp_to_id)
-    return clean_text, entities
-
-
-def _get_clean_wp_text(article_text):
-    clean_text = article_text.strip()
-
-    # remove bolding & italic markup
-    clean_text = clean_text.replace("'''", "")
-    clean_text = clean_text.replace("''", "")
-
-    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
-    try_again = True
-    previous_length = len(clean_text)
-    while try_again:
-        clean_text = info_regex.sub("", clean_text)  # non-greedy match excluding a nested {
-        if len(clean_text) < previous_length:
-            try_again = True
-        else:
-            try_again = False
-        previous_length = len(clean_text)
-
-    # remove HTML comments
-    clean_text = html_regex.sub("", clean_text)
-
-    # remove Category and File statements
-    clean_text = category_regex.sub("", clean_text)
-    clean_text = file_regex.sub("", clean_text)
-
-    # remove multiple =
-    while "==" in clean_text:
-        clean_text = clean_text.replace("==", "=")
-
-    clean_text = clean_text.replace(". =", ".")
-    clean_text = clean_text.replace(" = ", ". ")
-    clean_text = clean_text.replace("= ", ".")
-    clean_text = clean_text.replace(" =", "")
-
-    # remove refs (non-greedy match)
-    clean_text = ref_regex.sub("", clean_text)
-    clean_text = ref_2_regex.sub("", clean_text)
-
-    # remove additional wikiformatting
-    clean_text = re.sub(r"&lt;blockquote&gt;", "", clean_text)
-    clean_text = re.sub(r"&lt;/blockquote&gt;", "", clean_text)
-
-    # change special characters back to normal ones
-    clean_text = clean_text.replace(r"&lt;", "<")
-    clean_text = clean_text.replace(r"&gt;", ">")
-    clean_text = clean_text.replace(r"&quot;", '"')
-    clean_text = clean_text.replace(r"&amp;nbsp;", " ")
-    clean_text = clean_text.replace(r"&amp;", "&")
-
-    # remove multiple spaces
-    while "  " in clean_text:
-        clean_text = clean_text.replace("  ", " ")
-
-    return clean_text.strip()
-
-
-def _remove_links(clean_text, wp_to_id):
-    # read the text char by char to get the right offsets for the interwiki links
-    entities = []
-    final_text = ""
-    open_read = 0
-    reading_text = True
-    reading_entity = False
-    reading_mention = False
-    reading_special_case = False
-    entity_buffer = ""
-    mention_buffer = ""
-    for index, letter in enumerate(clean_text):
-        if letter == "[":
-            open_read += 1
-        elif letter == "]":
-            open_read -= 1
-        elif letter == "|":
-            if reading_text:
-                final_text += letter
-            # switch from reading entity to mention in the [[entity|mention]] pattern
-            elif reading_entity:
-                reading_text = False
-                reading_entity = False
-                reading_mention = True
-            else:
-                reading_special_case = True
-        else:
-            if reading_entity:
-                entity_buffer += letter
-            elif reading_mention:
-                mention_buffer += letter
-            elif reading_text:
-                final_text += letter
-            else:
-                raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])
-
-        if open_read > 2:
-            reading_special_case = True
-
-        if open_read == 2 and reading_text:
-            reading_text = False
-            reading_entity = True
-            reading_mention = False
-
-        # we just finished reading an entity
-        if open_read == 0 and not reading_text:
-            if "#" in entity_buffer or entity_buffer.startswith(":"):
-                reading_special_case = True
-            # Ignore cases with nested structures like File: handles etc
-            if not reading_special_case:
-                if not mention_buffer:
-                    mention_buffer = entity_buffer
-                start = len(final_text)
-                end = start + len(mention_buffer)
-                qid = wp_to_id.get(entity_buffer, None)
-                if qid:
-                    entities.append((mention_buffer, qid, start, end))
-                final_text += mention_buffer
-
-            entity_buffer = ""
-            mention_buffer = ""
-
-            reading_text = True
-            reading_entity = False
-            reading_mention = False
-            reading_special_case = False
-    return final_text, entities
-
-
-def _write_training_description(outputfile, qid, description):
-    if description is not None:
-        line = str(qid) + "|" + description + "\n"
-        outputfile.write(line)
-
-
-def _write_training_entities(outputfile, article_id, clean_text, entities):
-    entities_data = [
-        {"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]}
-        for ent in entities
-    ]
-    line = (
-        json.dumps(
-            {
-                "article_id": article_id,
-                "clean_text": clean_text,
-                "entities": entities_data,
-            },
-            ensure_ascii=False,
-        )
-        + "\n"
-    )
-    outputfile.write(line)
-
-
-def read_training_indices(entity_file_path):
-    """ This method creates two lists of indices into the training file: one with indices for the
-    training examples, and one for the dev
examples.""" - train_indices = [] - dev_indices = [] - - with entity_file_path.open("r", encoding="utf8") as file: - for i, line in enumerate(file): - example = json.loads(line) - article_id = example["article_id"] - clean_text = example["clean_text"] - - if is_valid_article(clean_text): - if is_dev(article_id): - dev_indices.append(i) - else: - train_indices.append(i) - - return train_indices, dev_indices - - -def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=None): - """ This method provides training/dev examples that correspond to the entity annotations found by the nlp object. - For training, it will include both positive and negative examples by using the candidate generator from the kb. - For testing (kb=None), it will include all positive examples only.""" - if not labels_discard: - labels_discard = [] - - max_index = max(line_ids) - - with entity_file_path.open("r", encoding="utf8") as _file: - line = _file.readline() - i = 0 - while line and i < max_index: - if i in line_ids: - example = json.loads(line) - article_id = example["article_id"] - clean_text = example["clean_text"] - entities = example["entities"] - - if dev != is_dev(article_id) or not is_valid_article(clean_text): - continue - - doc = nlp(clean_text) - gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard) - if gold and len(gold.links) > 0: - yield doc, gold - i += 1 - line = _file.readline() - - -def _get_gold_parse(doc, entities, dev, kb, labels_discard): - gold_entities = {} - tagged_ent_positions = { - (ent.start_char, ent.end_char): ent - for ent in doc.ents - if ent.label_ not in labels_discard - } - - for entity in entities: - entity_id = entity["entity"] - alias = entity["alias"] - start = entity["start"] - end = entity["end"] - - candidate_ids = [] - if kb and not dev: - candidates = kb.get_candidates(alias) - candidate_ids = [cand.entity_ for cand in candidates] - - tagged_ent = tagged_ent_positions.get((start, end), None) - if tagged_ent: - # TODO: check that alias == doc.text[start:end] - should_add_ent = (dev or entity_id in candidate_ids) and is_valid_sentence( - tagged_ent.sent.text - ) - - if should_add_ent: - value_by_id = {entity_id: 1.0} - if not dev: - random.shuffle(candidate_ids) - value_by_id.update( - {kb_id: 0.0 for kb_id in candidate_ids if kb_id != entity_id} - ) - gold_entities[(start, end)] = value_by_id - - return GoldParse(doc, links=gold_entities) - - -def is_dev(article_id): - if not article_id: - return False - return article_id.endswith("3") - - -def is_valid_article(doc_text): - # custom length cut-off - return 10 < len(doc_text) < 30000 - - -def is_valid_sentence(sent_text): - if not 10 < len(sent_text) < 3000: - # custom length cut-off - return False - - if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"): - # remove 'enumeration' sentences (occurs often on Wikipedia) - return False - - return True diff --git a/examples/training/pretrain_kb.py b/examples/training/create_kb.py similarity index 75% rename from examples/training/pretrain_kb.py rename to examples/training/create_kb.py index 54c68f653..cbdb5c05b 100644 --- a/examples/training/pretrain_kb.py +++ b/examples/training/create_kb.py @@ -1,15 +1,15 @@ #!/usr/bin/env python # coding: utf8 -"""Example of defining and (pre)training spaCy's knowledge base, +"""Example of defining a knowledge base in spaCy, which is needed to implement entity linking functionality. 
For more details, see the documentation: * Knowledge base: https://spacy.io/api/kb * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking -Compatible with: spaCy v2.2.3 -Last tested with: v2.2.3 +Compatible with: spaCy v2.2.4 +Last tested with: v2.2.4 """ from __future__ import unicode_literals, print_function @@ -20,24 +20,18 @@ from spacy.vocab import Vocab import spacy from spacy.kb import KnowledgeBase -from bin.wiki_entity_linking.train_descriptions import EntityEncoder - # Q2146908 (Russ Cochran): American golfer # Q7381115 (Russ Cochran): publisher ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)} -INPUT_DIM = 300 # dimension of pretrained input vectors -DESC_WIDTH = 64 # dimension of output entity vectors - @plac.annotations( model=("Model name, should have pretrained word embeddings", "positional", None, str), output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), ) -def main(model=None, output_dir=None, n_iter=50): - """Load the model, create the KB and pretrain the entity encodings. +def main(model=None, output_dir=None): + """Load the model and create the KB with pre-defined entity encodings. If an output_dir is provided, the KB will be stored there in a file 'kb'. The updated vocab will also be written to a directory in the output_dir.""" @@ -51,33 +45,23 @@ def main(model=None, output_dir=None, n_iter=50): " cf. https://spacy.io/usage/models#languages." ) - kb = KnowledgeBase(vocab=nlp.vocab) + # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality. + # For simplicity, we'll just use the original vector dimension here instead. + vectors_dim = nlp.vocab.vectors.shape[1] + kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim) # set up the data entity_ids = [] - descriptions = [] + descr_embeddings = [] freqs = [] for key, value in ENTITIES.items(): desc, freq = value entity_ids.append(key) - descriptions.append(desc) + descr_embeddings.append(nlp(desc).vector) freqs.append(freq) - # training entity description encodings - # this part can easily be replaced with a custom entity encoder - encoder = EntityEncoder( - nlp=nlp, - input_dim=INPUT_DIM, - desc_width=DESC_WIDTH, - epochs=n_iter, - ) - encoder.train(description_list=descriptions, to_print=True) - - # get the pretrained entity vectors - embeddings = encoder.apply_encoder(descriptions) - # set the entities, can also be done by calling `kb.add_entity` for each entity - kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings) + kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings) # adding aliases, the entities need to be defined in the KB beforehand kb.add_alias( @@ -113,8 +97,8 @@ def main(model=None, output_dir=None, n_iter=50): vocab2 = Vocab().from_disk(vocab_path) kb2 = KnowledgeBase(vocab=vocab2) kb2.load_bulk(kb_path) - _print_kb(kb2) print() + _print_kb(kb2) def _print_kb(kb): @@ -126,6 +110,5 @@ if __name__ == "__main__": plac.call(main) # Expected output: - # 2 kb entities: ['Q2146908', 'Q7381115'] # 1 kb aliases: ['Russ Cochran'] diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index dd7c3a1b2..c7eba8a30 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -1,15 +1,15 @@ #!/usr/bin/env python # coding: utf8 -"""Example of training spaCy's entity linker, starting off with an 
-existing model and a pre-defined knowledge base.
+"""Example of training spaCy's entity linker, starting off with a predefined
+knowledge base and corresponding vocab, and a blank English model.
 
 For more details, see the documentation:
 * Training: https://spacy.io/usage/training
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
 
-Compatible with: spaCy v2.2.3
-Last tested with: v2.2.3
+Compatible with: spaCy v2.2.4
+Last tested with: v2.2.4
 """
 from __future__ import unicode_literals, print_function
 
@@ -17,13 +17,11 @@ import plac
 import random
 from pathlib import Path
 
-from spacy.symbols import PERSON
 from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase
 from spacy.pipeline import EntityRuler
-from spacy.tokens import Span
 from spacy.util import minibatch, compounding
 
diff --git a/spacy/tests/regression/test_issue5314.py b/spacy/tests/regression/test_issue5314.py
deleted file mode 100644
index 5bb817d5c..000000000
--- a/spacy/tests/regression/test_issue5314.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import pytest
-
-from bin.wiki_entity_linking.wikipedia_processor import _process_wp_text
-
-old_format_text = """<text xml:space="preserve">[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn.</text>"""
-new_format_text = """<text bytes="11456" xml:space="preserve">[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn.</text>"""
-potential_future_format = """<text bytes="11456" n="1802" xml:space="preserve">[[Fil:Archäologie schichtengrabung.jpg|thumb|Arkæologisk [[udgravning]] med profil.]] '''Arkæologi''' er studiet af tidligere tiders [[menneske]]lige [[aktivitet]], primært gennem studiet af menneskets materielle levn.</text>"""
-
-
-@pytest.mark.parametrize(
-    "text", [old_format_text, new_format_text, potential_future_format]
-)
-def test_issue5314(text):
-    title = "Arkæologi"
-    clean_text, _ = _process_wp_text(title, text, {})
-
-    expected_text = "Arkæologi er studiet af tidligere tiders menneskelige aktivitet, primært gennem studiet af menneskets materielle levn."
-    assert clean_text.strip() == expected_text
diff --git a/website/docs/usage/examples.md b/website/docs/usage/examples.md
index 96dc7627d..854b2d42b 100644
--- a/website/docs/usage/examples.md
+++ b/website/docs/usage/examples.md
@@ -111,6 +111,27 @@ start.
 https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entity_type.py
 ```
 
+### Creating a Knowledge Base for Named Entity Linking {#kb}
+
+This example shows how to create a knowledge base in spaCy,
+which is needed to implement entity linking functionality.
+It requires as input a spaCy model with pretrained word vectors,
+and it stores the KB to file (if an `output_dir` is provided).
+
+```python
+https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py
+```
+
+### Training spaCy's Named Entity Linker {#nel}
+
+This example shows how to train spaCy's entity linker with your own custom
+examples, starting off with a predefined knowledge base and its vocab,
+and using a blank `English` class.
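+
+A condensed sketch of the training loop in the linked script is shown below.
+This is an illustrative sketch, not the full example: the paths
+`"output/vocab"` and `"output/kb"` are assumed to be the artifacts written by
+the `create_kb.py` script above, and the two training examples are shortened
+placeholders.
+
+```python
+import random
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+from spacy.vocab import Vocab
+from spacy.kb import KnowledgeBase
+from spacy.util import minibatch, compounding
+
+# load the vocab and the knowledge base written out by create_kb.py
+nlp = English(vocab=Vocab().from_disk("output/vocab"))
+kb = KnowledgeBase(vocab=nlp.vocab)
+kb.load_bulk("output/kb")
+
+# a rule-based NER and a sentencizer supply the entity mentions and the
+# sentence boundaries that the entity linker needs as context
+ruler = EntityRuler(nlp)
+ruler.add_patterns(
+    [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+)
+nlp.add_pipe(ruler)
+nlp.add_pipe(nlp.create_pipe("sentencizer"))
+
+entity_linker = nlp.create_pipe("entity_linker")
+entity_linker.set_kb(kb)
+nlp.add_pipe(entity_linker, last=True)
+
+# each example maps a (start, end) mention offset to gold-standard
+# probabilities over candidate entity IDs from the KB
+TRAIN_DATA = [
+    ("Russ Cochran his reprints include EC Comics.",
+     {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}),
+    ("Russ Cochran captured his first major title in 1991.",
+     {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}),
+]
+
+# pre-process the texts with everything except the (still untrained) linker,
+# so the docs carry the entities and sentence boundaries set by the ruler
+TRAIN_DOCS = []
+for text, annotation in TRAIN_DATA:
+    with nlp.disable_pipes("entity_linker"):
+        doc = nlp(text)
+    TRAIN_DOCS.append((doc, annotation))
+
+other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
+with nlp.disable_pipes(*other_pipes):  # train only the entity linker
+    optimizer = nlp.begin_training()
+    for itn in range(50):
+        random.shuffle(TRAIN_DOCS)
+        losses = {}
+        for batch in minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001)):
+            docs, golds = zip(*batch)
+            nlp.update(docs, golds, drop=0.2, losses=losses, sgd=optimizer)
+        print(itn, "Losses", losses)
+```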
+ +```python +https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py +``` + ### Training spaCy's Dependency Parser {#parser} This example shows how to update spaCy's dependency parser, starting off with an diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 59712939a..d17e5a661 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -579,9 +579,7 @@ import DisplacyEntHtml from 'images/displacy-ent2.html' To ground the named entities into the "real world", spaCy provides functionality to perform entity linking, which resolves a textual entity to a unique -identifier from a knowledge base (KB). The -[processing scripts](https://github.com/explosion/spaCy/tree/master/bin/wiki_entity_linking) -we provide use WikiData identifiers, but you can create your own +identifier from a knowledge base (KB). You can create your own [`KnowledgeBase`](/api/kb) and [train a new Entity Linking model](/usage/training#entity-linker) using that custom-made KB. diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 479441edf..ecdc6720b 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -347,9 +347,9 @@ your data** to find a solution that works best for you. ### Updating the Named Entity Recognizer {#example-train-ner} This example shows how to update spaCy's entity recognizer with your own -examples, starting off with an existing, pretrained model, or from scratch -using a blank `Language` class. To do this, you'll need **example texts** and -the **character offsets** and **labels** of each entity contained in the texts. +examples, starting off with an existing, pretrained model, or from scratch using +a blank `Language` class. To do this, you'll need **example texts** and the +**character offsets** and **labels** of each entity contained in the texts. ```python https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py @@ -440,8 +440,8 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py training the parser. 2. **Add the dependency labels** to the parser using the [`add_label`](/api/dependencyparser#add_label) method. If you're starting off - with a pretrained spaCy model, this is usually not necessary – but it - doesn't hurt either, just to be safe. + with a pretrained spaCy model, this is usually not necessary – but it doesn't + hurt either, just to be safe. 3. **Shuffle and loop over** the examples. For each example, **update the model** by calling [`nlp.update`](/api/language#update), which steps through the words of the input. At each word, it makes a **prediction**. It then @@ -605,16 +605,16 @@ To train an entity linking model, you first need to define a knowledge base A KB consists of a list of entities with unique identifiers. Each such entity has an entity vector that will be used to measure similarity with the context in -which an entity is used. These vectors are pretrained and stored in the KB -before the entity linking model will be trained. +which an entity is used. These vectors have a fixed length and are stored in the +KB. The following example shows how to build a knowledge base from scratch, given a -list of entities and potential aliases. The script further demonstrates how to -pretrain and store the entity vectors. To run this example, the script needs -access to a `vocab` instance or an `nlp` model with pretrained word embeddings. 
+list of entities and potential aliases. The script requires an `nlp` model with +pretrained word vectors to obtain an encoding of an entity's description as its +vector. ```python -https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py +https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py ``` #### Step by step guide {#step-by-step-kb} From 63885c1836c219745e2fccc8ecacd2f357aa0341 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Apr 2020 12:54:57 +0200 Subject: [PATCH 098/105] Remove u string and auto-format [ci skip] --- website/meta/universe.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index f9638279c..50977b39c 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -364,7 +364,7 @@ "entity = Entity(keywords_list=['python', 'java platform'])", "nlp.add_pipe(entity, last=True)", "", - "doc = nlp(u\"I am a product manager for a java and python.\")", + "doc = nlp(\"I am a product manager for a java and python.\")", "assert doc._.has_entities == True", "assert doc[2:5]._.has_entities == True", "assert doc[0]._.is_entity == False", @@ -1653,10 +1653,10 @@ "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy.", "thumb": "https://i.imgur.com/ggVI02O.jpg", "image": "https://i.imgur.com/z1yhWQR.jpg", - "url": "https://telegram.me/pic2phrase_bot", + "url": "https://telegram.me/pic2phrase_bot", "author": "Yuli Vasiliev", "author_links": { - "twitter": "VasilievYuli", + "twitter": "VasilievYuli" }, "category": ["standalone", "conversational"] }, From cfdaf99b8029d6762730c5d5bd2b6f6c173c1241 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 29 Apr 2020 12:56:17 +0200 Subject: [PATCH 099/105] Fix passing of component configuration (#5374) * add kwargs to to_disk methods in docs - otherwise crashes on 'exclude' argument * add fix and test for Issue 5137 --- spacy/tests/regression/test_issue5137.py | 33 ++++++++++++++++++++++++ spacy/util.py | 1 + website/docs/usage/saving-loading.md | 4 +-- 3 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/regression/test_issue5137.py diff --git a/spacy/tests/regression/test_issue5137.py b/spacy/tests/regression/test_issue5137.py new file mode 100644 index 000000000..4b4e597d3 --- /dev/null +++ b/spacy/tests/regression/test_issue5137.py @@ -0,0 +1,33 @@ +import spacy +from spacy.language import Language +from spacy.lang.en import English +from spacy.tests.util import make_tempdir + + +def test_issue5137(): + class MyComponent(object): + name = "my_component" + + def __init__(self, nlp, **cfg): + self.nlp = nlp + self.categories = cfg.get("categories", "all_categories") + + def __call__(self, doc): + pass + + def to_disk(self, path, **kwargs): + pass + + def from_disk(self, path, **cfg): + pass + + Language.factories["my_component"] = lambda nlp, **cfg: MyComponent(nlp, **cfg) + + nlp = English() + nlp.add_pipe(nlp.create_pipe("my_component")) + assert nlp.get_pipe("my_component").categories == "all_categories" + + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + nlp2 = spacy.load(tmpdir, categories="my_categories") + assert nlp2.get_pipe("my_component").categories == "my_categories" diff --git a/spacy/util.py b/spacy/util.py index 609c0b572..d4cdca4e0 100644 --- a/spacy/util.py +++ 
b/spacy/util.py @@ -208,6 +208,7 @@ def load_model_from_path(model_path, meta=False, **overrides): for name in pipeline: if name not in disable: config = meta.get("pipeline_args", {}).get(name, {}) + config.update(overrides) factory = factories.get(name, name) component = nlp.create_pipe(factory, config=config) nlp.add_pipe(component, name=name) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 8e2c30d82..76a9773f6 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -216,7 +216,7 @@ class CustomComponent(object): # Add something to the component's data self.data.append(data) - def to_disk(self, path): + def to_disk(self, path, **kwargs): # This will receive the directory path + /my_component data_path = path / "data.json" with data_path.open("w", encoding="utf8") as f: @@ -461,7 +461,7 @@ model. When you save out a model using `nlp.to_disk` and the component exposes a `to_disk` method, it will be called with the disk path. ```python -def to_disk(self, path): +def to_disk(self, path, **kwargs): snek_path = path / "snek.txt" with snek_path.open("w", encoding="utf8") as snek_file: snek_file.write(self.snek) From bdff76deded8380c68b8cc209f60c1dea3034cf3 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 12:56:46 +0200 Subject: [PATCH 100/105] Various updates/additions to CLI scripts (#5362) * `debug-data`: determine coverage of provided vectors * `evaluate`: support `blank:lg` model to make it possible to just evaluate tokenization * `init-model`: add option to truncate vectors to N most frequent vectors from word2vec file * `train`: * if training on GPU, only run evaluation/timing on CPU in the first iteration * if training is aborted, exit with a non-0 exit status --- spacy/cli/debug_data.py | 31 ++++++++++++++++++++++++++----- spacy/cli/evaluate.py | 6 +++++- spacy/cli/init_model.py | 19 +++++++++++++++---- spacy/cli/train.py | 36 ++++++++++++++++++++---------------- website/docs/api/cli.md | 3 ++- 5 files changed, 68 insertions(+), 27 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index c5e1ff6cf..279f34f16 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -108,9 +108,11 @@ def debug_data( msg.good("Corpus is loadable") # Create all gold data here to avoid iterating over the train_docs constantly - gold_train_data = _compile_gold(train_docs, pipeline) - gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline) - gold_dev_data = _compile_gold(dev_docs, pipeline) + gold_train_data = _compile_gold(train_docs, pipeline, nlp) + gold_train_unpreprocessed_data = _compile_gold( + train_docs_unpreprocessed, pipeline, nlp + ) + gold_dev_data = _compile_gold(dev_docs, pipeline, nlp) train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] @@ -182,6 +184,16 @@ def debug_data( nlp.vocab.vectors_length, ) ) + n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values()) + msg.warn( + "{} words in training data without vectors ({:0.2f}%)".format( + n_missing_vectors, + n_missing_vectors / gold_train_data["n_words"], + ), + ) + msg.text( + "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose, + ) else: msg.info("No word vectors present in the model") @@ -562,7 +574,7 @@ def _load_file(file_path, msg): ) -def _compile_gold(train_docs, pipeline): +def _compile_gold(train_docs, pipeline, nlp): data = { 
"ner": Counter(), "cats": Counter(), @@ -574,6 +586,7 @@ def _compile_gold(train_docs, pipeline): "punct_ents": 0, "n_words": 0, "n_misaligned_words": 0, + "words_missing_vectors": Counter(), "n_sents": 0, "n_nonproj": 0, "n_cycles": 0, @@ -586,6 +599,10 @@ def _compile_gold(train_docs, pipeline): data["n_words"] += len(valid_words) data["n_misaligned_words"] += len(gold.words) - len(valid_words) data["texts"].add(doc.text) + if len(nlp.vocab.vectors): + for word in valid_words: + if nlp.vocab.strings[word] not in nlp.vocab.vectors: + data["words_missing_vectors"].update([word]) if "ner" in pipeline: for i, label in enumerate(gold.ner): if label is None: @@ -636,7 +653,11 @@ def _format_labels(labels, counts=False): def _get_examples_without_label(data, label): count = 0 for doc, gold in data: - labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")] + labels = [ + label.split("-")[1] + for label in gold.ner + if label is not None and label not in ("O", "-") + ] if label not in labels: count += 1 return count diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index c24e37038..8a84684e5 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals, division, print_function import plac +import spacy from timeit import default_timer as timer from wasabi import msg @@ -43,7 +44,10 @@ def evaluate( if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = GoldCorpus(data_path, data_path) - nlp = util.load_model(model) + if model.startswith("blank:"): + nlp = spacy.blank(model.replace("blank:", "")) + else: + nlp = util.load_model(model) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) begin = timer() scorer = nlp.evaluate(dev_docs, verbose=False) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 2e0aeb239..31d627e9b 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -35,6 +35,12 @@ DEFAULT_OOV_PROB = -20 jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path), clusters_loc=("Optional location of brown clusters data", "option", "c", str), vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str), + truncate_vectors=( + "Optional number of vectors to truncate to when reading in vectors file", + "option", + "t", + int, + ), prune_vectors=("Optional number of vectors to prune to", "option", "V", int), vectors_name=( "Optional name for the word vectors, e.g. 
en_core_web_lg.vectors", @@ -51,6 +57,7 @@ def init_model( clusters_loc=None, jsonl_loc=None, vectors_loc=None, + truncate_vectors=0, prune_vectors=-1, vectors_name=None, model_name=None, @@ -88,7 +95,7 @@ def init_model( nlp = create_model(lang, lex_attrs, name=model_name) msg.good("Successfully created model") if vectors_loc is not None: - add_vectors(nlp, vectors_loc, prune_vectors, vectors_name) + add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name) vec_added = len(nlp.vocab.vectors) lex_added = len(nlp.vocab) msg.good( @@ -169,7 +176,7 @@ def create_model(lang, lex_attrs, name=None): return nlp -def add_vectors(nlp, vectors_loc, prune_vectors, name=None): +def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): vectors_loc = ensure_path(vectors_loc) if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) @@ -179,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None): else: if vectors_loc: with msg.loading("Reading vectors from {}".format(vectors_loc)): - vectors_data, vector_keys = read_vectors(vectors_loc) + vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors) msg.good("Loaded vectors from {}".format(vectors_loc)) else: vectors_data, vector_keys = (None, None) @@ -199,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None): nlp.vocab.prune_vectors(prune_vectors) -def read_vectors(vectors_loc): +def read_vectors(vectors_loc, truncate_vectors=0): f = open_file(vectors_loc) shape = tuple(int(size) for size in next(f).split()) + if truncate_vectors >= 1: + shape = (truncate_vectors, shape[1]) vectors_data = numpy.zeros(shape=shape, dtype="f") vectors_keys = [] for i, line in enumerate(tqdm(f)): @@ -212,6 +221,8 @@ def read_vectors(vectors_loc): msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1) vectors_data[i] = numpy.asarray(pieces, dtype="f") vectors_keys.append(word) + if i == truncate_vectors - 1: + break return vectors_data, vectors_keys diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8fc475d24..db58b22df 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -454,22 +454,25 @@ def train( cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) - with Model.use_device("cpu"): - nlp_loaded = util.load_model_from_path(epoch_model_path) - for name, component in nlp_loaded.pipeline: - if hasattr(component, "cfg"): - component.cfg["beam_width"] = beam_width - dev_docs = list( - corpus.dev_docs( - nlp_loaded, - gold_preproc=gold_preproc, - ignore_misaligned=True, + # Only evaluate on CPU in the first iteration (for + # timing) if GPU is enabled + if i >= 1: + with Model.use_device("cpu"): + nlp_loaded = util.load_model_from_path(epoch_model_path) + for name, component in nlp_loaded.pipeline: + if hasattr(component, "cfg"): + component.cfg["beam_width"] = beam_width + dev_docs = list( + corpus.dev_docs( + nlp_loaded, + gold_preproc=gold_preproc, + ignore_misaligned=True, + ) ) - ) - start_time = timer() - scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) - end_time = timer() - cpu_wps = nwords / (end_time - start_time) + start_time = timer() + scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) + end_time = timer() + cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) @@ -550,7 +553,8 @@ def train( except Exception as e: msg.warn( "Aborting and saving the 
final best model. " - "Encountered exception: {}".format(e) + "Encountered exception: {}".format(e), + exits=1, ) finally: best_pipes = nlp.pipe_names diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 15691c4f8..505977be9 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -547,7 +547,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] | `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | | `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | | `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | | `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | | **CREATES** | model | A spaCy model containing the vocab and vectors. | From 3f43c73d37a5c175d0eabb35b9627a18aacd782a Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 12:57:30 +0200 Subject: [PATCH 101/105] Normalize TokenC.sent_start values for Matcher (#5346) Normalize TokenC.sent_start values to booleans for the `Matcher`. --- spacy/matcher/matcher.pyx | 10 +++++----- spacy/tokens/doc.pxd | 1 + spacy/tokens/doc.pyx | 10 ++++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 7f3c3488f..4cfab915f 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -14,7 +14,7 @@ import warnings from ..typedefs cimport attr_t from ..structs cimport TokenC from ..vocab cimport Vocab -from ..tokens.doc cimport Doc, get_token_attr +from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA @@ -549,7 +549,7 @@ cdef char get_is_match(PatternStateC state, spec = state.pattern if spec.nr_attr > 0: for attr in spec.attrs[:spec.nr_attr]: - if get_token_attr(token, attr.attr) != attr.value: + if get_token_attr_for_matcher(token, attr.attr) != attr.value: return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: @@ -720,7 +720,7 @@ class _RegexPredicate(object): if self.is_extension: value = token._.get(self.attr) else: - value = token.vocab.strings[get_token_attr(token.c, self.attr)] + value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] return bool(self.value.search(value)) @@ -741,7 +741,7 @@ class _SetMemberPredicate(object): if self.is_extension: value = get_string_id(token._.get(self.attr)) else: - value = get_token_attr(token.c, self.attr) + value = get_token_attr_for_matcher(token.c, self.attr) if self.predicate == "IN": return value in self.value else: @@ -768,7 +768,7 @@ class _ComparisonPredicate(object): if self.is_extension: value = token._.get(self.attr) else: - 
value = get_token_attr(token.c, self.attr) + value = get_token_attr_for_matcher(token.c, self.attr) if self.predicate == "==": return value == self.value if self.predicate == "!=": diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 7f231887f..6536d271d 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -8,6 +8,7 @@ from ..attrs cimport attr_id_t cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil +cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil ctypedef const LexemeC* const_Lexeme_ptr diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 867c2bf6b..4dc438695 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -79,6 +79,16 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return Lexeme.get_struct_attr(token.lex, feat_name) +cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil: + if feat_name == SENT_START: + if token.sent_start == 1: + return True + else: + return False + else: + return get_token_attr(token, feat_name) + + def _get_chunker(lang): try: cls = util.get_lang_class(lang) From 74da669326eaa45d878d303643abe88cf4c84d60 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 13:01:25 +0200 Subject: [PATCH 102/105] Fix problems with lower and whitespace in variants (#5361) * Initialize lower flag explicitly * Handle whitespace words from GoldParse correctly when creating raw text with orth variants * Return the text with original casing if anything goes wrong --- spacy/gold.pyx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index e8274563f..034bba08f 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -331,6 +331,8 @@ class GoldCorpus(object): def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): if random.random() >= orth_variant_level: return raw, paragraph_tuples + raw_orig = str(raw) + lower = False if random.random() >= 0.5: lower = True if raw is not None: @@ -391,8 +393,11 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): ids, words, tags, heads, labels, ner = sent_tuples for word in words: match_found = False + # skip whitespace words + if word.isspace(): + match_found = True # add identical word - if word not in variants and raw[raw_idx:].startswith(word): + elif word not in variants and raw[raw_idx:].startswith(word): variant_raw += word raw_idx += len(word) match_found = True @@ -407,7 +412,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): # something went wrong, abort # (add a warning message?) if not match_found: - return raw, paragraph_tuples + return raw_orig, paragraph_tuples # add following whitespace while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): variant_raw += raw[raw_idx] From 8602daba85bc412918e5cca2101ec15d22b950ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Rodr=C3=ADguez=20Medina?= Date: Wed, 29 Apr 2020 21:25:22 +0200 Subject: [PATCH 103/105] Swedish like_num (#5371) * Sign contributor agreement. * Add like_num functionality to Swedish. 
* Update spacy/tests/lang/sv/test_lex_attrs.py Co-Authored-By: Sofie Van Landeghem * Update contributor agreement Co-authored-by: Sofie Van Landeghem --- .github/contributors/vondersam.md | 106 ++++++++++++++++++++++++++ spacy/lang/sv/__init__.py | 2 + spacy/lang/sv/lex_attrs.py | 62 +++++++++++++++ spacy/tests/lang/sv/test_lex_attrs.py | 33 ++++++++ 4 files changed, 203 insertions(+) create mode 100644 .github/contributors/vondersam.md create mode 100644 spacy/lang/sv/lex_attrs.py create mode 100644 spacy/tests/lang/sv/test_lex_attrs.py diff --git a/.github/contributors/vondersam.md b/.github/contributors/vondersam.md new file mode 100644 index 000000000..8add70330 --- /dev/null +++ b/.github/contributors/vondersam.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------| +| Name | Samuel Rodríguez Medina | +| Company name (if applicable) | | +| Title or role (if applicable) | Computational linguist | +| Date | 28 April 2020 | +| GitHub username | vondersam | +| Website (optional) | | diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 671eefca0..3a749eeee 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS from .morph_rules import MORPH_RULES # Punctuation stolen from Danish @@ -19,6 +20,7 @@ from .syntax_iterators import SYNTAX_ITERATORS class SwedishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "sv" lex_attr_getters[NORM] = add_lookups( Language.Defaults.lex_attr_getters[NORM], BASE_NORMS diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py new file mode 100644 index 000000000..4b5278c7b --- /dev/null +++ b/spacy/lang/sv/lex_attrs.py @@ -0,0 +1,62 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + + +_num_words = [ + "noll", + "en", + "ett", + "två", + "tre", + "fyra", + "fem", + "sex", + "sju", + "åtta", + "nio", + "tio", + "elva", + "tolv", + "tretton", + "fjorton", + "femton", + "sexton", + "sjutton", + "arton", + "nitton", + "tjugo", + "trettio", + "fyrtio", + "femtio", + "sextio", + "sjuttio", + "åttio", + "nittio", + "hundra", + "tusen", + "miljon", + "miljard", + "biljon", + "biljard", + "kvadriljon" +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/tests/lang/sv/test_lex_attrs.py b/spacy/tests/lang/sv/test_lex_attrs.py new file mode 100644 index 000000000..abe6b0f7b --- /dev/null +++ b/spacy/tests/lang/sv/test_lex_attrs.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.sv.lex_attrs import like_num + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10.000", True), + ("10.00", True), + ("999,0", True), + ("en", True), + ("två", True), + ("miljard", True), + ("hund", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(sv_tokenizer, text, match): + tokens = sv_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.parametrize("word", ["elva"]) +def test_sv_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) From 148b036e0cae9eebb6968cea5ecede1ebc7205a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samuel=20Rodr=C3=ADguez=20Medina?= Date: Thu, 30 Apr 2020 11:13:23 +0200 Subject: [PATCH 104/105] Spanish like num improvement (#5381) * Add tests for Spanish like_num. * Add missing numbers in Spanish lexical attributes for like_num. * Modify Spanish test function name. * Add contributor agreement. 
--- spacy/lang/es/lex_attrs.py | 9 +++++++++ spacy/tests/lang/es/test_text.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py index 03ada1f43..632a638fc 100644 --- a/spacy/lang/es/lex_attrs.py +++ b/spacy/lang/es/lex_attrs.py @@ -26,6 +26,15 @@ _num_words = [ "dieciocho", "diecinueve", "veinte", + "veintiuno", + "veintidós", + "veintitrés", + "veinticuatro", + "veinticinco", + "veintiséis", + "veintisiete", + "veintiocho", + "veintinueve", "treinta", "cuarenta", "cincuenta", diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index acd572b48..e237f922d 100644 --- a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +from spacy.lang.es.lex_attrs import like_num def test_es_tokenizer_handles_long_text(es_tokenizer): @@ -33,3 +34,32 @@ en Montevideo y que pregona las bondades de la vida austera.""" def test_es_tokenizer_handles_cnts(es_tokenizer, text, length): tokens = es_tokenizer(text) assert len(tokens) == length + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10.000", True), + ("1000", True), + ("999,0", True), + ("uno", True), + ("dos", True), + ("billón", True), + ("veintiséis", True), + ("perro", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(es_tokenizer, text, match): + tokens = es_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.parametrize("word", ["once"]) +def test_es_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) \ No newline at end of file From c045a9c7f637f85f7beccdae48a4cb765516d558 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 1 May 2020 12:05:33 +0200 Subject: [PATCH 105/105] Fix logic in train CLI timing eval on GPU (#5387) Run CPU timing in first iteration only --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index db58b22df..6e6423131 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -456,7 +456,7 @@ def train( gpu_wps = nwords / (end_time - start_time) # Only evaluate on CPU in the first iteration (for # timing) if GPU is enabled - if i >= 1: + if i == 0: with Model.use_device("cpu"): nlp_loaded = util.load_model_from_path(epoch_model_path) for name, component in nlp_loaded.pipeline: