diff --git a/.github/contributors/Jan-711.md b/.github/contributors/Jan-711.md new file mode 100644 index 000000000..60297640c --- /dev/null +++ b/.github/contributors/Jan-711.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. 
The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jan Jessewitsch | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 16.02.2020 | +| GitHub username | Jan-711 | +| Website (optional) | | diff --git a/.gitignore b/.gitignore index c4ad59fc7..a0af6d4d2 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,7 @@ __pycache__/ .env* .~env/ .venv +env3.6/ venv/ .dev .denv @@ -111,3 +112,6 @@ Desktop.ini # Pycharm project files *.idea + +# IPython +.ipynb_checkpoints/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e3ce53024..000000000 --- a/.travis.yml +++ /dev/null @@ -1,23 +0,0 @@ -language: python -sudo: false -cache: pip -dist: trusty -group: edge -python: - - "2.7" -os: - - linux -install: - - "pip install -r requirements.txt" - - "python setup.py build_ext --inplace" - - "pip install -e ." -script: - - "cat /proc/cpuinfo | grep flags | head -n 1" - - "python -m pytest --tb=native spacy" -branches: - except: - - spacy.io -notifications: - slack: - secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ= - email: false diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3c2b56cd3..6b7881dd2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -280,23 +280,7 @@ except: # noqa: E722 ### Python conventions -All Python code must be written in an **intersection of Python 2 and Python 3**. -This is easy in Cython, but somewhat ugly in Python. Logic that deals with -Python or platform compatibility should only live in -[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin -functions, replacement functions are suffixed with an underscore, for example -`unicode_`. If you need to access the user's version or platform information, -for example to show more specific error messages, you can use the `is_config()` -helper function. 
- -```python -from .compat import unicode_, is_config - -compatible_unicode = unicode_('hello world') -if is_config(windows=True, python2=True): - print("You are using Python 2 on Windows.") -``` - +All Python code must be **compatible with Python 3.6+**. Code that interacts with the file-system should accept objects that follow the `pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`. If the function is user-facing and takes a path as an argument, it should check diff --git a/README.md b/README.md index 31dc78d63..500431b9f 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license. [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines]()](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) -[![Travis Build Status]()](https://travis-ci.org/explosion/spaCy) [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases) [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/) [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy) @@ -98,12 +97,19 @@ For detailed installation instructions, see the - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual Studio) -- **Python version**: Python 2.7, 3.5+ (only 64 bit) +- **Python version**: Python 3.6+ (only 64 bit) - **Package managers**: [pip] · [conda] (via `conda-forge`) [pip]: https://pypi.org/project/spacy/ [conda]: https://anaconda.org/conda-forge/spacy +> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary +> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI +> providers and other tooling to support it. This means that in order to run +> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile +> the library and its Cython dependencies locally. If this is causing problems +> for you, the easiest solution is to **use Python 3.7** in the meantime. + ### pip Using pip, spaCy releases are available as source packages and binary wheels (as @@ -262,9 +268,7 @@ and git preinstalled. Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that -matches the version that was used to compile your Python interpreter. For -official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and -VS 2015 (Python 3.5). +matches the version that was used to compile your Python interpreter. ## Run tests diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 779037c96..5a5e8f03a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -35,12 +35,6 @@ jobs: dependsOn: 'Validate' strategy: matrix: - Python35Linux: - imageName: 'ubuntu-16.04' - python.version: '3.5' - Python35Windows: - imageName: 'vs2017-win2016' - python.version: '3.5' Python36Linux: imageName: 'ubuntu-16.04' python.version: '3.6' diff --git a/bin/cythonize.py b/bin/cythonize.py deleted file mode 100755 index 4814f8df0..000000000 --- a/bin/cythonize.py +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env python -""" cythonize.py - -Cythonize pyx files into C++ files as needed.
- -Usage: cythonize.py [root] - -Checks pyx files to see if they have been changed relative to their -corresponding C++ files. If they have, then runs cython on these files to -recreate the C++ files. - -Additionally, checks pxd files and setup.py if they have been changed. If -they have, rebuilds everything. - -Change detection based on file hashes stored in JSON format. - -For now, this script should be run by developers when changing Cython files -and the resulting C++ files checked in, so that end-users (and Python-only -developers) do not get the Cython dependencies. - -Based upon: - -https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py -https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py - -Note: this script does not check any of the dependent C++ libraries. -""" -from __future__ import print_function - -import os -import sys -import json -import hashlib -import subprocess -import argparse - - -HASH_FILE = "cythonize.json" - - -def process_pyx(fromfile, tofile, language_level="-2"): - print("Processing %s" % fromfile) - try: - from Cython.Compiler.Version import version as cython_version - from distutils.version import LooseVersion - - if LooseVersion(cython_version) < LooseVersion("0.19"): - raise Exception("Require Cython >= 0.19") - - except ImportError: - pass - - flags = ["--fast-fail", language_level] - if tofile.endswith(".cpp"): - flags += ["--cplus"] - - try: - try: - r = subprocess.call( - ["cython"] + flags + ["-o", tofile, fromfile], env=os.environ - ) # See Issue #791 - if r != 0: - raise Exception("Cython failed") - except OSError: - # There are ways of installing Cython that don't result in a cython - # executable on the path, see gh-2397. - r = subprocess.call( - [ - sys.executable, - "-c", - "import sys; from Cython.Compiler.Main import " - "setuptools_main as main; sys.exit(main())", - ] - + flags - + ["-o", tofile, fromfile] - ) - if r != 0: - raise Exception("Cython failed") - except OSError: - raise OSError("Cython needs to be installed") - - -def preserve_cwd(path, func, *args): - orig_cwd = os.getcwd() - try: - os.chdir(path) - func(*args) - finally: - os.chdir(orig_cwd) - - -def load_hashes(filename): - try: - return json.load(open(filename)) - except (ValueError, IOError): - return {} - - -def save_hashes(hash_db, filename): - with open(filename, "w") as f: - f.write(json.dumps(hash_db)) - - -def get_hash(path): - return hashlib.md5(open(path, "rb").read()).hexdigest() - - -def hash_changed(base, path, db): - full_path = os.path.normpath(os.path.join(base, path)) - return not get_hash(full_path) == db.get(full_path) - - -def hash_add(base, path, db): - full_path = os.path.normpath(os.path.join(base, path)) - db[full_path] = get_hash(full_path) - - -def process(base, filename, db): - root, ext = os.path.splitext(filename) - if ext in [".pyx", ".cpp"]: - if hash_changed(base, filename, db) or not os.path.isfile( - os.path.join(base, root + ".cpp") - ): - preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp") - hash_add(base, root + ".cpp", db) - hash_add(base, root + ".pyx", db) - - -def check_changes(root, db): - res = False - new_db = {} - - setup_filename = "setup.py" - hash_add(".", setup_filename, new_db) - if hash_changed(".", setup_filename, db): - res = True - - for base, _, files in os.walk(root): - for filename in files: - if filename.endswith(".pxd"): - hash_add(base, filename, new_db) - if hash_changed(base, filename, db): - res = True - - if res: - db.clear() - db.update(new_db) - return res - - -def 
run(root): - db = load_hashes(HASH_FILE) - - try: - check_changes(root, db) - for base, _, files in os.walk(root): - for filename in files: - process(base, filename, db) - finally: - save_hashes(db, HASH_FILE) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Cythonize pyx files into C++ files as needed" - ) - parser.add_argument("root", help="root directory") - args = parser.parse_args() - run(args.root) diff --git a/bin/ud/ud_run_test.py b/bin/ud/ud_run_test.py index 7cb270d84..70c6be0d0 100644 --- a/bin/ud/ud_run_test.py +++ b/bin/ud/ud_run_test.py @@ -13,23 +13,12 @@ import srsly import spacy import spacy.util from spacy.tokens import Token, Doc -from spacy.gold import GoldParse -from spacy.util import compounding, minibatch_by_words -from spacy.syntax.nonproj import projectivize from spacy.matcher import Matcher -# from spacy.morphology import Fused_begin, Fused_inside -from spacy import displacy -from collections import defaultdict, Counter -from timeit import default_timer as timer Fused_begin = None Fused_inside = None -import itertools -import random -import numpy.random - from . import conll17_ud_eval from spacy import lang @@ -268,7 +257,7 @@ def load_nlp(experiments_dir, corpus): return nlp -def initialize_pipeline(nlp, docs, golds, config, device): +def initialize_pipeline(nlp, examples, config, device): nlp.add_pipe(nlp.create_pipe("parser")) return nlp diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index 6353bd6e7..bda22088d 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -14,7 +14,7 @@ import spacy import spacy.util from bin.ud import conll17_ud_eval from spacy.tokens import Token, Doc -from spacy.gold import GoldParse +from spacy.gold import GoldParse, Example from spacy.util import compounding, minibatch, minibatch_by_words from spacy.syntax.nonproj import projectivize from spacy.matcher import Matcher @@ -53,7 +53,7 @@ def read_data( max_doc_length=None, limit=None, ): - """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, + """Read the CONLLU format into Example objects. If raw_text=True, include Doc objects created using nlp.make_doc and then aligned against the gold-standard sequences. If oracle_segments=True, include Doc objects created from the gold-standard segments. 
At least one must be True.""" @@ -98,15 +98,16 @@ def read_data( docs.append(doc) golds.append(gold) if limit and len(docs) >= limit: - return docs, golds + return golds_to_gold_data(docs, golds) if raw_text and sent_annots: doc, gold = _make_gold(nlp, None, sent_annots) docs.append(doc) golds.append(gold) if limit and len(docs) >= limit: - return docs, golds - return docs, golds + return golds_to_gold_data(docs, golds) + return golds_to_gold_data(docs, golds) + def _parse_morph_string(morph_string): if morph_string == '_': @@ -120,6 +121,7 @@ def _parse_morph_string(morph_string): output.append('%s_%s' % (key, value.lower())) return set(output) + def read_conllu(file_): docs = [] sent = [] @@ -180,16 +182,18 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0): ############################# -def golds_to_gold_tuples(docs, golds): - """Get out the annoying 'tuples' format used by begin_training, given the +def golds_to_gold_data(docs, golds): + """Get out the training data format used by begin_training, given the GoldParse objects.""" - tuples = [] + data = [] for doc, gold in zip(docs, golds): - text = doc.text - ids, words, tags, heads, labels, iob = zip(*gold.orig_annot) - sents = [((ids, words, tags, heads, labels, iob), [])] - tuples.append((text, sents)) - return tuples + example = Example(doc=doc) + example.add_doc_annotation(cats=gold.cats) + token_annotation_dict = gold.orig.to_dict() + example.add_token_annotation(**token_annotation_dict) + example.goldparse = gold + data.append(example) + return data ############## @@ -327,7 +331,6 @@ def get_token_conllu(token, i): return "\n".join(lines) - ################## # Initialization # ################## @@ -348,7 +351,7 @@ def load_nlp(corpus, config, vectors=None): return nlp -def initialize_pipeline(nlp, docs, golds, config, device): +def initialize_pipeline(nlp, examples, config, device): nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False})) nlp.add_pipe(nlp.create_pipe("morphologizer")) nlp.add_pipe(nlp.create_pipe("parser")) @@ -356,14 +359,15 @@ def initialize_pipeline(nlp, examples, config, device): nlp.parser.add_multitask_objective("tag") if config.multitask_sent: nlp.parser.add_multitask_objective("sent_start") - for gold in golds: + for ex in examples: + gold = ex.gold for tag in gold.tags: if tag is not None: nlp.tagger.add_label(tag) if torch is not None and device != -1: torch.set_default_tensor_type("torch.cuda.FloatTensor") optimizer = nlp.begin_training( - lambda: golds_to_gold_tuples(docs, golds), + lambda: examples, device=device, subword_features=config.subword_features, conv_depth=config.conv_depth, @@ -491,6 +495,6 @@ def main( Token.set_extension("begins_fused", default=False) Token.set_extension("inside_fused", default=False) spacy.util.fix_random_seed() lang.zh.Chinese.Defaults.use_jieba = False lang.ja.Japanese.Defaults.use_janome = False @@ -505,7 +513,7 @@ def main( print("Train and evaluate", corpus, "using lang", paths.lang) nlp = load_nlp(paths.lang, config, vectors=vectors_dir) - docs, golds = read_data( + examples = read_data( nlp, paths.train.conllu.open(encoding="utf8"), paths.train.text.open(encoding="utf8"), @@ -513,12 +521,12 @@ limit=limit, ) - optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device) + optimizer = initialize_pipeline(nlp, examples, config, gpu_device)
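The conversion at the heart of this patch is visible in `golds_to_gold_data` above: the old `(docs, golds)` parallel lists become a single list of `Example` objects that is handed to `begin_training` and `nlp.update`. A minimal sketch of that wrapping, using only the `Example` methods that appear in this diff (`add_doc_annotation`, `add_token_annotation`, and the `doc`/`gold` attributes) — this is the in-development API from this patch, not a released spaCy version:

```python
from spacy.gold import Example

def to_examples(docs, golds):
    """Wrap (Doc, GoldParse) pairs the way golds_to_gold_data does."""
    examples = []
    for doc, gold in zip(docs, golds):
        example = Example(doc=doc)
        example.add_doc_annotation(cats=gold.cats)           # doc-level labels
        example.add_token_annotation(**gold.orig.to_dict())  # token-level annotation
        example.goldparse = gold
        examples.append(example)
    return examples
```

Downstream code then iterates `Example` objects instead of parallel lists — e.g. `sum(len(ex.doc) for ex in examples)` for the word count, and `nlp.update(batch, sgd=optimizer, losses=losses)` for the update step.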
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001) beam_prob = compounding(0.2, 0.8, 1.001) for i in range(config.nr_epoch): - docs, golds = read_data( + examples = read_data( nlp, paths.train.conllu.open(encoding="utf8"), paths.train.text.open(encoding="utf8"), @@ -527,22 +535,19 @@ def main( oracle_segments=use_oracle_segments, raw_text=not use_oracle_segments, ) - Xs = list(zip(docs, golds)) - random.shuffle(Xs) + random.shuffle(examples) if config.batch_by_words: - batches = minibatch_by_words(Xs, size=batch_sizes) + batches = minibatch_by_words(examples, size=batch_sizes) else: - batches = minibatch(Xs, size=batch_sizes) + batches = minibatch(examples, size=batch_sizes) losses = {} - n_train_words = sum(len(doc) for doc in docs) + n_train_words = sum(len(ex.doc) for ex in examples) with tqdm.tqdm(total=n_train_words, leave=False) as pbar: for batch in batches: - batch_docs, batch_gold = zip(*batch) - pbar.update(sum(len(doc) for doc in batch_docs)) + pbar.update(sum(len(ex.doc) for ex in batch)) nlp.parser.cfg["beam_update_prob"] = next(beam_prob) nlp.update( - batch_docs, - batch_gold, + batch, sgd=optimizer, drop=config.dropout, losses=losses, diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index 7778fc701..8691308e0 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -46,7 +46,7 @@ def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_fre " cf. https://spacy.io/usage/models#languages." ) - logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq)) + logger.info("Filtering entities with fewer than {} mentions or no description".format(min_entity_freq)) entity_frequencies = io.read_entity_to_count(entity_freq_path) # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities( diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index af08d6b8f..d98bba565 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -4,12 +4,12 @@ from random import shuffle import logging import numpy as np -from spacy._ml import zero_init, create_default_optimizer -from spacy.cli.pretrain import get_cossim_loss - -from thinc.v2v import Model +from thinc.model import Model from thinc.api import chain -from thinc.neural._classes.affine import Affine +from thinc.loss import CosineDistance +from thinc.layers import Linear + +from spacy.util import create_default_optimizer logger = logging.getLogger(__name__) @@ -34,6 +34,7 @@ class EntityEncoder: self.input_dim = input_dim self.desc_width = desc_width self.epochs = epochs + self.distance = CosineDistance(ignore_zeros=True, normalize=False) def apply_encoder(self, description_list): if self.encoder is None: @@ -132,21 +133,17 @@ class EntityEncoder: def _build_network(self, orig_width, hidden_with): with Model.define_operators({">>": chain}): # very simple encoder-decoder model - self.encoder = Affine(hidden_with, orig_width) - self.model = self.encoder >> zero_init( - Affine(orig_width, hidden_with, drop_factor=0.0) - ) - self.sgd = create_default_optimizer(self.model.ops) + self.encoder = Linear(hidden_with, orig_width) + # TODO: removed the zero_init here - is oK? 
+ self.model = self.encoder >> Linear(orig_width, hidden_with) + self.sgd = create_default_optimizer() def _update(self, vectors): + truths = self.model.ops.asarray(vectors) predictions, bp_model = self.model.begin_update( - np.asarray(vectors), drop=self.DROP + truths, drop=self.DROP ) - loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors)) + d_scores, loss = self.distance(predictions, truths) bp_model(d_scores, sgd=self.sgd) return loss / len(vectors) - @staticmethod - def _get_loss(golds, scores): - loss, gradients = get_cossim_loss(scores, golds) - return loss, gradients diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py index 54f00fc6f..386af7d4d 100644 --- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py +++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py @@ -17,7 +17,13 @@ import plac from tqdm import tqdm from bin.wiki_entity_linking import wikipedia_processor -from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_MODEL_DIR, KB_FILE, LOG_FORMAT, OUTPUT_MODEL_DIR +from bin.wiki_entity_linking import ( + TRAINING_DATA_FILE, + KB_MODEL_DIR, + KB_FILE, + LOG_FORMAT, + OUTPUT_MODEL_DIR, +) from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance from bin.wiki_entity_linking.kb_creator import read_kb @@ -48,10 +54,12 @@ def main( l2=1e-6, train_articles=None, dev_articles=None, - labels_discard=None + labels_discard=None, ): if not output_dir: - logger.warning("No output dir specified so no results will be written, are you sure about this ?") + logger.warning( + "No output dir specified so no results will be written, are you sure about this?" + ) logger.info("Creating Entity Linker with Wikipedia and WikiData") @@ -68,7 +76,11 @@ def main( # STEP 1 : load the NLP object logger.info("STEP 1a: Loading model from {}".format(nlp_dir)) nlp = spacy.load(nlp_dir) - logger.info("Original NLP pipeline has following pipeline components: {}".format(nlp.pipe_names)) + logger.info( + "Original NLP pipeline has following pipeline components: {}".format( + nlp.pipe_names + ) + ) # check that there is a NER component in the pipeline if "ner" not in nlp.pipe_names: @@ -79,25 +91,42 @@ def main( # STEP 2: read the training dataset previously created from WP logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path)) - train_indices, dev_indices = wikipedia_processor.read_training_indices(training_path) - logger.info("Training set has {} articles, limit set to roughly {} articles per epoch" - .format(len(train_indices), train_articles if train_articles else "all")) - logger.info("Dev set has {} articles, limit set to rougly {} articles for evaluation" - .format(len(dev_indices), dev_articles if dev_articles else "all")) + train_indices, dev_indices = wikipedia_processor.read_training_indices( + training_path + ) + logger.info( + "Training set has {} articles, limit set to roughly {} articles per epoch".format( + len(train_indices), train_articles if train_articles else "all" + ) + ) + logger.info( + "Dev set has {} articles, limit set to roughly {} articles for evaluation".format( + len(dev_indices), dev_articles if dev_articles else "all" + ) + ) if dev_articles: dev_indices = dev_indices[0:dev_articles] # STEP 3: create and train an entity linking pipe - logger.info("STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(epochs)) + logger.info( + "STEP 3: Creating and training an Entity Linking pipe for {} 
epochs".format( + epochs + ) + ) if labels_discard: labels_discard = [x.strip() for x in labels_discard.split(",")] - logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard)) + logger.info( + "Discarding {} NER types: {}".format(len(labels_discard), labels_discard) + ) else: labels_discard = [] el_pipe = nlp.create_pipe( - name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name, - "labels_discard": labels_discard} + name="entity_linker", + config={ + "pretrained_vectors": nlp.vocab.vectors, + "labels_discard": labels_discard, + }, ) el_pipe.set_kb(kb) nlp.add_pipe(el_pipe, last=True) @@ -109,11 +138,18 @@ def main( optimizer.L2 = l2 logger.info("Dev Baseline Accuracies:") - dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path, - dev=True, line_ids=dev_indices, - kb=kb, labels_discard=labels_discard) + dev_data = wikipedia_processor.read_el_docs_golds( + nlp=nlp, + entity_file_path=training_path, + dev=True, + line_ids=dev_indices, + kb=kb, + labels_discard=labels_discard, + ) - measure_performance(dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices)) + measure_performance( + dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices) + ) for itn in range(epochs): random.shuffle(train_indices) @@ -127,13 +163,18 @@ def main( if train_articles: bar_total = train_articles - with tqdm(total=bar_total, leave=False, desc='Epoch ' + str(itn)) as pbar: + with tqdm(total=bar_total, leave=False, desc=f"Epoch {itn}") as pbar: for batch in batches: if not train_articles or articles_processed < train_articles: with nlp.disable_pipes("entity_linker"): - train_batch = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path, - dev=False, line_ids=batch, - kb=kb, labels_discard=labels_discard) + train_batch = wikipedia_processor.read_el_docs_golds( + nlp=nlp, + entity_file_path=training_path, + dev=False, + line_ids=batch, + kb=kb, + labels_discard=labels_discard, + ) docs, golds = zip(*train_batch) try: with nlp.disable_pipes(*other_pipes): @@ -150,17 +191,36 @@ def main( except Exception as e: logger.error("Error updating batch:" + str(e)) if batchnr > 0: - logging.info("Epoch {} trained on {} articles, train loss {}" - .format(itn, articles_processed, round(losses["entity_linker"] / batchnr, 2))) + logging.info( + "Epoch {} trained on {} articles, train loss {}".format( + itn, articles_processed, round(losses["entity_linker"] / batchnr, 2) + ) + ) # re-read the dev_data (data is returned as a generator) - dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path, - dev=True, line_ids=dev_indices, - kb=kb, labels_discard=labels_discard) - measure_performance(dev_data, kb, el_pipe, baseline=False, context=True, dev_limit=len(dev_indices)) + dev_data = wikipedia_processor.read_el_docs_golds( + nlp=nlp, + entity_file_path=training_path, + dev=True, + line_ids=dev_indices, + kb=kb, + labels_discard=labels_discard, + ) + measure_performance( + dev_data, + kb, + el_pipe, + baseline=False, + context=True, + dev_limit=len(dev_indices), + ) if output_dir: # STEP 4: write the NLP pipeline (now including an EL model) to file - logger.info("Final NLP pipeline has following pipeline components: {}".format(nlp.pipe_names)) + logger.info( + "Final NLP pipeline has following pipeline components: {}".format( + nlp.pipe_names + ) + ) logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir)) nlp.to_disk(nlp_output_dir) diff 
--git a/examples/deep_learning_keras.py b/examples/deep_learning_keras.py index 049cc0be4..bf857b8b7 100644 --- a/examples/deep_learning_keras.py +++ b/examples/deep_learning_keras.py @@ -14,7 +14,7 @@ pip install keras==2.0.9 Compatible with: spaCy v2.0.0+ """ - +import ml_datasets import plac import random import pathlib @@ -24,7 +24,6 @@ from keras.models import Sequential, model_from_json from keras.layers import LSTM, Dense, Embedding, Bidirectional from keras.layers import TimeDistributed from keras.optimizers import Adam -import thinc.extra.datasets from spacy.compat import pickle import spacy @@ -224,7 +223,7 @@ def main( if model_dir is not None: model_dir = pathlib.Path(model_dir) if train_dir is None or dev_dir is None: - imdb_data = thinc.extra.datasets.imdb() + imdb_data = ml_datasets.imdb() if is_runtime: if dev_dir is None: dev_texts, dev_labels = zip(*imdb_data[1]) diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg new file mode 100644 index 000000000..8cd150868 --- /dev/null +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -0,0 +1,63 @@ +[training] +patience = 10000 +eval_frequency = 200 +dropout = 0.2 +init_tok2vec = null +vectors = null +max_epochs = 100 +orth_variant_level = 0.0 +gold_preproc = true +max_length = 0 +use_gpu = 0 +scores = ["tags_acc", "uas", "las"] +score_weights = {"las": 0.8, "tags_acc": 0.2} +limit = 0 + +[training.batch_size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.parser] +factory = "parser" + +[nlp.pipeline.tagger.model] +@architectures = "tagger_model.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.parser.model] +@architectures = "transition_based_parser.v1" +nr_feature_tokens = 8 +hidden_width = 64 +maxout_pieces = 3 + +[nlp.pipeline.parser.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "hash_embed_bilstm.v1" +pretrained_vectors = ${nlp:vectors} +width = 96 +depth = 4 +embed_size = 2000 diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg new file mode 100644 index 000000000..6735284a7 --- /dev/null +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -0,0 +1,65 @@ +[training] +patience = 10000 +eval_frequency = 200 +dropout = 0.2 +init_tok2vec = null +vectors = null +max_epochs = 100 +orth_variant_level = 0.0 +gold_preproc = true +max_length = 0 +use_gpu = -1 +scores = ["tags_acc", "uas", "las"] +score_weights = {"las": 0.8, "tags_acc": 0.2} +limit = 0 + +[training.batch_size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.parser] +factory = "parser" + +[nlp.pipeline.tagger.model] +@architectures = "tagger_model.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = 
${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.parser.model] +@architectures = "transition_based_parser.v1" +nr_feature_tokens = 8 +hidden_width = 64 +maxout_pieces = 3 + +[nlp.pipeline.parser.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "hash_embed_cnn.v1" +pretrained_vectors = ${nlp:vectors} +width = 96 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py index f0e437acf..e4aca7912 100644 --- a/examples/pipeline/multi_processing.py +++ b/examples/pipeline/multi_processing.py @@ -13,9 +13,10 @@ Prerequisites: pip install joblib from __future__ import print_function, unicode_literals from pathlib import Path + +import ml_datasets from joblib import Parallel, delayed from functools import partial -import thinc.extra.datasets import plac import spacy from spacy.util import minibatch @@ -35,7 +36,7 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10 output_dir.mkdir() # load and pre-process the IMBD dataset print("Loading IMDB data...") - data, _ = thinc.extra.datasets.imdb() + data, _ = ml_datasets.imdb() texts, _ = zip(*data[-limit:]) print("Processing texts...") partitions = minibatch(texts, size=batch_size) diff --git a/examples/streamlit_spacy.py b/examples/streamlit_spacy.py index 1afa1cd32..a2da123c2 100644 --- a/examples/streamlit_spacy.py +++ b/examples/streamlit_spacy.py @@ -26,12 +26,12 @@ DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook." HTML_WRAPPER = """
<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>
""" -@st.cache(ignore_hash=True) +@st.cache(allow_output_mutation=True) def load_model(name): return spacy.load(name) -@st.cache(ignore_hash=True) +@st.cache(allow_output_mutation=True) def process_text(model_name, text): nlp = load_model(model_name) return nlp(text) @@ -79,7 +79,9 @@ if "ner" in nlp.pipe_names: st.header("Named Entities") st.sidebar.header("Named Entities") label_set = nlp.get_pipe("ner").labels - labels = st.sidebar.multiselect("Entity labels", label_set, label_set) + labels = st.sidebar.multiselect( + "Entity labels", options=label_set, default=list(label_set) + ) html = displacy.render(doc, style="ent", options={"ents": labels}) # Newlines seem to mess with the rendering html = html.replace("\n", " ") diff --git a/examples/training/conllu.py b/examples/training/conllu.py index 1c65f4a72..bf47be72a 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -12,7 +12,7 @@ import tqdm import spacy import spacy.util from spacy.tokens import Token, Doc -from spacy.gold import GoldParse +from spacy.gold import GoldParse, Example from spacy.syntax.nonproj import projectivize from collections import defaultdict from spacy.matcher import Matcher @@ -33,25 +33,25 @@ random.seed(0) numpy.random.seed(0) -def minibatch_by_words(items, size=5000): - random.shuffle(items) +def minibatch_by_words(examples, size=5000): + random.shuffle(examples) if isinstance(size, int): size_ = itertools.repeat(size) else: size_ = size - items = iter(items) + examples = iter(examples) while True: batch_size = next(size_) batch = [] while batch_size >= 0: try: - doc, gold = next(items) + example = next(examples) except StopIteration: if batch: yield batch return - batch_size -= len(doc) - batch.append((doc, gold)) + batch_size -= len(example.doc) + batch.append(example) if batch: yield batch else: @@ -78,7 +78,7 @@ def read_data( max_doc_length=None, limit=None, ): - """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, + """Read the CONLLU format into Example objects. If raw_text=True, include Doc objects created using nlp.make_doc and then aligned against the gold-standard sequences. If oracle_segments=True, include Doc objects created from the gold-standard segments. 
At least one must be True.""" @@ -119,15 +119,15 @@ def read_data( docs.append(doc) golds.append(gold) if limit and len(docs) >= limit: - return docs, golds + return golds_to_gold_data(docs, golds) if raw_text and sent_annots: doc, gold = _make_gold(nlp, None, sent_annots) docs.append(doc) golds.append(gold) if limit and len(docs) >= limit: - return docs, golds - return docs, golds + return golds_to_gold_data(docs, golds) + return golds_to_gold_data(docs, golds) def read_conllu(file_): @@ -181,16 +181,18 @@ def _make_gold(nlp, text, sent_annots): ############################# -def golds_to_gold_tuples(docs, golds): - """Get out the annoying 'tuples' format used by begin_training, given the +def golds_to_gold_data(docs, golds): + """Get out the training data format used by begin_training, given the GoldParse objects.""" - tuples = [] + data = [] for doc, gold in zip(docs, golds): - text = doc.text - ids, words, tags, heads, labels, iob = zip(*gold.orig_annot) - sents = [((ids, words, tags, heads, labels, iob), [])] - tuples.append((text, sents)) - return tuples + example = Example(doc=doc) + example.add_doc_annotation(cats=gold.cats) + token_annotation_dict = gold.orig.to_dict() + example.add_token_annotation(**token_annotation_dict) + example.goldparse = gold + data.append(example) + return data ############## @@ -303,7 +305,7 @@ def load_nlp(corpus, config): return nlp -def initialize_pipeline(nlp, docs, golds, config): +def initialize_pipeline(nlp, examples, config): nlp.add_pipe(nlp.create_pipe("parser")) if config.multitask_tag: nlp.parser.add_multitask_objective("tag") @@ -311,18 +313,19 @@ nlp.parser.add_multitask_objective("sent_start") nlp.parser.moves.add_action(2, "subtok") nlp.add_pipe(nlp.create_pipe("tagger")) - for gold in golds: - for tag in gold.tags: + for ex in examples: + for tag in ex.gold.tags: if tag is not None: nlp.tagger.add_label(tag) # Replace labels that didn't make the frequency cutoff actions = set(nlp.parser.labels) label_set = set([act.split("-")[1] for act in actions if "-" in act]) - for gold in golds: + for ex in examples: + gold = ex.gold for i, label in enumerate(gold.labels): if label is not None and label not in label_set: gold.labels[i] = label.split("||")[0] - return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds)) + return nlp.begin_training(lambda: examples) ######################## @@ -391,13 +394,13 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): Token.set_extension("begins_fused", default=False) Token.set_extension("inside_fused", default=False) paths = TreebankPaths(ud_dir, corpus) if not (parses_dir / corpus).exists(): (parses_dir / corpus).mkdir() print("Train and evaluate", corpus, "using lang", paths.lang) nlp = load_nlp(paths.lang, config) - docs, golds = read_data( + examples = read_data( nlp, paths.train.conllu.open(encoding="utf8"), paths.train.text.open(encoding="utf8"), @@ -405,23 +412,18 @@ limit=limit, ) - optimizer = initialize_pipeline(nlp, docs, golds, config) + optimizer = initialize_pipeline(nlp, examples, config) for i in range(config.nr_epoch): - docs = [nlp.make_doc(doc.text) for doc in docs] - batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size) + docs = [nlp.make_doc(example.doc.text) for example in 
examples] + batches = minibatch_by_words(examples, size=config.batch_size) losses = {} n_train_words = sum(len(doc) for doc in docs) with tqdm.tqdm(total=n_train_words, leave=False) as pbar: for batch in batches: - batch_docs, batch_gold = zip(*batch) - pbar.update(sum(len(doc) for doc in batch_docs)) + pbar.update(sum(len(ex.doc) for ex in batch)) nlp.update( - batch_docs, - batch_gold, - sgd=optimizer, - drop=config.dropout, - losses=losses, + examples=batch, sgd=optimizer, drop=config.dropout, losses=losses, ) out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) diff --git a/examples/training/ner_multitask_objective.py b/examples/training/ner_multitask_objective.py index 4bf7a008f..7561d4877 100644 --- a/examples/training/ner_multitask_objective.py +++ b/examples/training/ner_multitask_objective.py @@ -31,14 +31,13 @@ random.seed(0) PWD = os.path.dirname(__file__) -TRAIN_DATA = list(read_json_file( - os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json"))) +TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json"))) -def get_position_label(i, words, tags, heads, labels, ents): +def get_position_label(i, token_annotation): """Return labels indicating the position of the word in the document. """ - if len(words) < 20: + if len(token_annotation.words) < 20: return "short-doc" elif i == 0: return "first-word" @@ -46,7 +45,7 @@ def get_position_label(i, words, tags, heads, labels, ents): return "early-word" elif i < 20: return "mid-word" - elif i == len(words) - 1: + elif i == len(token_annotation.words) - 1: return "last-word" else: return "late-word" @@ -60,17 +59,17 @@ def main(n_iter=10): print(nlp.pipeline) print("Create data", len(TRAIN_DATA)) - optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA) + optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA) for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} - for text, annot_brackets in TRAIN_DATA: - for annotations, _ in annot_brackets: - doc = Doc(nlp.vocab, words=annotations[1]) - gold = GoldParse.from_annot_tuples(doc, annotations) + for example in TRAIN_DATA: + for token_annotation in example.token_annotations: + doc = Doc(nlp.vocab, words=token_annotation.words) + gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation) + nlp.update( - [doc], # batch of texts - [gold], # batch of annotations + examples=[(doc, gold)], # 1 example drop=0.2, # dropout - make it harder to memorise data sgd=optimizer, # callable to update weights losses=losses, @@ -78,9 +77,9 @@ def main(n_iter=10): print(losses.get("nn_labeller", 0.0), losses["ner"]) # test the trained model - for text, _ in TRAIN_DATA: - if text is not None: - doc = nlp(text) + for example in TRAIN_DATA: + if example.text is not None: + doc = nlp(example.text) print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index f3e493f6a..85d36fd66 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -16,16 +16,18 @@ the development labels, after all --- only the unlabelled text. 
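The import hunk below makes a swap that recurs across all the example scripts in this patch: the dataset loaders move from `thinc.extra.datasets` to the standalone `ml_datasets` package. A minimal sketch of the replacement loader call, assuming `ml_datasets` is installed — `imdb()` returns train and eval splits of `(text, label)` pairs, which is how the surrounding code unpacks it:

```python
import ml_datasets

# Drop-in replacement for the removed thinc.extra.datasets.imdb()
train_data, eval_data = ml_datasets.imdb()
train_texts, train_labels = zip(*train_data)
```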
import plac import tqdm import random + +import ml_datasets + import spacy -import thinc.extra.datasets from spacy.util import minibatch, use_gpu, compounding -from spacy._ml import Tok2Vec from spacy.pipeline import TextCategorizer +from spacy.ml.tok2vec import Tok2Vec import numpy def load_texts(limit=0): - train, dev = thinc.extra.datasets.imdb() + train, dev = ml_datasets.imdb() train_texts, train_labels = zip(*train) dev_texts, dev_labels = zip(*train) train_texts = list(train_texts) @@ -41,7 +43,7 @@ def load_texts(limit=0): def load_textcat_data(limit=0): """Load data from the IMDB dataset.""" # Partition off part of the train data for evaluation - train_data, eval_data = thinc.extra.datasets.imdb() + train_data, eval_data = ml_datasets.imdb() random.shuffle(train_data) train_data = train_data[-limit:] texts, labels = zip(*train_data) @@ -63,17 +65,15 @@ def prefer_gpu(): def build_textcat_model(tok2vec, nr_class, width): - from thinc.v2v import Model, Softmax, Maxout - from thinc.api import flatten_add_lengths, chain - from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool - from thinc.misc import Residual, LayerNorm - from spacy._ml import logistic, zero_init + from thinc.model import Model + from thinc.layers import Softmax, chain, reduce_mean + from thinc.layers import list2ragged with Model.define_operators({">>": chain}): model = ( tok2vec - >> flatten_add_lengths - >> Pooling(mean_pool) + >> list2ragged() + >> reduce_mean() >> Softmax(nr_class, width) ) model.tok2vec = tok2vec @@ -81,7 +81,7 @@ def build_textcat_model(tok2vec, nr_class, width): def block_gradients(model): - from thinc.api import wrap + from thinc.api import wrap # TODO FIX def forward(X, drop=0.0): Y, _ = model.begin_update(X, drop=drop) @@ -114,7 +114,7 @@ def train_tensorizer(nlp, texts, dropout, n_iter): losses = {} for i, batch in enumerate(minibatch(tqdm.tqdm(texts))): docs = [nlp.make_doc(text) for text in batch] - tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout) + tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout) print(losses) return optimizer @@ -143,8 +143,7 @@ def train_textcat(nlp, n_texts, n_iter=10): # batch up the examples using spaCy's minibatch batches = minibatch(tqdm.tqdm(train_data), size=2) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses) + nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index 9ece91427..a0455c0a9 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -58,7 +58,7 @@ def main(model_name, unlabelled_loc): # yet, but I'm getting weird results from Adam. Try commenting out the # nlp.update(), and using Adam -- you'll find the models drift apart. # I guess Adam is losing precision, introducing gradient noise? 
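The change just below tracks a thinc 8 rename: the optimizer's learning rate moves from `optimizer.alpha` to `optimizer.learn_rate`, while the Adam moment attributes `b1`/`b2` keep their names. A minimal sketch of the new spelling, using a blank pipeline purely for illustration:

```python
import spacy

nlp = spacy.blank("en")
optimizer = nlp.begin_training()
optimizer.learn_rate = 0.1  # thinc 8 name; was optimizer.alpha in thinc 7
optimizer.b1 = 0.0          # Adam first-moment term, name unchanged
optimizer.b2 = 0.0          # Adam second-moment term, name unchanged
```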
- optimizer.alpha = 0.1 + optimizer.learn_rate = 0.1 optimizer.b1 = 0.0 optimizer.b2 = 0.0 @@ -75,8 +75,7 @@ def main(model_name, unlabelled_loc): # batch up the examples using spaCy's minibatch raw_batches = minibatch(raw_docs, size=4) for batch in minibatch(TRAIN_DATA, size=sizes): - docs, golds = zip(*batch) - nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses) + nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses) raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses) print("Losses", losses) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index dd7c3a1b2..9776ad351 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -17,7 +17,7 @@ import plac import random from pathlib import Path -from spacy.symbols import PERSON +import srsly from spacy.vocab import Vocab import spacy @@ -68,7 +68,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): vocab = Vocab().from_disk(vocab_path) # create blank Language class with correct vocab nlp = spacy.blank("en", vocab=vocab) - nlp.vocab.vectors.name = "spacy_pretrained_vectors" + nlp.vocab.vectors.name = "nel_vectors" print("Created blank 'en' model with vocab from '%s'" % vocab_path) # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy. @@ -93,7 +93,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): nlp.add_pipe(entity_linker, last=True) # Convert the texts to docs to make sure we have doc.ents set for the training examples. - # Also ensure that the annotated examples correspond to known identifiers in the knowlege base. + # Also ensure that the annotated examples correspond to known identifiers in the knowledge base. 
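This script — like `train_intent_parser.py`, `train_ner.py`, `train_new_entity_type.py`, `train_parser.py`, `train_tagger.py` and `train_textcat.py` further below — now passes each batch to `nlp.update` as a single argument, so the old `texts, annotations = zip(*batch)` unpacking disappears. A minimal sketch of the shared loop shape; `TRAIN_DATA` (a list of `(text, annotations)` pairs), `n_iter`, `nlp` and `optimizer` are assumed to be set up as in these scripts:

```python
import random
from spacy.util import minibatch, compounding

for itn in range(n_iter):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
        # Single batch argument replaces the old (texts, annotations) pair
        nlp.update(batch, drop=0.2, sgd=optimizer, losses=losses)
    print("Losses", losses)
```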
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings() TRAIN_DOCS = [] for text, annotation in TRAIN_DATA: @@ -118,16 +118,15 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): with nlp.disable_pipes(*other_pipes): # only train entity linker # reset and initialize the weights randomly optimizer = nlp.begin_training() + for itn in range(n_iter): random.shuffle(TRAIN_DOCS) losses = {} # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) nlp.update( - texts, # batch of texts - annotations, # batch of annotations + batch, drop=0.2, # dropout - make it harder to memorise data losses=losses, sgd=optimizer, diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index d2472b6b9..bfec23d09 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -134,8 +134,7 @@ def main(model=None, output_dir=None, n_iter=15): # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, losses=losses) + nlp.update(batch, sgd=optimizer, losses=losses) print("Losses", losses) # test the trained model diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 01bb6a67b..d5d034616 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -68,10 +68,8 @@ def main(model=None, output_dir=None, n_iter=100): # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) nlp.update( - texts, # batch of texts - annotations, # batch of annotations + batch, drop=0.5, # dropout - make it harder to memorise data losses=losses, ) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 72d33ad50..47420e524 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -105,8 +105,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): batches = minibatch(TRAIN_DATA, size=sizes) losses = {} for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses) + nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses) print("Losses", losses) # test the trained model diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index c5adb0dec..7bb3e8586 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -75,8 +75,7 @@ def main(model=None, output_dir=None, n_iter=15): # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, losses=losses) + nlp.update(batch, sgd=optimizer, losses=losses) print("Losses", losses) # test the trained model diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 7136273b3..06e05f6cd 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -65,8 +65,7 @@ def main(lang="en", output_dir=None, n_iter=25): # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, 
size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, losses=losses) + nlp.update(batch, sgd=optimizer, losses=losses) print("Losses", losses) # test the trained model diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 456ef098c..4d402e04d 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -10,10 +10,11 @@ see the documentation: Compatible with: spaCy v2.0.0+ """ from __future__ import unicode_literals, print_function + +import ml_datasets import plac import random from pathlib import Path -import thinc.extra.datasets import spacy from spacy.util import minibatch, compounding @@ -83,8 +84,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None random.shuffle(train_data) batches = minibatch(train_data, size=batch_sizes) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses) + nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) @@ -117,7 +117,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None def load_data(limit=0, split=0.8): """Load data from the IMDB dataset.""" # Partition off part of the train data for evaluation - train_data, _ = thinc.extra.datasets.imdb() + train_data, _ = ml_datasets.imdb() random.shuffle(train_data) train_data = train_data[-limit:] texts, labels = zip(*train_data) diff --git a/fabfile.py b/fabfile.py index fcab493f5..760c2c0e2 100644 --- a/fabfile.py +++ b/fabfile.py @@ -1,9 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals, print_function - import contextlib from pathlib import Path -from fabric.api import local, lcd, env, settings, prefix +from fabric.api import local, lcd from os import path, environ import shutil import sys @@ -82,9 +79,7 @@ def pex(): with virtualenv(VENV_DIR) as venv_local: with lcd(path.dirname(__file__)): sha = local("git rev-parse --short HEAD", capture=True) - venv_local( - "pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True - ) + venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True) def clean(): diff --git a/requirements.txt b/requirements.txt index 4f0579313..bb6bf9804 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,21 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==7.4.0.dev0 +thinc==8.0.0a0 blis>=0.4.0,<0.5.0 +ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 -srsly>=1.0.1,<1.1.0 +srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 -pathlib==1.0.1; python_version < "3.4" tqdm>=4.38.0,<5.0.0 # Optional dependencies jsonschema>=2.6.0,<3.1.0 +pydantic>=1.0.0,<2.0.0 # Development dependencies cython>=0.25 pytest>=4.6.5 diff --git a/setup.cfg b/setup.cfg index 55396e011..980269c35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,10 +16,7 @@ classifiers = Operating System :: MacOS :: MacOS X Operating System :: Microsoft :: Windows Programming Language :: Cython - Programming Language :: Python :: 2 - Programming Language :: Python :: 2.7 Programming Language :: Python :: 3 - Programming Language :: Python :: 3.5 Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 
Programming Language :: Python :: 3.8 @@ -30,32 +27,35 @@ zip_safe = false include_package_data = true scripts = bin/spacy -python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* +python_requires = >=3.6 setup_requires = wheel cython>=0.25 + numpy>=1.15.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==7.4.0.dev0 + thinc==8.0.0a0 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==7.4.0.dev0 + thinc==8.0.0a0 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 - srsly>=1.0.1,<1.1.0 + srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 + ml_datasets # Third-party dependencies tqdm>=4.38.0,<5.0.0 setuptools numpy>=1.15.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 - pathlib==1.0.1; python_version < "3.4" + pydantic>=1.3.0,<2.0.0 + tqdm>=4.38.0,<5.0.0 [options.extras_require] lookups = diff --git a/setup.py b/setup.py index 1156e7cde..31f22ba3f 100755 --- a/setup.py +++ b/setup.py @@ -1,37 +1,23 @@ #!/usr/bin/env python -from __future__ import print_function -import io -import os -import subprocess import sys -import contextlib from distutils.command.build_ext import build_ext from distutils.sysconfig import get_python_inc import distutils.util from distutils import ccompiler, msvccompiler from setuptools import Extension, setup, find_packages +import numpy +from pathlib import Path +from Cython.Build import cythonize +from Cython.Compiler import Options -def is_new_osx(): - """Check whether we're on OSX >= 10.10""" - name = distutils.util.get_platform() - if sys.platform != "darwin": - return False - elif name.startswith("macosx-10"): - minor_version = int(name.split("-")[1].split(".")[1]) - if minor_version >= 7: - return True - else: - return False - else: - return False +# Preserve `__doc__` on functions and classes +# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options +Options.docstrings = True PACKAGES = find_packages() - - MOD_NAMES = [ - "spacy._align", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", @@ -63,16 +49,32 @@ MOD_NAMES = [ "spacy.symbols", "spacy.vectors", ] - - COMPILE_OPTIONS = { "msvc": ["/Ox", "/EHsc"], "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"], "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"], } - - LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []} +COMPILER_DIRECTIVES = { + "language_level": -3, + "embedsignature": True, + "annotation_typing": False, +} + + +def is_new_osx(): + """Check whether we're on OSX >= 10.10""" + name = distutils.util.get_platform() + if sys.platform != "darwin": + return False + elif name.startswith("macosx-10"): + minor_version = int(name.split("-")[1].split(".")[1]) + if minor_version >= 7: + return True + else: + return False + else: + return False if is_new_osx(): @@ -105,95 +107,50 @@ class build_ext_subclass(build_ext, build_ext_options): build_ext.build_extensions(self) -def generate_cython(root, source): - print("Cythonizing sources") - p = subprocess.call( - [sys.executable, os.path.join(root, "bin", "cythonize.py"), source], - env=os.environ, - ) - if p != 0: - raise RuntimeError("Running cythonize failed") - - -def is_source_release(path): - return os.path.exists(os.path.join(path, "PKG-INFO")) - - def clean(path): - for name in MOD_NAMES: - name = name.replace(".", "/") - for ext in [".so", ".html", ".cpp", ".c"]: - file_path = os.path.join(path, name + ext) - if os.path.exists(file_path): - 
os.unlink(file_path) - - -@contextlib.contextmanager -def chdir(new_dir): - old_dir = os.getcwd() - try: - os.chdir(new_dir) - sys.path.insert(0, new_dir) - yield - finally: - del sys.path[0] - os.chdir(old_dir) + for path in path.glob("**/*"): + if path.is_file() and path.suffix in (".so", ".cpp"): + print(f"Deleting {path.name}") + path.unlink() def setup_package(): - root = os.path.abspath(os.path.dirname(__file__)) + root = Path(__file__).parent if len(sys.argv) > 1 and sys.argv[1] == "clean": - return clean(root) + return clean(root / "spacy") - with chdir(root): - with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f: - about = {} - exec(f.read(), about) + with (root / "spacy" / "about.py").open("r") as f: + about = {} + exec(f.read(), about) - include_dirs = [ - get_python_inc(plat_specific=True), - os.path.join(root, "include"), - ] + include_dirs = [ + get_python_inc(plat_specific=True), + numpy.get_include(), + str(root / "include"), + ] + if ( + ccompiler.new_compiler().compiler_type == "msvc" + and msvccompiler.get_build_version() == 9 + ): + include_dirs.append(str(root / "include" / "msvc9")) + ext_modules = [] + for name in MOD_NAMES: + mod_path = name.replace(".", "/") + ".pyx" + ext = Extension(name, [mod_path], language="c++") + ext_modules.append(ext) + print("Cythonizing sources") + ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES) - if ( - ccompiler.new_compiler().compiler_type == "msvc" - and msvccompiler.get_build_version() == 9 - ): - include_dirs.append(os.path.join(root, "include", "msvc9")) - - ext_modules = [] - for mod_name in MOD_NAMES: - mod_path = mod_name.replace(".", "/") + ".cpp" - extra_link_args = [] - # ??? - # Imported from patch from @mikepb - # See Issue #267. Running blind here... - if sys.platform == "darwin": - dylib_path = [".." for _ in range(mod_name.count("."))] - dylib_path = "/".join(dylib_path) - dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path - extra_link_args.append("-Wl,-rpath,%s" % dylib_path) - ext_modules.append( - Extension( - mod_name, - [mod_path], - language="c++", - include_dirs=include_dirs, - extra_link_args=extra_link_args, - ) - ) - - if not is_source_release(root): - generate_cython(root, "spacy") - - setup( - name="spacy", - packages=PACKAGES, - version=about["__version__"], - ext_modules=ext_modules, - cmdclass={"build_ext": build_ext_subclass}, - ) + setup( + name="spacy", + packages=PACKAGES, + version=about["__version__"], + ext_modules=ext_modules, + cmdclass={"build_ext": build_ext_subclass}, + include_dirs=include_dirs, + package_data={"": ["*.pyx", "*.pxd", "*.pxi", "*.cpp"]}, + ) if __name__ == "__main__": diff --git a/spacy/__init__.py b/spacy/__init__.py index 4a0d16a49..2c063ce24 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals import warnings import sys @@ -7,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed") warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # These are imported as part of the API -from thinc.neural.util import prefer_gpu, require_gpu +from thinc.api import prefer_gpu, require_gpu from . 
import pipeline from .cli.info import info as cli_info @@ -23,6 +21,9 @@ if sys.maxunicode == 65535: raise SystemError(Errors.E130) +config = registry + + def load(name, **overrides): depr_path = overrides.get("path") if depr_path not in (True, False, None): diff --git a/spacy/__main__.py b/spacy/__main__.py index 2c285095e..71ab1a91a 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,21 +1,17 @@ -# coding: utf8 -from __future__ import print_function - -# NB! This breaks in plac on Python 2!! -# from __future__ import unicode_literals - if __name__ == "__main__": import plac import sys from wasabi import msg from spacy.cli import download, link, info, package, train, pretrain, convert from spacy.cli import init_model, profile, evaluate, validate, debug_data + from spacy.cli import train_from_config_cli commands = { "download": download, "link": link, "info": info, "train": train, + "train-from-config": train_from_config_cli, "pretrain": pretrain, "debug-data": debug_data, "evaluate": evaluate, @@ -28,9 +24,9 @@ if __name__ == "__main__": if len(sys.argv) == 1: msg.info("Available commands", ", ".join(commands), exits=1) command = sys.argv.pop(1) - sys.argv[0] = "spacy %s" % command + sys.argv[0] = f"spacy {command}" if command in commands: plac.call(commands[command], sys.argv[1:]) else: - available = "Available: {}".format(", ".join(commands)) - msg.fail("Unknown command: {}".format(command), available, exits=1) + available = f"Available: {', '.join(commands)}" + msg.fail(f"Unknown command: {command}", available, exits=1) diff --git a/spacy/_align.pyx b/spacy/_align.pyx deleted file mode 100644 index 6786ec7ba..000000000 --- a/spacy/_align.pyx +++ /dev/null @@ -1,255 +0,0 @@ -# cython: infer_types=True -'''Do Levenshtein alignment, for evaluation of tokenized input. - -Random notes: - - r i n g - 0 1 2 3 4 -r 1 0 1 2 3 -a 2 1 1 2 3 -n 3 2 2 1 2 -g 4 3 3 2 1 - -0,0: (1,1)=min(0+0,1+1,1+1)=0 S -1,0: (2,1)=min(1+1,0+1,2+1)=1 D -2,0: (3,1)=min(2+1,3+1,1+1)=2 D -3,0: (4,1)=min(3+1,4+1,2+1)=3 D -0,1: (1,2)=min(1+1,2+1,0+1)=1 D -1,1: (2,2)=min(0+1,1+1,1+1)=1 S -2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I -3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I -0,2: (1,3)=min(2+1,3+1,1+1)=2 I -1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I -2,2: (3,3) -3,2: (4,3) -At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?" - -We know the costs to transition: - -S[:i] -> T[:j] (at D[i,j]) -S[:i+1] -> T[:j] (at D[i+1,j]) -S[:i] -> T[:j+1] (at D[i,j+1]) - -Further, now we can transform: -S[:i+1] -> S[:i] (DEL) for 1, -T[:j+1] -> T[:j] (INS) for 1. -S[i+1] -> T[j+1] (SUB) for 0 or 1 - -Therefore we have the costs: -SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->S[j]) -i.e. D[i, j] + S[i+1] != T[j+1] -INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j]) -i.e. D[i+1,j] + 1 -DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) -i.e. 
D[i,j+1] + 1 - - Source string S has length m, with index i - Target string T has length n, with index j - - Output two alignment vectors: i2j (length m) and j2i (length n) - # function LevenshteinDistance(char s[1..m], char t[1..n]): - # for all i and j, d[i,j] will hold the Levenshtein distance between - # the first i characters of s and the first j characters of t - # note that d has (m+1)*(n+1) values - # set each element in d to zero - ring rang - - r i n g - - 0 0 0 0 0 - r 0 0 0 0 0 - a 0 0 0 0 0 - n 0 0 0 0 0 - g 0 0 0 0 0 - - # source prefixes can be transformed into empty string by - # dropping all characters - # d[i, 0] := i - ring rang - - r i n g - - 0 0 0 0 0 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - - # target prefixes can be reached from empty source prefix - # by inserting every character - # d[0, j] := j - - r i n g - - 0 1 2 3 4 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - -''' -from __future__ import unicode_literals -from libc.stdint cimport uint32_t -import numpy -cimport numpy as np -from .compat import unicode_ -from murmurhash.mrmr cimport hash32 - - -def align(S, T): - cdef int m = len(S) - cdef int n = len(T) - cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32') - cdef np.ndarray i2j = numpy.zeros((m,), dtype='i') - cdef np.ndarray j2i = numpy.zeros((n,), dtype='i') - - cdef np.ndarray S_arr = _convert_sequence(S) - cdef np.ndarray T_arr = _convert_sequence(T) - - fill_matrix(matrix.data, - S_arr.data, m, T_arr.data, n) - fill_i2j(i2j, matrix) - fill_j2i(j2i, matrix) - for i in range(i2j.shape[0]): - if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]): - i2j[i] = -1 - for j in range(j2i.shape[0]): - if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]): - j2i[j] = -1 - return matrix[-1,-1], i2j, j2i, matrix - - -def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths): - '''Let's say we had: - - Guess: [aa bb cc dd] - Truth: [aa bbcc dd] - i2j: [0, None, -2, 2] - j2i: [0, -2, 3] - - We want: - - i2j_multi: {1: 1, 2: 1} - j2i_multi: {} - ''' - i2j_miss = _get_regions(i2j, i_lengths) - j2i_miss = _get_regions(j2i, j_lengths) - - i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths) - return i2j_multi, j2i_multi - - -def _get_regions(alignment, lengths): - regions = {} - start = None - offset = 0 - for i in range(len(alignment)): - if alignment[i] < 0: - if start is None: - start = offset - regions.setdefault(start, []) - regions[start].append(i) - else: - start = None - offset += lengths[i] - return regions - - -def _get_mapping(miss1, miss2, lengths1, lengths2): - i2j = {} - j2i = {} - for start, region1 in miss1.items(): - if not region1 or start not in miss2: - continue - region2 = miss2[start] - if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2): - j = region2.pop(0) - buff = [] - # Consume tokens from region 1, until we meet the length of the - # first token in region2. If we do, align the tokens. If - # we exceed the length, break. 
- while region1: - buff.append(region1.pop(0)) - if sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - j += 1 - buff = [] - elif sum(lengths1[i] for i in buff) > lengths2[j]: - break - else: - if buff and sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - return i2j, j2i - - -def _convert_sequence(seq): - if isinstance(seq, numpy.ndarray): - return numpy.ascontiguousarray(seq, dtype='uint32_t') - cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32') - cdef bytes item_bytes - for i, item in enumerate(seq): - if item == "``": - item = '"' - elif item == "''": - item = '"' - if isinstance(item, unicode): - item_bytes = item.encode('utf8') - else: - item_bytes = item - output[i] = hash32(item_bytes, len(item_bytes), 0) - return output - - -cdef void fill_matrix(int* D, - const int* S, int m, const int* T, int n) nogil: - m1 = m+1 - n1 = n+1 - for i in range(m1*n1): - D[i] = 0 - - for i in range(m1): - D[i*n1] = i - - for j in range(n1): - D[j] = j - - cdef int sub_cost, ins_cost, del_cost - for j in range(n): - for i in range(m): - i_j = i*n1 + j - i1_j1 = (i+1)*n1 + j+1 - i1_j = (i+1)*n1 + j - i_j1 = i*n1 + j+1 - if S[i] != T[j]: - sub_cost = D[i_j] + 1 - else: - sub_cost = D[i_j] - del_cost = D[i_j1] + 1 - ins_cost = D[i1_j] + 1 - best = min(min(sub_cost, ins_cost), del_cost) - D[i1_j1] = best - - -cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *: - j = D.shape[1]-2 - cdef int i = D.shape[0]-2 - while i >= 0: - while D[i+1, j] < D[i+1, j+1]: - j -= 1 - if D[i, j+1] < D[i+1, j+1]: - i2j[i] = -1 - else: - i2j[i] = j - j -= 1 - i -= 1 - -cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *: - i = D.shape[0]-2 - cdef int j = D.shape[1]-2 - while j >= 0: - while D[i, j+1] < D[i+1, j+1]: - i -= 1 - if D[i+1, j] < D[i+1, j+1]: - j2i[j] = -1 - else: - j2i[j] = i - i -= 1 - j -= 1 diff --git a/spacy/_ml.py b/spacy/_ml.py index 8695a88cc..e69de29bb 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -1,985 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import numpy -from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu -from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.t2v import Pooling, sum_pool, mean_pool -from thinc.i2v import HashEmbed -from thinc.misc import Residual, FeatureExtracter -from thinc.misc import LayerNorm as LN -from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.api import with_getitem, flatten_add_lengths -from thinc.api import uniqued, wrap, noop -from thinc.linear.linear import LinearModel -from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.util import get_array_module, copy_array -from thinc.neural.optimizers import Adam - -from thinc import describe -from thinc.describe import Dimension, Synapses, Biases, Gradient -from thinc.neural._classes.affine import _set_dimensions_if_needed -import thinc.extra.load_nlp - -from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE -from .errors import Errors, user_warning, Warnings -from . import util -from . 
import ml as new_ml -from .ml import _legacy_tok2vec - - -VECTORS_KEY = "spacy_pretrained_vectors" -# Backwards compatibility with <2.2.2 -USE_MODEL_REGISTRY_TOK2VEC = False - - -def cosine(vec1, vec2): - xp = get_array_module(vec1) - norm1 = xp.linalg.norm(vec1) - norm2 = xp.linalg.norm(vec2) - if norm1 == 0.0 or norm2 == 0.0: - return 0 - else: - return vec1.dot(vec2) / (norm1 * norm2) - - -def create_default_optimizer(ops, **cfg): - learn_rate = util.env_opt("learn_rate", 0.001) - beta1 = util.env_opt("optimizer_B1", 0.9) - beta2 = util.env_opt("optimizer_B2", 0.999) - eps = util.env_opt("optimizer_eps", 1e-8) - L2 = util.env_opt("L2_penalty", 1e-6) - max_grad_norm = util.env_opt("grad_norm_clip", 1.0) - optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps) - optimizer.max_grad_norm = max_grad_norm - optimizer.device = ops.device - return optimizer - - -@layerize -def _flatten_add_lengths(seqs, pad=0, drop=0.0): - ops = Model.ops - lengths = ops.asarray([len(seq) for seq in seqs], dtype="i") - - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths, pad=pad) - - X = ops.flatten(seqs, pad=pad) - return (X, lengths), finish_update - - -def _zero_init(model): - def _zero_init_impl(self, *args, **kwargs): - self.W.fill(0) - - model.on_init_hooks.append(_zero_init_impl) - if model.W is not None: - model.W.fill(0.0) - return model - - -def with_cpu(ops, model): - """Wrap a model that should run on CPU, transferring inputs and outputs - as necessary.""" - model.to_cpu() - - def with_cpu_forward(inputs, drop=0.0): - cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop) - gpu_outputs = _to_device(ops, cpu_outputs) - - def with_cpu_backprop(d_outputs, sgd=None): - cpu_d_outputs = _to_cpu(d_outputs) - return backprop(cpu_d_outputs, sgd=sgd) - - return gpu_outputs, with_cpu_backprop - - return wrap(with_cpu_forward, model) - - -def _to_cpu(X): - if isinstance(X, numpy.ndarray): - return X - elif isinstance(X, tuple): - return tuple([_to_cpu(x) for x in X]) - elif isinstance(X, list): - return [_to_cpu(x) for x in X] - elif hasattr(X, "get"): - return X.get() - else: - return X - - -def _to_device(ops, X): - if isinstance(X, tuple): - return tuple([_to_device(ops, x) for x in X]) - elif isinstance(X, list): - return [_to_device(ops, x) for x in X] - else: - return ops.asarray(X) - - -class extract_ngrams(Model): - def __init__(self, ngram_size, attr=LOWER): - Model.__init__(self) - self.ngram_size = ngram_size - self.attr = attr - - def begin_update(self, docs, drop=0.0): - batch_keys = [] - batch_vals = [] - for doc in docs: - unigrams = doc.to_array([self.attr]) - ngrams = [unigrams] - for n in range(2, self.ngram_size + 1): - ngrams.append(self.ops.ngrams(n, unigrams)) - keys = self.ops.xp.concatenate(ngrams) - keys, vals = self.ops.xp.unique(keys, return_counts=True) - batch_keys.append(keys) - batch_vals.append(vals) - # The dtype here matches what thinc is expecting -- which differs per - # platform (by int definition). This should be fixed once the problem - # is fixed on Thinc's side. 
- lengths = self.ops.asarray( - [arr.shape[0] for arr in batch_keys], dtype=numpy.int_ - ) - batch_keys = self.ops.xp.concatenate(batch_keys) - batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f") - return (batch_keys, batch_vals, lengths), None - - -@describe.on_data( - _set_dimensions_if_needed, lambda model, X, y: model.init_weights(model) -) -@describe.attributes( - nI=Dimension("Input size"), - nF=Dimension("Number of features"), - nO=Dimension("Output size"), - nP=Dimension("Maxout pieces"), - W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)), - b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)), - pad=Synapses( - "Pad", - lambda obj: (1, obj.nF, obj.nO, obj.nP), - lambda M, ops: ops.normal_init(M, 1.0), - ), - d_W=Gradient("W"), - d_pad=Gradient("pad"), - d_b=Gradient("b"), -) -class PrecomputableAffine(Model): - def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs): - Model.__init__(self, **kwargs) - self.nO = nO - self.nP = nP - self.nI = nI - self.nF = nF - - def begin_update(self, X, drop=0.0): - Yf = self.ops.gemm( - X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True - ) - Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP)) - Yf = self._add_padding(Yf) - - def backward(dY_ids, sgd=None): - dY, ids = dY_ids - dY, ids = self._backprop_padding(dY, ids) - Xf = X[ids] - Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI)) - - self.d_b += dY.sum(axis=0) - dY = dY.reshape((dY.shape[0], self.nO * self.nP)) - - Wopfi = self.W.transpose((1, 2, 0, 3)) - Wopfi = self.ops.xp.ascontiguousarray(Wopfi) - Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI)) - dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi) - - # Reuse the buffer - dWopfi = Wopfi - dWopfi.fill(0.0) - self.ops.gemm(dY, Xf, out=dWopfi, trans1=True) - dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI)) - # (o, p, f, i) --> (f, o, p, i) - self.d_W += dWopfi.transpose((2, 0, 1, 3)) - - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return dXf.reshape((dXf.shape[0], self.nF, self.nI)) - - return Yf, backward - - def _add_padding(self, Yf): - Yf_padded = self.ops.xp.vstack((self.pad, Yf)) - return Yf_padded - - def _backprop_padding(self, dY, ids): - # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0 - mask = ids < 0.0 - mask = mask.sum(axis=1) - d_pad = dY * mask.reshape((ids.shape[0], 1, 1)) - self.d_pad += d_pad.sum(axis=0) - return dY, ids - - @staticmethod - def init_weights(model): - """This is like the 'layer sequential unit variance', but instead - of taking the actual inputs, we randomly generate whitened data. - - Why's this all so complicated? We have a huge number of inputs, - and the maxout unit makes guessing the dynamics tricky. Instead - we set the maxout weights to values that empirically result in - whitened outputs given whitened inputs. - """ - if (model.W ** 2).sum() != 0.0: - return - ops = model.ops - xp = ops.xp - ops.normal_init(model.W, model.nF * model.nI, inplace=True) - - ids = ops.allocate((5000, model.nF), dtype="f") - ids += xp.random.uniform(0, 1000, ids.shape) - ids = ops.asarray(ids, dtype="i") - tokvecs = ops.allocate((5000, model.nI), dtype="f") - tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape( - tokvecs.shape - ) - - def predict(ids, tokvecs): - # nS ids. nW tokvecs. Exclude the padding array. 
- hiddens = model(tokvecs[:-1]) # (nW, f, o, p) - vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f") - # need nS vectors - hiddens = hiddens.reshape( - (hiddens.shape[0] * model.nF, model.nO * model.nP) - ) - model.ops.scatter_add(vectors, ids.flatten(), hiddens) - vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP)) - vectors += model.b - vectors = model.ops.asarray(vectors) - if model.nP >= 2: - return model.ops.maxout(vectors)[0] - else: - return vectors * (vectors >= 0) - - tol_var = 0.01 - tol_mean = 0.01 - t_max = 10 - t_i = 0 - for t_i in range(t_max): - acts1 = predict(ids, tokvecs) - var = model.ops.xp.var(acts1) - mean = model.ops.xp.mean(acts1) - if abs(var - 1.0) >= tol_var: - model.W /= model.ops.xp.sqrt(var) - elif abs(mean) >= tol_mean: - model.b -= mean - else: - break - - -def link_vectors_to_models(vocab): - vectors = vocab.vectors - if vectors.name is None: - vectors.name = VECTORS_KEY - if vectors.data.size != 0: - user_warning(Warnings.W020.format(shape=vectors.data.shape)) - ops = Model.ops - for word in vocab: - if word.orth in vectors.key2row: - word.rank = vectors.key2row[word.orth] - else: - word.rank = 0 - data = ops.asarray(vectors.data) - # Set an entry here, so that vectors are accessed by StaticVectors - # (unideal, I know) - key = (ops.device, vectors.name) - if key in thinc.extra.load_nlp.VECTORS: - if thinc.extra.load_nlp.VECTORS[key].shape != data.shape: - # This is a hack to avoid the problem in #3853. Maybe we should - # print a warning as well? - old_name = vectors.name - new_name = vectors.name + "_%d" % data.shape[0] - user_warning(Warnings.W019.format(old=old_name, new=new_name)) - vectors.name = new_name - key = (ops.device, vectors.name) - thinc.extra.load_nlp.VECTORS[key] = data - - -def PyTorchBiLSTM(nO, nI, depth, dropout=0.2): - import torch.nn - from thinc.api import with_square_sequences - from thinc.extra.wrappers import PyTorchWrapperRNN - - if depth == 0: - return layerize(noop()) - model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout) - return with_square_sequences(PyTorchWrapperRNN(model)) - - -def Tok2Vec(width, embed_size, **kwargs): - if not USE_MODEL_REGISTRY_TOK2VEC: - # Preserve prior tok2vec for backwards compat, in v2.2.2 - return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs) - pretrained_vectors = kwargs.get("pretrained_vectors", None) - cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3) - subword_features = kwargs.get("subword_features", True) - char_embed = kwargs.get("char_embed", False) - conv_depth = kwargs.get("conv_depth", 4) - bilstm_depth = kwargs.get("bilstm_depth", 0) - conv_window = kwargs.get("conv_window", 1) - - cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] - - doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}} - if char_embed: - embed_cfg = { - "arch": "spacy.CharacterEmbed.v1", - "config": { - "width": 64, - "chars": 6, - "@mix": { - "arch": "spacy.LayerNormalizedMaxout.v1", - "config": {"width": width, "pieces": 3}, - }, - "@embed_features": None, - }, - } - else: - embed_cfg = { - "arch": "spacy.MultiHashEmbed.v1", - "config": { - "width": width, - "rows": embed_size, - "columns": cols, - "use_subwords": subword_features, - "@pretrained_vectors": None, - "@mix": { - "arch": "spacy.LayerNormalizedMaxout.v1", - "config": {"width": width, "pieces": 3}, - }, - }, - } - if pretrained_vectors: - embed_cfg["config"]["@pretrained_vectors"] = { - "arch": "spacy.PretrainedVectors.v1", - "config": { - 
"vectors_name": pretrained_vectors, - "width": width, - "column": cols.index("ID"), - }, - } - if cnn_maxout_pieces >= 2: - cnn_cfg = { - "arch": "spacy.MaxoutWindowEncoder.v1", - "config": { - "width": width, - "window_size": conv_window, - "pieces": cnn_maxout_pieces, - "depth": conv_depth, - }, - } - else: - cnn_cfg = { - "arch": "spacy.MishWindowEncoder.v1", - "config": {"width": width, "window_size": conv_window, "depth": conv_depth}, - } - bilstm_cfg = { - "arch": "spacy.TorchBiLSTMEncoder.v1", - "config": {"width": width, "depth": bilstm_depth}, - } - if conv_depth == 0 and bilstm_depth == 0: - encode_cfg = {} - elif conv_depth >= 1 and bilstm_depth >= 1: - encode_cfg = { - "arch": "thinc.FeedForward.v1", - "config": {"children": [cnn_cfg, bilstm_cfg]}, - } - elif conv_depth >= 1: - encode_cfg = cnn_cfg - else: - encode_cfg = bilstm_cfg - config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg} - return new_ml.Tok2Vec(config) - - -def reapply(layer, n_times): - def reapply_fwd(X, drop=0.0): - backprops = [] - for i in range(n_times): - Y, backprop = layer.begin_update(X, drop=drop) - X = Y - backprops.append(backprop) - - def reapply_bwd(dY, sgd=None): - dX = None - for backprop in reversed(backprops): - dY = backprop(dY, sgd=sgd) - if dX is None: - dX = dY - else: - dX += dY - return dX - - return Y, reapply_bwd - - return wrap(reapply_fwd, layer) - - -def asarray(ops, dtype): - def forward(X, drop=0.0): - return ops.asarray(X, dtype=dtype), None - - return layerize(forward) - - -def _divide_array(X, size): - parts = [] - index = 0 - while index < len(X): - parts.append(X[index : index + size]) - index += size - return parts - - -def get_col(idx): - if idx < 0: - raise IndexError(Errors.E066.format(value=idx)) - - def forward(X, drop=0.0): - if isinstance(X, numpy.ndarray): - ops = NumpyOps() - else: - ops = CupyOps() - output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) - - def backward(y, sgd=None): - dX = ops.allocate(X.shape) - dX[:, idx] += y - return dX - - return output, backward - - return layerize(forward) - - -def doc2feats(cols=None): - if cols is None: - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - - def forward(docs, drop=0.0): - feats = [] - for doc in docs: - feats.append(doc.to_array(cols)) - return feats, None - - model = layerize(forward) - model.cols = cols - return model - - -def print_shape(prefix): - def forward(X, drop=0.0): - return X, lambda dX, **kwargs: dX - - return layerize(forward) - - -@layerize -def get_token_vectors(tokens_attrs_vectors, drop=0.0): - tokens, attrs, vectors = tokens_attrs_vectors - - def backward(d_output, sgd=None): - return (tokens, d_output) - - return vectors, backward - - -@layerize -def logistic(X, drop=0.0): - xp = get_array_module(X) - if not isinstance(X, xp.ndarray): - X = xp.asarray(X) - # Clip to range (-10, 10) - X = xp.minimum(X, 10.0, X) - X = xp.maximum(X, -10.0, X) - Y = 1.0 / (1.0 + xp.exp(-X)) - - def logistic_bwd(dY, sgd=None): - dX = dY * (Y * (1 - Y)) - return dX - - return Y, logistic_bwd - - -def zero_init(model): - def _zero_init_impl(self, X, y): - self.W.fill(0) - - model.on_data_hooks.append(_zero_init_impl) - return model - - -def getitem(i): - def getitem_fwd(X, drop=0.0): - return X[i], None - - return layerize(getitem_fwd) - - -@describe.attributes( - W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None) -) -class MultiSoftmax(Affine): - """Neural network layer that predicts several multi-class attributes at once. 
- For instance, we might predict one class with 6 variables, and another with 5. - We predict the 11 neurons required for this, and then softmax them such - that columns 0-6 make a probability distribution and coumns 6-11 make another. - """ - - name = "multisoftmax" - - def __init__(self, out_sizes, nI=None, **kwargs): - Model.__init__(self, **kwargs) - self.out_sizes = out_sizes - self.nO = sum(out_sizes) - self.nI = nI - - def predict(self, input__BI): - output__BO = self.ops.affine(self.W, self.b, input__BI) - i = 0 - for out_size in self.out_sizes: - self.ops.softmax(output__BO[:, i : i + out_size], inplace=True) - i += out_size - return output__BO - - def begin_update(self, input__BI, drop=0.0): - output__BO = self.predict(input__BI) - - def finish_update(grad__BO, sgd=None): - self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True) - self.d_b += grad__BO.sum(axis=0) - grad__BI = self.ops.gemm(grad__BO, self.W) - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return grad__BI - - return output__BO, finish_update - - -def build_tagger_model(nr_class, **cfg): - embed_size = util.env_opt("embed_size", 2000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] - else: - token_vector_width = util.env_opt("token_vector_width", 96) - pretrained_vectors = cfg.get("pretrained_vectors") - subword_features = cfg.get("subword_features", True) - with Model.define_operators({">>": chain, "+": add}): - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - tok2vec = Tok2Vec( - token_vector_width, - embed_size, - subword_features=subword_features, - pretrained_vectors=pretrained_vectors, - ) - softmax = with_flatten(Softmax(nr_class, token_vector_width)) - model = tok2vec >> softmax - model.nI = None - model.tok2vec = tok2vec - model.softmax = softmax - return model - - -def build_morphologizer_model(class_nums, **cfg): - embed_size = util.env_opt("embed_size", 7000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] - else: - token_vector_width = util.env_opt("token_vector_width", 128) - pretrained_vectors = cfg.get("pretrained_vectors") - char_embed = cfg.get("char_embed", True) - with Model.define_operators({">>": chain, "+": add, "**": clone}): - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - tok2vec = Tok2Vec( - token_vector_width, - embed_size, - char_embed=char_embed, - pretrained_vectors=pretrained_vectors, - ) - softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) - softmax.out_sizes = class_nums - model = tok2vec >> softmax - model.nI = None - model.tok2vec = tok2vec - model.softmax = softmax - return model - - -@layerize -def SpacyVectors(docs, drop=0.0): - batch = [] - for doc in docs: - indices = numpy.zeros((len(doc),), dtype="i") - for i, word in enumerate(doc): - if word.orth in doc.vocab.vectors.key2row: - indices[i] = doc.vocab.vectors.key2row[word.orth] - else: - indices[i] = 0 - vectors = doc.vocab.vectors.data[indices] - batch.append(vectors) - return batch, None - - -def build_text_classifier(nr_class, width=64, **cfg): - depth = cfg.get("depth", 2) - nr_vector = cfg.get("nr_vector", 5000) - pretrained_dims = cfg.get("pretrained_dims", 0) - with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}): - if cfg.get("low_data") and pretrained_dims: - model = ( - SpacyVectors - >> flatten_add_lengths - >> with_getitem(0, Affine(width, pretrained_dims)) - >> ParametricAttention(width) - >> Pooling(sum_pool) - >> Residual(ReLu(width, 
width)) ** 2 - >> zero_init(Affine(nr_class, width, drop_factor=0.0)) - >> logistic - ) - return model - - lower = HashEmbed(width, nr_vector, column=1) - prefix = HashEmbed(width // 2, nr_vector, column=2) - suffix = HashEmbed(width // 2, nr_vector, column=3) - shape = HashEmbed(width // 2, nr_vector, column=4) - - trained_vectors = FeatureExtracter( - [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] - ) >> with_flatten( - uniqued( - (lower | prefix | suffix | shape) - >> LN(Maxout(width, width + (width // 2) * 3)), - column=0, - ) - ) - - if pretrained_dims: - static_vectors = SpacyVectors >> with_flatten( - Affine(width, pretrained_dims) - ) - # TODO Make concatenate support lists - vectors = concatenate_lists(trained_vectors, static_vectors) - vectors_width = width * 2 - else: - vectors = trained_vectors - vectors_width = width - static_vectors = None - tok2vec = vectors >> with_flatten( - LN(Maxout(width, vectors_width)) - >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth, - pad=depth, - ) - cnn_model = ( - tok2vec - >> flatten_add_lengths - >> ParametricAttention(width) - >> Pooling(sum_pool) - >> Residual(zero_init(Maxout(width, width))) - >> zero_init(Affine(nr_class, width, drop_factor=0.0)) - ) - - linear_model = build_bow_text_classifier( - nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False - ) - if cfg.get("exclusive_classes"): - output_layer = Softmax(nr_class, nr_class * 2) - else: - output_layer = ( - zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic - ) - model = (linear_model | cnn_model) >> output_layer - model.tok2vec = chain(tok2vec, flatten) - model.nO = nr_class - model.lsuv = False - return model - - -def build_bow_text_classifier( - nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg -): - with Model.define_operators({">>": chain}): - model = with_cpu( - Model.ops, extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class) - ) - if not no_output_layer: - model = model >> (cpu_softmax if exclusive_classes else logistic) - model.nO = nr_class - return model - - -@layerize -def cpu_softmax(X, drop=0.0): - ops = NumpyOps() - - def cpu_softmax_backward(dY, sgd=None): - return dY - - return ops.softmax(X), cpu_softmax_backward - - -def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg): - """ - Build a simple CNN text classifier, given a token-to-vector model as inputs. - If exclusive_classes=True, a softmax non-linearity is applied, so that the - outputs sum to 1. If exclusive_classes=False, a logistic non-linearity - is applied instead, so that outputs are in the range [0, 1]. 
- """ - with Model.define_operators({">>": chain}): - if exclusive_classes: - output_layer = Softmax(nr_class, tok2vec.nO) - else: - output_layer = ( - zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic - ) - model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer - model.tok2vec = chain(tok2vec, flatten) - model.nO = nr_class - return model - - -def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): - if "entity_width" not in cfg: - raise ValueError(Errors.E144.format(param="entity_width")) - - conv_depth = cfg.get("conv_depth", 2) - cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) - pretrained_vectors = cfg.get("pretrained_vectors", None) - context_width = cfg.get("entity_width") - - with Model.define_operators({">>": chain, "**": clone}): - # context encoder - tok2vec = Tok2Vec( - width=hidden_width, - embed_size=embed_width, - pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, - subword_features=True, - conv_depth=conv_depth, - bilstm_depth=0, - ) - - model = ( - tok2vec - >> flatten_add_lengths - >> Pooling(mean_pool) - >> Residual(zero_init(Maxout(hidden_width, hidden_width))) - >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0)) - ) - - model.tok2vec = tok2vec - model.nO = context_width - return model - - -@layerize -def flatten(seqs, drop=0.0): - ops = Model.ops - lengths = ops.asarray([len(seq) for seq in seqs], dtype="i") - - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths, pad=0) - - X = ops.flatten(seqs, pad=0) - return X, finish_update - - -def concatenate_lists(*layers, **kwargs): # pragma: no cover - """Compose two or more models `f`, `g`, etc, such that their outputs are - concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - """ - if not layers: - return noop() - drop_factor = kwargs.get("drop_factor", 1.0) - ops = layers[0].ops - layers = [chain(layer, flatten) for layer in layers] - concat = concatenate(*layers) - - def concatenate_lists_fwd(Xs, drop=0.0): - if drop is not None: - drop *= drop_factor - lengths = ops.asarray([len(X) for X in Xs], dtype="i") - flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) - ys = ops.unflatten(flat_y, lengths) - - def concatenate_lists_bwd(d_ys, sgd=None): - return bp_flat_y(ops.flatten(d_ys), sgd=sgd) - - return ys, concatenate_lists_bwd - - model = wrap(concatenate_lists_fwd, concat) - return model - - -def masked_language_model(vocab, model, mask_prob=0.15): - """Convert a model into a BERT-style masked language model""" - - random_words = _RandomWords(vocab) - - def mlm_forward(docs, drop=0.0): - mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) - mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) - output, backprop = model.begin_update(docs, drop=drop) - - def mlm_backward(d_output, sgd=None): - d_output *= 1 - mask - return backprop(d_output, sgd=sgd) - - return output, mlm_backward - - return wrap(mlm_forward, model) - - -class _RandomWords(object): - def __init__(self, vocab): - self.words = [lex.text for lex in vocab if lex.prob != 0.0] - self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] - self.words = self.words[:10000] - self.probs = self.probs[:10000] - self.probs = numpy.exp(numpy.array(self.probs, dtype="f")) - self.probs /= self.probs.sum() - self._cache = [] - - def next(self): - if not self._cache: - self._cache.extend( - numpy.random.choice(len(self.words), 10000, p=self.probs) - ) - index = self._cache.pop() - return self.words[index] - - -def 
_apply_mask(docs, random_words, mask_prob=0.15): - # This needs to be here to avoid circular imports - from .tokens.doc import Doc - - N = sum(len(doc) for doc in docs) - mask = numpy.random.uniform(0.0, 1.0, (N,)) - mask = mask >= mask_prob - i = 0 - masked_docs = [] - for doc in docs: - words = [] - for token in doc: - if not mask[i]: - word = _replace_word(token.text, random_words) - else: - word = token.text - words.append(word) - i += 1 - spaces = [bool(w.whitespace_) for w in doc] - # NB: If you change this implementation to instead modify - # the docs in place, take care that the IDs reflect the original - # words. Currently we use the original docs to make the vectors - # for the target, so we don't lose the original tokens. But if - # you modified the docs in place here, you would. - masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces)) - return mask, masked_docs - - -def _replace_word(word, random_words, mask="[MASK]"): - roll = numpy.random.random() - if roll < 0.8: - return mask - elif roll < 0.9: - return random_words.next() - else: - return word - - -def _uniform_init(lo, hi): - def wrapped(W, ops): - copy_array(W, ops.xp.random.uniform(lo, hi, W.shape)) - - return wrapped - - -@describe.attributes( - nM=Dimension("Vector dimensions"), - nC=Dimension("Number of characters per word"), - vectors=Synapses( - "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1) - ), - d_vectors=Gradient("vectors"), -) -class CharacterEmbed(Model): - def __init__(self, nM=None, nC=None, **kwargs): - Model.__init__(self, **kwargs) - self.nM = nM - self.nC = nC - - @property - def nO(self): - return self.nM * self.nC - - @property - def nV(self): - return 256 - - def begin_update(self, docs, drop=0.0): - if not docs: - return [] - ids = [] - output = [] - weights = self.vectors - # This assists in indexing; it's like looping over this dimension. - # Still consider this weird witch craft...But thanks to Mark Neumann - # for the tip. - nCv = self.ops.xp.arange(self.nC) - for doc in docs: - doc_ids = doc.to_utf8_array(nr_char=self.nC) - doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM)) - # Let's say I have a 2d array of indices, and a 3d table of data. What numpy - # incantation do I chant to get - # output[i, j, k] == data[j, ids[i, j], k]? 
- doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]] - output.append(doc_vectors.reshape((len(doc), self.nO))) - ids.append(doc_ids) - - def backprop_character_embed(d_vectors, sgd=None): - gradient = self.d_vectors - for doc_ids, d_doc_vectors in zip(ids, d_vectors): - d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM)) - gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv] - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return None - - return output, backprop_character_embed - - -def get_cossim_loss(yh, y, ignore_zeros=False): - xp = get_array_module(yh) - # Find the zero vectors - if ignore_zeros: - zero_indices = xp.abs(y).sum(axis=1) == 0 - # Add a small constant to avoid 0 vectors - yh = yh + 1e-8 - y = y + 1e-8 - # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity - norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) - norm_y = xp.linalg.norm(y, axis=1, keepdims=True) - mul_norms = norm_yh * norm_y - cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms - d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2)) - losses = xp.abs(cosine - 1) - if ignore_zeros: - # If the target was a zero vector, don't count it in the loss. - d_yh[zero_indices] = 0 - losses[zero_indices] = 0 - loss = losses.sum() - return loss, -d_yh diff --git a/spacy/about.py b/spacy/about.py index a1880fb54..6a3c680ab 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.3" +__version__ = "3.0.0.dev3" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/analysis.py b/spacy/analysis.py index 761be3de9..ed6d6b18e 100644 --- a/spacy/analysis.py +++ b/spacy/analysis.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - -from collections import OrderedDict from wasabi import Printer from .tokens import Doc, Token, Span @@ -23,7 +19,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True): assert pipeline[index][0] == name prev_pipes = pipeline[:index] pipe_requires = getattr(pipe, "requires", []) - requires = OrderedDict([(annot, False) for annot in pipe_requires]) + requires = {annot: False for annot in pipe_requires} if requires: for prev_name, prev_pipe in prev_pipes: prev_assigns = getattr(prev_pipe, "assigns", []) @@ -98,15 +94,15 @@ def validate_attrs(values): for ext_attr, ext_value in value.items(): # We don't check whether the attribute actually exists if ext_value is not True: # attr is something like doc._.x.y - good = "{}._.{}".format(obj_key, ext_attr) - bad = "{}.{}".format(good, ".".join(ext_value)) + good = f"{obj_key}._.{ext_attr}" + bad = f"{good}.{'.'.join(ext_value)}" raise ValueError(Errors.E183.format(attr=bad, solution=good)) continue # we can't validate those further if attr.endswith("_"): # attr is something like "token.pos_" raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1])) if value is not True: # attr is something like doc.x.y - good = "{}.{}".format(obj_key, attr) - bad = "{}.{}".format(good, ".".join(value)) + good = f"{obj_key}.{attr}" + bad = f"{good}.{'.'.join(value)}" raise ValueError(Errors.E183.format(attr=bad, solution=good)) obj = objs[obj_key] if not hasattr(obj, attr): @@ -168,11 +164,10 @@ def print_summary(nlp, pretty=True, no_print=False): msg.table(overview, header=header, divider=True, multiline=True) 
n_problems = sum(len(p) for p in problems.values()) if any(p for p in problems.values()): - msg.divider("Problems ({})".format(n_problems)) + msg.divider(f"Problems ({n_problems})") for name, problem in problems.items(): if problem: - problem = ", ".join(problem) - msg.warn("'{}' requirements not met: {}".format(name, problem)) + msg.warn(f"'{name}' requirements not met: {', '.join(problem)}") else: msg.good("No problems found.") if no_print: diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 4cff4a415..8e31b9b1b 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -91,4 +91,5 @@ cdef enum attr_id_t: LANG ENT_KB_ID = symbols.ENT_KB_ID + MORPH ENT_ID = symbols.ENT_ID diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 51eb5c35b..40b86fc80 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - IDS = { "": NULL_ATTR, @@ -91,6 +88,7 @@ IDS = { "SPACY": SPACY, "PROB": PROB, "LANG": LANG, + "MORPH": MORPH, } diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 778453711..5f83b26c1 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,12 +1,21 @@ +from wasabi import msg + from .download import download # noqa: F401 from .info import info # noqa: F401 -from .link import link # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 from .train import train # noqa: F401 +from .train_from_config import train_from_config_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 + + +def link(*args, **kwargs): + msg.warn( + "As of spaCy v3.0, model symlinks are deprecated. You can load models " + "using their full names or from a directory path." 
+ ) diff --git a/spacy/cli/_schemas.py b/spacy/cli/_schemas.py deleted file mode 100644 index 3fb2c8979..000000000 --- a/spacy/cli/_schemas.py +++ /dev/null @@ -1,220 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -# NB: This schema describes the new format of the training data, see #2928 -TRAINING_SCHEMA = { - "$schema": "http://json-schema.org/draft-06/schema", - "title": "Training data for spaCy models", - "type": "array", - "items": { - "type": "object", - "properties": { - "text": { - "title": "The text of the training example", - "type": "string", - "minLength": 1, - }, - "ents": { - "title": "Named entity spans in the text", - "type": "array", - "items": { - "type": "object", - "properties": { - "start": { - "title": "Start character offset of the span", - "type": "integer", - "minimum": 0, - }, - "end": { - "title": "End character offset of the span", - "type": "integer", - "minimum": 0, - }, - "label": { - "title": "Entity label", - "type": "string", - "minLength": 1, - "pattern": "^[A-Z0-9]*$", - }, - }, - "required": ["start", "end", "label"], - }, - }, - "sents": { - "title": "Sentence spans in the text", - "type": "array", - "items": { - "type": "object", - "properties": { - "start": { - "title": "Start character offset of the span", - "type": "integer", - "minimum": 0, - }, - "end": { - "title": "End character offset of the span", - "type": "integer", - "minimum": 0, - }, - }, - "required": ["start", "end"], - }, - }, - "cats": { - "title": "Text categories for the text classifier", - "type": "object", - "patternProperties": { - "*": { - "title": "A text category", - "oneOf": [ - {"type": "boolean"}, - {"type": "number", "minimum": 0}, - ], - } - }, - "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1}, - }, - "tokens": { - "title": "The tokens in the text", - "type": "array", - "items": { - "type": "object", - "minProperties": 1, - "properties": { - "id": { - "title": "Token ID, usually token index", - "type": "integer", - "minimum": 0, - }, - "start": { - "title": "Start character offset of the token", - "type": "integer", - "minimum": 0, - }, - "end": { - "title": "End character offset of the token", - "type": "integer", - "minimum": 0, - }, - "pos": { - "title": "Coarse-grained part-of-speech tag", - "type": "string", - "minLength": 1, - }, - "tag": { - "title": "Fine-grained part-of-speech tag", - "type": "string", - "minLength": 1, - }, - "dep": { - "title": "Dependency label", - "type": "string", - "minLength": 1, - }, - "head": { - "title": "Index of the token's head", - "type": "integer", - "minimum": 0, - }, - }, - "required": ["start", "end"], - }, - }, - "_": {"title": "Custom user space", "type": "object"}, - }, - "required": ["text"], - }, -} - -META_SCHEMA = { - "$schema": "http://json-schema.org/draft-06/schema", - "type": "object", - "properties": { - "lang": { - "title": "Two-letter language code, e.g. 'en'", - "type": "string", - "minLength": 2, - "maxLength": 2, - "pattern": "^[a-z]*$", - }, - "name": { - "title": "Model name", - "type": "string", - "minLength": 1, - "pattern": "^[a-z_]*$", - }, - "version": { - "title": "Model version", - "type": "string", - "minLength": 1, - "pattern": "^[0-9a-z.-]*$", - }, - "spacy_version": { - "title": "Compatible spaCy version identifier", - "type": "string", - "minLength": 1, - "pattern": "^[0-9a-z.-><=]*$", - }, - "parent_package": { - "title": "Name of parent spaCy package, e.g. 
spacy or spacy-nightly", - "type": "string", - "minLength": 1, - "default": "spacy", - }, - "pipeline": { - "title": "Names of pipeline components", - "type": "array", - "items": {"type": "string", "minLength": 1}, - }, - "description": {"title": "Model description", "type": "string"}, - "license": {"title": "Model license", "type": "string"}, - "author": {"title": "Model author name", "type": "string"}, - "email": {"title": "Model author email", "type": "string", "format": "email"}, - "url": {"title": "Model author URL", "type": "string", "format": "uri"}, - "sources": { - "title": "Training data sources", - "type": "array", - "items": {"type": "string"}, - }, - "vectors": { - "title": "Included word vectors", - "type": "object", - "properties": { - "keys": { - "title": "Number of unique keys", - "type": "integer", - "minimum": 0, - }, - "vectors": { - "title": "Number of unique vectors", - "type": "integer", - "minimum": 0, - }, - "width": { - "title": "Number of dimensions", - "type": "integer", - "minimum": 0, - }, - }, - }, - "accuracy": { - "title": "Accuracy numbers", - "type": "object", - "patternProperties": {"*": {"type": "number", "minimum": 0.0}}, - }, - "speed": { - "title": "Speed evaluation numbers", - "type": "object", - "patternProperties": { - "*": { - "oneOf": [ - {"type": "number", "minimum": 0.0}, - {"type": "integer", "minimum": 0}, - ] - } - }, - }, - }, - "required": ["lang", "name", "version"], -} diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index fa867fa04..2ffbeb458 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - -import plac from pathlib import Path from wasabi import Printer import srsly @@ -29,27 +25,20 @@ FILE_TYPES = ("json", "jsonl", "msg") FILE_TYPES_STDOUT = ("json", "jsonl") -@plac.annotations( - input_file=("Input file", "positional", None, str), - output_dir=("Output directory. '-' for stdout.", "positional", None, str), - file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str), - n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int), - seg_sents=("Segment sentences (for -c ner)", "flag", "s"), - model=("Model for sentence segmentation (for -s)", "option", "b", str), - converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), - lang=("Language (if tokenizer required)", "option", "l", str), - morphology=("Enable appending morphology to tags", "flag", "m", bool), -) def convert( - input_file, - output_dir="-", - file_type="json", - n_sents=1, - seg_sents=False, - model=None, - morphology=False, - converter="auto", - lang=None, + # fmt: off + input_file: ("Input file", "positional", None, str), + output_dir: ("Output directory. 
'-' for stdout.", "positional", None, str) = "-", + file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json", + n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1, + seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, + model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, + morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, + merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False, + converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", + ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, + lang: ("Language (if tokenizer required)", "option", "l", str) = None, + # fmt: on ): """ Convert files into JSON format for use with train command and other @@ -60,16 +49,10 @@ def convert( no_print = output_dir == "-" msg = Printer(no_print=no_print) input_path = Path(input_file) - if file_type not in FILE_TYPES: - msg.fail( - "Unknown file type: '{}'".format(file_type), - "Supported file types: '{}'".format(", ".join(FILE_TYPES)), - exits=1, - ) if file_type not in FILE_TYPES_STDOUT and output_dir == "-": # TODO: support msgpack via stdout in srsly? msg.fail( - "Can't write .{} data to stdout.".format(file_type), + f"Can't write .{file_type} data to stdout", "Please specify an output directory.", exits=1, ) @@ -93,21 +76,26 @@ def convert( "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert" ) if converter not in CONVERTERS: - msg.fail("Can't find converter for {}".format(converter), exits=1) + msg.fail(f"Can't find converter for {converter}", exits=1) + ner_map = None + if ner_map_path is not None: + ner_map = srsly.read_json(ner_map_path) # Use converter function to convert data func = CONVERTERS[converter] data = func( input_data, n_sents=n_sents, seg_sents=seg_sents, - use_morphology=morphology, + append_morphology=morphology, + merge_subtokens=merge_subtokens, lang=lang, model=model, no_print=no_print, + ner_map=ner_map, ) if output_dir != "-": # Export data to a file - suffix = ".{}".format(file_type) + suffix = f".{file_type}" output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) if file_type == "json": srsly.write_json(output_file, data) @@ -115,9 +103,7 @@ def convert( srsly.write_jsonl(output_file, data) elif file_type == "msg": srsly.write_msgpack(output_file, data) - msg.good( - "Generated output file ({} documents): {}".format(len(data), output_file) - ) + msg.good(f"Generated output file ({len(data)} documents): {output_file}") else: # Print to stdout if file_type == "json": diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py index 46489ad7c..b607d5913 100644 --- a/spacy/cli/converters/conll_ner2json.py +++ b/spacy/cli/converters/conll_ner2json.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from wasabi import Printer from ...gold import iob_to_biluo @@ -64,9 +61,9 @@ def conll_ner2json( # sentence segmentation required for document segmentation if n_sents > 0 and not seg_sents: msg.warn( - "No sentence boundaries found to use with option `-n {}`. " - "Use `-s` to automatically segment sentences or `-n 0` " - "to disable.".format(n_sents) + f"No sentence boundaries found to use with option `-n {n_sents}`. " + f"Use `-s` to automatically segment sentences or `-n 0` " + f"to disable." 
) else: n_sents_info(msg, n_sents) @@ -129,7 +126,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None): if model: nlp = load_model(model) if "parser" in nlp.pipe_names: - msg.info("Segmenting sentences with parser from model '{}'.".format(model)) + msg.info(f"Segmenting sentences with parser from model '{model}'.") sentencizer = nlp.get_pipe("parser") if not sentencizer: msg.info( @@ -166,7 +163,7 @@ def segment_docs(input_data, n_sents, doc_delimiter): def n_sents_info(msg, n_sents): - msg.info("Grouping every {} sentences into a document.".format(n_sents)) + msg.info(f"Grouping every {n_sents} sentences into a document.") if n_sents == 1: msg.warn( "To generate better training data, you may want to group " diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 3de4dcc30..ecdc2ae66 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -1,141 +1,348 @@ -# coding: utf8 -from __future__ import unicode_literals - import re -from ...gold import iob_to_biluo +from ...gold import Example +from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets +from ...language import Language +from ...tokens import Doc, Token +from .conll_ner2json import n_sents_info +from wasabi import Printer -def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): +def conllu2json( + input_data, + n_sents=10, + append_morphology=False, + lang=None, + ner_map=None, + merge_subtokens=False, + no_print=False, + **_ +): """ Convert conllu files into JSON format for use with train cli. - use_morphology parameter enables appending morphology to tags, which is + append_morphology parameter enables appending morphology to tags, which is useful for languages such as Spanish, where UD tags are not so rich. Extract NER tags if available and convert them so that they follow BILUO and the Wikipedia scheme """ - # by @dvsrepo, via #11 explosion/spacy-dev-resources - # by @katarkor + MISC_NER_PATTERN = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?" 
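+ # Editor's note (illustrative, not part of the original patch): re.search + # with MISC_NER_PATTERN on a MISC value like "name=U-PER|SpaceAfter=No" + # yields groups ("U-PER", "U", "PER"); a bare "O" also matches, with groups + # 2 and 3 unset, which get_entities() below treats as "no entity".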
+ msg = Printer(no_print=no_print) + n_sents_info(msg, n_sents) docs = [] + raw = "" sentences = [] - conll_tuples = read_conllx(input_data, use_morphology=use_morphology) - checked_for_ner = False - has_ner_tags = False - for i, (raw_text, tokens) in enumerate(conll_tuples): - sentence, brackets = tokens[0] - if not checked_for_ner: - has_ner_tags = is_ner(sentence[5][0]) - checked_for_ner = True - sentences.append(generate_sentence(sentence, has_ner_tags)) + conll_data = read_conllx( + input_data, + append_morphology=append_morphology, + ner_tag_pattern=MISC_NER_PATTERN, + ner_map=ner_map, + merge_subtokens=merge_subtokens, + ) + has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN) + for i, example in enumerate(conll_data): + raw += example.text + sentences.append( + generate_sentence( + example.token_annotation, + has_ner_tags, + MISC_NER_PATTERN, + ner_map=ner_map, + ) + ) # Real-sized documents could be extracted using the comments on the - # conluu document + # conllu document if len(sentences) % n_sents == 0: - doc = create_doc(sentences, i) + doc = create_json_doc(raw, sentences, i) docs.append(doc) + raw = "" sentences = [] if sentences: - doc = create_doc(sentences, i) + doc = create_json_doc(raw, sentences, i) docs.append(doc) return docs -def is_ner(tag): +def has_ner(input_data, ner_tag_pattern): """ Check the 10th column of the first token to determine if the file contains NER tags """ - tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag) - if tag_match: - return True - elif tag == "O": - return True - else: - return False - - -def read_conllx(input_data, use_morphology=False, n=0): - i = 0 for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: while lines[0].startswith("#"): lines.pop(0) - tokens = [] - for line in lines: - - parts = line.split("\t") - id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts - if "-" in id_ or "." in id_: - continue - try: - id_ = int(id_) - 1 - head = (int(head) - 1) if head not in ["0", "_"] else id_ - dep = "ROOT" if dep == "root" else dep - tag = pos if tag == "_" else tag - tag = tag + "__" + morph if use_morphology else tag - iob = iob if iob else "O" - tokens.append((id_, word, tag, head, dep, iob)) - except: # noqa: E722 - print(line) - raise - tuples = [list(t) for t in zip(*tokens)] - yield (None, [[tuples, []]]) - i += 1 - if n >= 1 and i >= n: - break + if lines: + parts = lines[0].split("\t") + id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts + if re.search(ner_tag_pattern, misc): + return True + else: + return False -def simplify_tags(iob): +def read_conllx( + input_data, + append_morphology=False, + merge_subtokens=False, + ner_tag_pattern="", + ner_map=None, +): + """ Yield examples, one for each sentence """ + vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc + for sent in input_data.strip().split("\n\n"): + lines = sent.strip().split("\n") + if lines: + while lines[0].startswith("#"): + lines.pop(0) + example = example_from_conllu_sentence( + vocab, + lines, + ner_tag_pattern, + merge_subtokens=merge_subtokens, + append_morphology=append_morphology, + ner_map=ner_map, + ) + yield example + + +def get_entities(lines, tag_pattern, ner_map=None): + """Find entities in the MISC column according to the pattern and map to + final entity type with `ner_map` if mapping present. Entity tag is 'O' if + the pattern is not matched. 
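+ For example (editor's illustration, not from the original patch): with + ner_map {"GPE_LOC": "LOC"}, the MISC value "name=B-GPE_LOC" is first mapped + to "B-LOC", while mapping a type to the empty string yields "O"; the + resulting tags are then converted to BILUO via iob_to_biluo.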
+
+    lines (unicode): CoNLL-U lines for one sentence
+    tag_pattern (unicode): Regex pattern for entity tag
+    ner_map (dict): Map old NER tag names to new ones, '' maps to O.
+    RETURNS (list): List of BILUO entity tags
     """
-    Simplify tags obtained from the dataset in order to follow Wikipedia
-    scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
-    'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
-    'MISC'.
-    """
-    new_iob = []
-    for tag in iob:
-        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+    miscs = []
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "-" in id_ or "." in id_:
+            continue
+        miscs.append(misc)
+
+    iob = []
+    for misc in miscs:
+        tag_match = re.search(tag_pattern, misc)
+        iob_tag = "O"
         if tag_match:
-            prefix = tag_match.group(1)
-            suffix = tag_match.group(2)
-            if suffix == "GPE_LOC":
-                suffix = "LOC"
-            elif suffix == "GPE_ORG":
-                suffix = "ORG"
-            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
-                suffix = "MISC"
-            tag = prefix + "-" + suffix
-        new_iob.append(tag)
-    return new_iob
+            prefix = tag_match.group(2)
+            suffix = tag_match.group(3)
+            if prefix and suffix:
+                iob_tag = prefix + "-" + suffix
+                if ner_map:
+                    suffix = ner_map.get(suffix, suffix)
+                    if suffix == "":
+                        iob_tag = "O"
+                    else:
+                        iob_tag = prefix + "-" + suffix
+        iob.append(iob_tag)
+    return iob_to_biluo(iob)
 
 
-def generate_sentence(sent, has_ner_tags):
-    (id_, word, tag, head, dep, iob) = sent
+def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
     sentence = {}
     tokens = []
-    if has_ner_tags:
-        iob = simplify_tags(iob)
-        biluo = iob_to_biluo(iob)
-    for i, id in enumerate(id_):
+    for i, id_ in enumerate(token_annotation.ids):
         token = {}
-        token["id"] = id
-        token["orth"] = word[i]
-        token["tag"] = tag[i]
-        token["head"] = head[i] - id
-        token["dep"] = dep[i]
+        token["id"] = id_
+        token["orth"] = token_annotation.get_word(i)
+        token["tag"] = token_annotation.get_tag(i)
+        token["pos"] = token_annotation.get_pos(i)
+        token["lemma"] = token_annotation.get_lemma(i)
+        token["morph"] = token_annotation.get_morph(i)
+        token["head"] = token_annotation.get_head(i) - id_
+        token["dep"] = token_annotation.get_dep(i)
         if has_ner_tags:
-            token["ner"] = biluo[i]
+            token["ner"] = token_annotation.get_entity(i)
         tokens.append(token)
     sentence["tokens"] = tokens
     return sentence
 
 
-def create_doc(sentences, id):
+def create_json_doc(raw, sentences, id_):
    doc = {}
    paragraph = {}
-    doc["id"] = id
+    doc["id"] = id_
    doc["paragraphs"] = []
+    paragraph["raw"] = raw.strip()
    paragraph["sentences"] = sentences
    doc["paragraphs"].append(paragraph)
    return doc
+
+
+def example_from_conllu_sentence(
+    vocab,
+    lines,
+    ner_tag_pattern,
+    merge_subtokens=False,
+    append_morphology=False,
+    ner_map=None,
+):
+    """Create an Example from the lines for one CoNLL-U sentence, merging
+    subtokens and appending morphology to tags if required. 
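+
+    Each token line carries the 10 tab-separated CoNLL-U fields (ID, FORM,
+    LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC). Range IDs such as
+    "1-2" mark subtoken spans; decimal IDs such as "1.1" (empty nodes) are
+    skipped.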
+
+    lines (unicode): The non-comment lines for a CoNLL-U sentence
+    ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
+    RETURNS (Example): An example containing the annotation
+    """
+    # create a Doc with each subtoken as its own token
+    # if merging subtokens, each subtoken orth is the merged subtoken form
+    if not Token.has_extension("merged_orth"):
+        Token.set_extension("merged_orth", default="")
+    if not Token.has_extension("merged_lemma"):
+        Token.set_extension("merged_lemma", default="")
+    if not Token.has_extension("merged_morph"):
+        Token.set_extension("merged_morph", default="")
+    if not Token.has_extension("merged_spaceafter"):
+        Token.set_extension("merged_spaceafter", default="")
+    words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
+    heads, deps = [], []
+    subtok_word = ""
+    in_subtok = False
+    for i in range(len(lines)):
+        line = lines[i]
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "." in id_:
+            continue
+        if "-" in id_:
+            in_subtok = True
+            subtok_word = word
+            subtok_start, subtok_end = id_.split("-")
+            subtok_spaceafter = "SpaceAfter=No" not in misc
+            continue
+        if merge_subtokens and in_subtok:
+            words.append(subtok_word)
+        else:
+            words.append(word)
+        if in_subtok:
+            if id_ == subtok_end:
+                spaces.append(subtok_spaceafter)
+            else:
+                spaces.append(False)
+        elif "SpaceAfter=No" in misc:
+            spaces.append(False)
+        else:
+            spaces.append(True)
+        if in_subtok and id_ == subtok_end:
+            subtok_word = ""
+            in_subtok = False
+        id_ = int(id_) - 1
+        head = (int(head) - 1) if head not in ("0", "_") else id_
+        tag = pos if tag == "_" else tag
+        morph = morph if morph != "_" else ""
+        dep = "ROOT" if dep == "root" else dep
+        lemmas.append(lemma)
+        poses.append(pos)
+        tags.append(tag)
+        morphs.append(morph)
+        heads.append(head)
+        deps.append(dep)
+
+    doc = Doc(vocab, words=words, spaces=spaces)
+    for i in range(len(doc)):
+        doc[i].tag_ = tags[i]
+        doc[i].pos_ = poses[i]
+        doc[i].dep_ = deps[i]
+        doc[i].lemma_ = lemmas[i]
+        doc[i].head = doc[heads[i]]
+        doc[i]._.merged_orth = words[i]
+        doc[i]._.merged_morph = morphs[i]
+        doc[i]._.merged_lemma = lemmas[i]
+        doc[i]._.merged_spaceafter = spaces[i]
+    ents = get_entities(lines, ner_tag_pattern, ner_map)
+    doc.ents = spans_from_biluo_tags(doc, ents)
+    doc.is_parsed = True
+    doc.is_tagged = True
+
+    if merge_subtokens:
+        doc = merge_conllu_subtokens(lines, doc)
+
+    # create Example from custom Doc annotation
+    ids, words, tags, heads, deps = [], [], [], [], []
+    pos, lemmas, morphs, spaces = [], [], [], []
+    for i, t in enumerate(doc):
+        ids.append(i)
+        words.append(t._.merged_orth)
+        if append_morphology and t._.merged_morph:
+            tags.append(t.tag_ + "__" + t._.merged_morph)
+        else:
+            tags.append(t.tag_)
+        pos.append(t.pos_)
+        morphs.append(t._.merged_morph)
+        lemmas.append(t._.merged_lemma)
+        heads.append(t.head.i)
+        deps.append(t.dep_)
+        spaces.append(t._.merged_spaceafter)
+    ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
+    ents = biluo_tags_from_offsets(doc, ent_offsets)
+    raw = ""
+    for word, space in zip(words, spaces):
+        raw += word
+        if space:
+            raw += " "
+    example = Example(doc=raw)
+    example.set_token_annotation(
+        ids=ids,
+        words=words,
+        tags=tags,
+        pos=pos,
+        morphs=morphs,
+        lemmas=lemmas,
+        heads=heads,
+        deps=deps,
+        entities=ents,
+    )
+    return example
+
+
+def merge_conllu_subtokens(lines, doc):
+    # identify and process all subtoken spans to prepare attrs for merging
+    
subtok_spans = [] + for line in lines: + parts = line.split("\t") + id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts + if "-" in id_: + subtok_start, subtok_end = id_.split("-") + subtok_span = doc[int(subtok_start) - 1 : int(subtok_end)] + subtok_spans.append(subtok_span) + # create merged tag, morph, and lemma values + tags = [] + morphs = {} + lemmas = [] + for token in subtok_span: + tags.append(token.tag_) + lemmas.append(token.lemma_) + if token._.merged_morph: + for feature in token._.merged_morph.split("|"): + field, values = feature.split("=", 1) + if field not in morphs: + morphs[field] = set() + for value in values.split(","): + morphs[field].add(value) + # create merged features for each morph field + for field, values in morphs.items(): + morphs[field] = field + "=" + ",".join(sorted(values)) + # set the same attrs on all subtok tokens so that whatever head the + # retokenizer chooses, the final attrs are available on that token + for token in subtok_span: + token._.merged_orth = token.orth_ + token._.merged_lemma = " ".join(lemmas) + token.tag_ = "_".join(tags) + token._.merged_morph = "|".join(sorted(morphs.values())) + token._.merged_spaceafter = ( + True if subtok_span[-1].whitespace_ else False + ) + + with doc.retokenize() as retokenizer: + for span in subtok_spans: + retokenizer.merge(span) + + return doc diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 61c398f8d..b6ac234fc 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from wasabi import Printer from ...gold import iob_to_biluo diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py index 1c1bc45c7..525063b22 100644 --- a/spacy/cli/converters/jsonl2json.py +++ b/spacy/cli/converters/jsonl2json.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import srsly from ...gold import docs_to_json diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 4b12052c3..1705bf446 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -1,9 +1,5 @@ -# coding: utf8 -from __future__ import unicode_literals, print_function - from pathlib import Path from collections import Counter -import plac import sys import srsly from wasabi import Printer, MESSAGES @@ -22,30 +18,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100 BLANK_MODEL_THRESHOLD = 2000 -@plac.annotations( - lang=("model language", "positional", None, str), - train_path=("location of JSON-formatted training data", "positional", None, Path), - dev_path=("location of JSON-formatted development data", "positional", None, Path), - base_model=("name of model to update (optional)", "option", "b", str), - pipeline=( - "Comma-separated names of pipeline components to train", - "option", - "p", - str, - ), - ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), - verbose=("Print additional information and explanations", "flag", "V", bool), - no_format=("Don't pretty-print the results", "flag", "NF", bool), -) def debug_data( - lang, - train_path, - dev_path, - base_model=None, - pipeline="tagger,parser,ner", - ignore_warnings=False, - verbose=False, - no_format=False, + # fmt: off + lang: ("Model language", "positional", None, str), + train_path: ("Location of JSON-formatted training data", "positional", None, Path), + dev_path: ("Location of JSON-formatted development data", "positional", None, Path), + tag_map_path: 
("Location of JSON-formatted tag map", "option", "tm", Path) = None, + base_model: ("Name of model to update (optional)", "option", "b", str) = None, + pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner", + ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False, + verbose: ("Print additional information and explanations", "flag", "V", bool) = False, + no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False, + # fmt: on ): """ Analyze, debug and validate your training and development data, get useful @@ -60,6 +44,10 @@ def debug_data( if not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) + # Initialize the model and pipeline pipeline = [p.strip() for p in pipeline.split(",")] if base_model: @@ -67,6 +55,8 @@ def debug_data( else: lang_cls = get_lang_class(lang) nlp = lang_cls() + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) msg.divider("Data format validation") @@ -80,20 +70,16 @@ def debug_data( with msg.loading("Loading corpus..."): corpus = GoldCorpus(train_path, dev_path) try: - train_docs = list(corpus.train_docs(nlp)) - train_docs_unpreprocessed = list( - corpus.train_docs_without_preprocessing(nlp) + train_dataset = list(corpus.train_dataset(nlp)) + train_dataset_unpreprocessed = list( + corpus.train_dataset_without_preprocessing(nlp) ) except ValueError as e: - loading_train_error_message = "Training data cannot be loaded: {}".format( - str(e) - ) + loading_train_error_message = f"Training data cannot be loaded: {e}" try: - dev_docs = list(corpus.dev_docs(nlp)) + dev_dataset = list(corpus.dev_dataset(nlp)) except ValueError as e: - loading_dev_error_message = "Development data cannot be loaded: {}".format( - str(e) - ) + loading_dev_error_message = f"Development data cannot be loaded: {e}" if loading_train_error_message or loading_dev_error_message: if loading_train_error_message: msg.fail(loading_train_error_message) @@ -102,80 +88,68 @@ def debug_data( sys.exit(1) msg.good("Corpus is loadable") - # Create all gold data here to avoid iterating over the train_docs constantly - gold_train_data = _compile_gold(train_docs, pipeline) - gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline) - gold_dev_data = _compile_gold(dev_docs, pipeline) + # Create all gold data here to avoid iterating over the train_dataset constantly + gold_train_data = _compile_gold(train_dataset, pipeline) + gold_train_unpreprocessed_data = _compile_gold( + train_dataset_unpreprocessed, pipeline + ) + gold_dev_data = _compile_gold(dev_dataset, pipeline) train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] msg.divider("Training stats") - msg.text("Training pipeline: {}".format(", ".join(pipeline))) + msg.text(f"Training pipeline: {', '.join(pipeline)}") for pipe in [p for p in pipeline if p not in nlp.factories]: - msg.fail("Pipeline component '{}' not available in factories".format(pipe)) + msg.fail(f"Pipeline component '{pipe}' not available in factories") if base_model: - msg.text("Starting with base model '{}'".format(base_model)) + msg.text(f"Starting with base model '{base_model}'") else: - msg.text("Starting with blank model '{}'".format(lang)) - msg.text("{} training docs".format(len(train_docs))) - msg.text("{} evaluation docs".format(len(dev_docs))) + msg.text(f"Starting with blank model 
'{lang}'") + msg.text(f"{len(train_dataset)} training docs") + msg.text(f"{len(dev_dataset)} evaluation docs") - if not len(dev_docs): + if not len(gold_dev_data): msg.fail("No evaluation docs") overlap = len(train_texts.intersection(dev_texts)) if overlap: - msg.warn("{} training examples also in evaluation data".format(overlap)) + msg.warn(f"{overlap} training examples also in evaluation data") else: msg.good("No overlap between training and evaluation data") - if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD: - text = "Low number of examples to train from a blank model ({})".format( - len(train_docs) + if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD: + text = ( + f"Low number of examples to train from a blank model ({len(train_dataset)})" ) - if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD: + if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD: msg.fail(text) else: msg.warn(text) msg.text( - "It's recommended to use at least {} examples (minimum {})".format( - BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD - ), + f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples " + f"(minimum {BLANK_MODEL_MIN_THRESHOLD})", show=verbose, ) msg.divider("Vocab & Vectors") n_words = gold_train_data["n_words"] msg.info( - "{} total {} in the data ({} unique)".format( - n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"]) - ) + f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)" ) if gold_train_data["n_misaligned_words"] > 0: - msg.warn( - "{} misaligned tokens in the training data".format( - gold_train_data["n_misaligned_words"] - ) - ) + n_misaligned = gold_train_data["n_misaligned_words"] + msg.warn(f"{n_misaligned} misaligned tokens in the training data") if gold_dev_data["n_misaligned_words"] > 0: - msg.warn( - "{} misaligned tokens in the dev data".format( - gold_dev_data["n_misaligned_words"] - ) - ) + n_misaligned = gold_dev_data["n_misaligned_words"] + msg.warn(f"{n_misaligned} misaligned tokens in the dev data") most_common_words = gold_train_data["words"].most_common(10) msg.text( - "10 most common words: {}".format( - _format_labels(most_common_words, counts=True) - ), + f"10 most common words: {_format_labels(most_common_words, counts=True)}", show=verbose, ) if len(nlp.vocab.vectors): msg.info( - "{} vectors ({} unique keys, {} dimensions)".format( - len(nlp.vocab.vectors), - nlp.vocab.vectors.n_keys, - nlp.vocab.vectors_length, - ) + f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} " + f"unique keys, {nlp.vocab.vectors_length} dimensions)" ) else: msg.info("No word vectors present in the model") @@ -183,7 +157,7 @@ def debug_data( if "ner" in pipeline: # Get all unique NER labels present in the data labels = set( - label for label in gold_train_data["ner"] if label not in ("O", "-") + label for label in gold_train_data["ner"] if label not in ("O", "-", None) ) label_counts = gold_train_data["ner"] model_labels = _get_labels_from_model(nlp, "ner") @@ -196,19 +170,10 @@ def debug_data( msg.divider("Named Entity Recognition") msg.info( - "{} new {}, {} existing {}".format( - len(new_labels), - "label" if len(new_labels) == 1 else "labels", - len(existing_labels), - "label" if len(existing_labels) == 1 else "labels", - ) + f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)" ) missing_values = label_counts["-"] - msg.text( - "{} missing {} (tokens with '-' label)".format( - missing_values, "value" if missing_values == 1 else "values" - ) - ) + 
msg.text(f"{missing_values} missing value(s) (tokens with '-' label)") for label in new_labels: if len(label) == 0: msg.fail("Empty label found in new labels") @@ -219,39 +184,28 @@ def debug_data( if label != "-" ] labels_with_counts = _format_labels(labels_with_counts, counts=True) - msg.text("New: {}".format(labels_with_counts), show=verbose) + msg.text(f"New: {labels_with_counts}", show=verbose) if existing_labels: - msg.text( - "Existing: {}".format(_format_labels(existing_labels)), show=verbose - ) - + msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose) if gold_train_data["ws_ents"]: - msg.fail( - "{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"]) - ) + msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans") has_ws_ents_error = True if gold_train_data["punct_ents"]: - msg.warn( - "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"]) - ) + msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation") has_punct_ents_warning = True for label in new_labels: if label_counts[label] <= NEW_LABEL_THRESHOLD: msg.warn( - "Low number of examples for new label '{}' ({})".format( - label, label_counts[label] - ) + f"Low number of examples for new label '{label}' ({label_counts[label]})" ) has_low_data_warning = True with msg.loading("Analyzing label distribution..."): - neg_docs = _get_examples_without_label(train_docs, label) + neg_docs = _get_examples_without_label(train_dataset, label) if neg_docs == 0: - msg.warn( - "No examples for texts WITHOUT new label '{}'".format(label) - ) + msg.warn(f"No examples for texts WITHOUT new label '{label}'") has_no_neg_warning = True if not has_low_data_warning: @@ -265,8 +219,8 @@ def debug_data( if has_low_data_warning: msg.text( - "To train a new entity type, your data should include at " - "least {} instances of the new label".format(NEW_LABEL_THRESHOLD), + f"To train a new entity type, your data should include at " + f"least {NEW_LABEL_THRESHOLD} instances of the new label", show=verbose, ) if has_no_neg_warning: @@ -295,27 +249,21 @@ def debug_data( new_labels = [l for l in labels if l not in model_labels] existing_labels = [l for l in labels if l in model_labels] msg.info( - "Text Classification: {} new label(s), {} existing label(s)".format( - len(new_labels), len(existing_labels) - ) + f"Text Classification: {len(new_labels)} new label(s), " + f"{len(existing_labels)} existing label(s)" ) if new_labels: labels_with_counts = _format_labels( gold_train_data["cats"].most_common(), counts=True ) - msg.text("New: {}".format(labels_with_counts), show=verbose) + msg.text(f"New: {labels_with_counts}", show=verbose) if existing_labels: - msg.text( - "Existing: {}".format(_format_labels(existing_labels)), show=verbose - ) + msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose) if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]): msg.fail( - "The train and dev labels are not the same. " - "Train labels: {}. " - "Dev labels: {}.".format( - _format_labels(gold_train_data["cats"]), - _format_labels(gold_dev_data["cats"]), - ) + f"The train and dev labels are not the same. " + f"Train labels: {_format_labels(gold_train_data['cats'])}. " + f"Dev labels: {_format_labels(gold_dev_data['cats'])}." 
) if gold_train_data["n_cats_multilabel"] > 0: msg.info( @@ -344,28 +292,17 @@ def debug_data( if "tagger" in pipeline: msg.divider("Part-of-speech Tagging") labels = [label for label in gold_train_data["tags"]] - tag_map = nlp.Defaults.tag_map - msg.info( - "{} {} in data ({} {} in tag map)".format( - len(labels), - "label" if len(labels) == 1 else "labels", - len(tag_map), - "label" if len(tag_map) == 1 else "labels", - ) - ) + tag_map = nlp.vocab.morphology.tag_map + msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)") labels_with_counts = _format_labels( gold_train_data["tags"].most_common(), counts=True ) msg.text(labels_with_counts, show=verbose) non_tagmap = [l for l in labels if l not in tag_map] if not non_tagmap: - msg.good("All labels present in tag map for language '{}'".format(nlp.lang)) + msg.good(f"All labels present in tag map for language '{nlp.lang}'") for label in non_tagmap: - msg.fail( - "Label '{}' not found in tag map for language '{}'".format( - label, nlp.lang - ) - ) + msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'") if "parser" in pipeline: has_low_data_warning = False @@ -373,21 +310,18 @@ def debug_data( # profile sentence length msg.info( - "Found {} sentence{} with an average length of {:.1f} words.".format( - gold_train_data["n_sents"], - "s" if len(train_docs) > 1 else "", - gold_train_data["n_words"] / gold_train_data["n_sents"], - ) + f"Found {gold_train_data['n_sents']} sentence(s) with an average " + f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words." ) # check for documents with multiple sentences sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"]) if sents_per_doc < 1.1: msg.warn( - "The training data contains {:.2f} sentences per " - "document. When there are very few documents containing more " - "than one sentence, the parser will not learn how to segment " - "longer texts into sentences.".format(sents_per_doc) + f"The training data contains {sents_per_doc:.2f} sentences per " + f"document. When there are very few documents containing more " + f"than one sentence, the parser will not learn how to segment " + f"longer texts into sentences." 
)
 
     # profile labels
@@ -398,32 +332,13 @@ def debug_data(
         labels_dev = [label for label in gold_dev_data["deps"]]
 
         if gold_train_unpreprocessed_data["n_nonproj"] > 0:
-            msg.info(
-                "Found {} nonprojective train sentence{}".format(
-                    gold_train_unpreprocessed_data["n_nonproj"],
-                    "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
-                )
-            )
+            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
+            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
         if gold_dev_data["n_nonproj"] > 0:
-            msg.info(
-                "Found {} nonprojective dev sentence{}".format(
-                    gold_dev_data["n_nonproj"],
-                    "s" if gold_dev_data["n_nonproj"] > 1 else "",
-                )
-            )
-
-        msg.info(
-            "{} {} in train data".format(
-                len(labels_train_unpreprocessed),
-                "label" if len(labels_train) == 1 else "labels",
-            )
-        )
-        msg.info(
-            "{} {} in projectivized train data".format(
-                len(labels_train), "label" if len(labels_train) == 1 else "labels"
-            )
-        )
-
+            n_nonproj = gold_dev_data["n_nonproj"]
+            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
+        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
+        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
         labels_with_counts = _format_labels(
             gold_train_unpreprocessed_data["deps"].most_common(), counts=True
         )
@@ -433,9 +348,8 @@ def debug_data(
         for label in gold_train_unpreprocessed_data["deps"]:
             if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
                 msg.warn(
-                    "Low number of examples for label '{}' ({})".format(
-                        label, gold_train_unpreprocessed_data["deps"][label]
-                    )
+                    f"Low number of examples for label '{label}' "
+                    f"({gold_train_unpreprocessed_data['deps'][label]})"
                 )
                 has_low_data_warning = True
 
@@ -444,22 +358,19 @@ def debug_data(
         for label in gold_train_data["deps"]:
             if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
                 rare_projectivized_labels.append(
-                    "{}: {}".format(label, str(gold_train_data["deps"][label]))
+                    f"{label}: {gold_train_data['deps'][label]}"
                 )
 
         if len(rare_projectivized_labels) > 0:
             msg.warn(
-                "Low number of examples for {} label{} in the "
-                "projectivized dependency trees used for training. You may "
-                "want to projectivize labels such as punct before "
-                "training in order to improve parser performance.".format(
-                    len(rare_projectivized_labels),
-                    "s" if len(rare_projectivized_labels) > 1 else "",
-                )
+                f"Low number of examples for {len(rare_projectivized_labels)} "
+                "label(s) in the projectivized dependency trees used for "
+                "training. You may want to projectivize labels such as punct "
+                "before training in order to improve parser performance."
) msg.warn( - "Projectivized labels with low numbers of examples: " - "{}".format("\n".join(rare_projectivized_labels)), + f"Projectivized labels with low numbers of examples: ", + ", ".join(rare_projectivized_labels), show=verbose, ) has_low_data_warning = True @@ -467,50 +378,44 @@ def debug_data( # labels only in train if set(labels_train) - set(labels_dev): msg.warn( - "The following labels were found only in the train data: " - "{}".format(", ".join(set(labels_train) - set(labels_dev))), + "The following labels were found only in the train data:", + ", ".join(set(labels_train) - set(labels_dev)), show=verbose, ) # labels only in dev if set(labels_dev) - set(labels_train): msg.warn( - "The following labels were found only in the dev data: " - + ", ".join(set(labels_dev) - set(labels_train)), + "The following labels were found only in the dev data:", + ", ".join(set(labels_dev) - set(labels_train)), show=verbose, ) if has_low_data_warning: msg.text( - "To train a parser, your data should include at " - "least {} instances of each label.".format(DEP_LABEL_THRESHOLD), + f"To train a parser, your data should include at " + f"least {DEP_LABEL_THRESHOLD} instances of each label.", show=verbose, ) # multiple root labels if len(gold_train_unpreprocessed_data["roots"]) > 1: msg.warn( - "Multiple root labels ({}) ".format( - ", ".join(gold_train_unpreprocessed_data["roots"]) - ) - + "found in training data. spaCy's parser uses a single root " - "label ROOT so this distinction will not be available." + f"Multiple root labels " + f"({', '.join(gold_train_unpreprocessed_data['roots'])}) " + f"found in training data. spaCy's parser uses a single root " + f"label ROOT so this distinction will not be available." ) # these should not happen, but just in case if gold_train_data["n_nonproj"] > 0: msg.fail( - "Found {} nonprojective projectivized train sentence{}".format( - gold_train_data["n_nonproj"], - "s" if gold_train_data["n_nonproj"] > 1 else "", - ) + f"Found {gold_train_data['n_nonproj']} nonprojective " + f"projectivized train sentence(s)" ) if gold_train_data["n_cycles"] > 0: msg.fail( - "Found {} projectivized train sentence{} with cycles".format( - gold_train_data["n_cycles"], - "s" if gold_train_data["n_cycles"] > 1 else "", - ) + f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles" ) msg.divider("Summary") @@ -518,42 +423,34 @@ def debug_data( warn_counts = msg.counts[MESSAGES.WARN] fail_counts = msg.counts[MESSAGES.FAIL] if good_counts: - msg.good( - "{} {} passed".format( - good_counts, "check" if good_counts == 1 else "checks" - ) - ) + msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed") if warn_counts: - msg.warn( - "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings") - ) - if fail_counts: - msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors")) - + msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}") if fail_counts: + msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}") sys.exit(1) def _load_file(file_path, msg): file_name = file_path.parts[-1] if file_path.suffix == ".json": - with msg.loading("Loading {}...".format(file_name)): + with msg.loading(f"Loading {file_name}..."): data = srsly.read_json(file_path) - msg.good("Loaded {}".format(file_name)) + msg.good(f"Loaded {file_name}") return data elif file_path.suffix == ".jsonl": - with msg.loading("Loading {}...".format(file_name)): + with msg.loading(f"Loading {file_name}..."): data = 
srsly.read_jsonl(file_path) - msg.good("Loaded {}".format(file_name)) + msg.good(f"Loaded {file_name}") return data msg.fail( - "Can't load file extension {}".format(file_path.suffix), + f"Can't load file extension {file_path.suffix}", "Expected .json or .jsonl", exits=1, ) -def _compile_gold(train_docs, pipeline): +def _compile_gold(examples, pipeline): data = { "ner": Counter(), "cats": Counter(), @@ -571,7 +468,9 @@ def _compile_gold(train_docs, pipeline): "n_cats_multilabel": 0, "texts": set(), } - for doc, gold in train_docs: + for example in examples: + gold = example.gold + doc = example.doc valid_words = [x for x in gold.words if x is not None] data["words"].update(valid_words) data["n_words"] += len(valid_words) @@ -584,7 +483,13 @@ def _compile_gold(train_docs, pipeline): if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: # "Illegal" whitespace entity data["ws_ents"] += 1 - if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]: + if label.startswith(("B-", "U-", "L-")) and doc[i].text in [ + ".", + "'", + "!", + "?", + ",", + ]: # punctuation entity: could be replaced by whitespace when training with noise, # so add a warning to alert the user to this unexpected side effect. data["punct_ents"] += 1 @@ -614,14 +519,18 @@ def _compile_gold(train_docs, pipeline): def _format_labels(labels, counts=False): if counts: - return ", ".join(["'{}' ({})".format(l, c) for l, c in labels]) - return ", ".join(["'{}'".format(l) for l in labels]) + return ", ".join([f"'{l}' ({c})" for l, c in labels]) + return ", ".join([f"'{l}'" for l in labels]) def _get_examples_without_label(data, label): count = 0 - for doc, gold in data: - labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")] + for ex in data: + labels = [ + label.split("-")[1] + for label in ex.gold.ner + if label not in ("O", "-", None) + ] if label not in labels: count += 1 return count diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 19f3e7860..0230e272d 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,28 +1,21 @@ -# coding: utf8 -from __future__ import unicode_literals - -import plac import requests import os import subprocess import sys from wasabi import msg -from .link import link -from ..util import get_package_path from .. import about -@plac.annotations( - model=("Model to download (shortcut or name)", "positional", None, str), - direct=("Force direct download of name + version", "flag", "d", bool), - pip_args=("Additional arguments to be passed to `pip install` on model install"), -) -def download(model, direct=False, *pip_args): +def download( + model: ("Model to download (shortcut or name)", "positional", None, str), + direct: ("Force direct download of name + version", "flag", "d", bool) = False, + *pip_args: ("Additional arguments to be passed to `pip install` on model install"), +): """ - Download compatible model from default download path using pip. Model - can be shortcut, model name or, if --direct flag is set, full model name - with version. For direct downloads, the compatibility check will be skipped. + Download compatible model from default download path using pip. If --direct + flag is set, the command expects the full model name with version. + For direct downloads, the compatibility check will be skipped. 
""" if not require_package("spacy") and "--no-deps" not in pip_args: msg.warn( @@ -50,30 +43,8 @@ def download(model, direct=False, *pip_args): sys.exit(dl) msg.good( "Download and installation successful", - "You can now load the model via spacy.load('{}')".format(model_name), + f"You can now load the model via spacy.load('{model_name}')", ) - # Only create symlink if the model is installed via a shortcut like 'en'. - # There's no real advantage over an additional symlink for en_core_web_sm - # and if anything, it's more error prone and causes more confusion. - if model in shortcuts: - try: - # Get package path here because link uses - # pip.get_installed_distributions() to check if model is a - # package, which fails if model was just installed via - # subprocess - package_path = get_package_path(model_name) - link(model_name, model, force=True, model_path=package_path) - except: # noqa: E722 - # Dirty, but since spacy.download and the auto-linking is - # mostly a convenience wrapper, it's best to show a success - # message and loading instructions, even if linking fails. - msg.warn( - "Download successful but linking failed", - "Creating a shortcut link for '{}' didn't work (maybe you " - "don't have admin permissions?), but you can still load " - "the model via its full package name: " - "nlp = spacy.load('{}')".format(model, model_name), - ) # If a model is downloaded and then loaded within the same process, our # is_package check currently fails, because pkg_resources.working_set # is not refreshed automatically (see #3923). We're trying to work @@ -95,11 +66,11 @@ def get_json(url, desc): r = requests.get(url) if r.status_code != 200: msg.fail( - "Server error ({})".format(r.status_code), - "Couldn't fetch {}. Please find a model for your spaCy " - "installation (v{}), and download it manually. For more " - "details, see the documentation: " - "https://spacy.io/usage/models".format(desc, about.__version__), + f"Server error ({r.status_code})", + f"Couldn't fetch {desc}. Please find a model for your spaCy " + f"installation (v{about.__version__}), and download it manually. " + f"For more details, see the documentation: " + f"https://spacy.io/usage/models", exits=1, ) return r.json() @@ -111,7 +82,7 @@ def get_compatibility(): comp_table = get_json(about.__compatibility__, "compatibility table") comp = comp_table["spacy"] if version not in comp: - msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1) + msg.fail(f"No compatible models found for v{version} of spaCy", exits=1) return comp[version] @@ -119,8 +90,7 @@ def get_version(model, comp): model = model.rsplit(".dev", 1)[0] if model not in comp: msg.fail( - "No compatible model found for '{}' " - "(spaCy v{}).".format(model, about.__version__), + f"No compatible model found for '{model}' (spaCy v{about.__version__})", exits=1, ) return comp[model][0] diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index c24e37038..e047f1283 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals, division, print_function - -import plac from timeit import default_timer as timer from wasabi import msg @@ -10,23 +6,16 @@ from .. import util from .. 
import displacy -@plac.annotations( - model=("Model name or path", "positional", None, str), - data_path=("Location of JSON-formatted evaluation data", "positional", None, str), - gold_preproc=("Use gold preprocessing", "flag", "G", bool), - gpu_id=("Use GPU", "option", "g", int), - displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), - displacy_limit=("Limit of parses to render as HTML", "option", "dl", int), - return_scores=("Return dict containing model scores", "flag", "R", bool), -) def evaluate( - model, - data_path, - gpu_id=-1, - gold_preproc=False, - displacy_path=None, - displacy_limit=25, - return_scores=False, + # fmt: off + model: ("Model name or path", "positional", None, str), + data_path: ("Location of JSON-formatted evaluation data", "positional", None, str), + gpu_id: ("Use GPU", "option", "g", int) = -1, + gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, + displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None, + displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25, + return_scores: ("Return dict containing model scores", "flag", "R", bool) = False, + # fmt: on ): """ Evaluate a model. To render a sample of parses in a HTML file, set an @@ -44,28 +33,31 @@ def evaluate( msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = GoldCorpus(data_path, data_path) nlp = util.load_model(model) - dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) + dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) begin = timer() - scorer = nlp.evaluate(dev_docs, verbose=False) + scorer = nlp.evaluate(dev_dataset, verbose=False) end = timer() - nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) + nwords = sum(len(ex.doc) for ex in dev_dataset) results = { - "Time": "%.2f s" % (end - begin), + "Time": f"{end - begin:.2f} s", "Words": nwords, - "Words/s": "%.0f" % (nwords / (end - begin)), - "TOK": "%.2f" % scorer.token_acc, - "POS": "%.2f" % scorer.tags_acc, - "UAS": "%.2f" % scorer.uas, - "LAS": "%.2f" % scorer.las, - "NER P": "%.2f" % scorer.ents_p, - "NER R": "%.2f" % scorer.ents_r, - "NER F": "%.2f" % scorer.ents_f, - "Textcat": "%.2f" % scorer.textcat_score, + "Words/s": f"{nwords / (end - begin):.0f}", + "TOK": f"{scorer.token_acc:.2f}", + "POS": f"{scorer.tags_acc:.2f}", + "UAS": f"{scorer.uas:.2f}", + "LAS": f"{scorer.las:.2f}", + "NER P": f"{scorer.ents_p:.2f}", + "NER R": f"{scorer.ents_r:.2f}", + "NER F": f"{scorer.ents_f:.2f}", + "Textcat": f"{scorer.textcat_score:.2f}", + "Sent P": f"{scorer.sent_p:.2f}", + "Sent R": f"{scorer.sent_r:.2f}", + "Sent F": f"{scorer.sent_f:.2f}", } msg.table(results, title="Results") if displacy_path: - docs, golds = zip(*dev_docs) + docs = [ex.doc for ex in dev_dataset] render_deps = "parser" in nlp.meta.get("pipeline", []) render_ents = "ner" in nlp.meta.get("pipeline", []) render_parses( @@ -76,7 +68,7 @@ def evaluate( deps=render_deps, ents=render_ents, ) - msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path) + msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) if return_scores: return scorer.scores diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 080d0dc77..23f766368 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,44 +1,39 @@ -# coding: utf8 -from __future__ import unicode_literals - -import plac import platform from pathlib import Path from wasabi import msg import srsly -from ..compat import path2str, 
basestring_, unicode_
+from .validate import get_model_pkgs
 from .. import util
 from .. import about
 
 
-@plac.annotations(
-    model=("Optional shortcut link of model", "positional", None, str),
-    markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
-    silent=("Don't print anything (just return)", "flag", "s"),
-)
-def info(model=None, markdown=False, silent=False):
+def info(
+    model: ("Optional model name", "positional", None, str) = None,
+    markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
+    silent: ("Don't print anything (just return)", "flag", "s") = False,
+):
     """
-    Print info about spaCy installation. If a model shortcut link is
-    speficied as an argument, print model information. Flag --markdown
-    prints details in Markdown for easy copy-pasting to GitHub issues.
+    Print info about spaCy installation. If a model is specified as an argument,
+    print model information. Flag --markdown prints details in Markdown for easy
+    copy-pasting to GitHub issues.
     """
     if model:
         if util.is_package(model):
             model_path = util.get_package_path(model)
         else:
-            model_path = util.get_data_path() / model
+            model_path = Path(model)
         meta_path = model_path / "meta.json"
         if not meta_path.is_file():
             msg.fail("Can't find model meta.json", meta_path, exits=1)
         meta = srsly.read_json(meta_path)
         if model_path.resolve() != model_path:
-            meta["link"] = path2str(model_path)
-            meta["source"] = path2str(model_path.resolve())
+            meta["link"] = str(model_path)
+            meta["source"] = str(model_path.resolve())
         else:
-            meta["source"] = path2str(model_path)
+            meta["source"] = str(model_path)
         if not silent:
-            title = "Info about model '{}'".format(model)
+            title = f"Info about model '{model}'"
             model_meta = {
                 k: v for k, v in meta.items() if k not in ("accuracy", "speed")
             }
@@ -47,12 +42,13 @@ def info(model=None, markdown=False, silent=False):
         else:
             msg.table(model_meta, title=title)
         return meta
+    all_models, _ = get_model_pkgs()
     data = {
         "spaCy version": about.__version__,
-        "Location": path2str(Path(__file__).parent.parent),
+        "Location": str(Path(__file__).parent.parent),
         "Platform": platform.platform(),
         "Python version": platform.python_version(),
-        "Models": list_models(),
+        "Models": ", ".join(model["name"] for model in all_models.values()),
     }
     if not silent:
         title = "Info about spaCy"
@@ -63,19 +59,6 @@ def info(model=None, markdown=False, silent=False):
     return data
 
 
-def list_models():
-    def exclude_dir(dir_name):
-        # exclude common cache directories and hidden directories
-        exclude = ("cache", "pycache", "__pycache__")
-        return dir_name in exclude or dir_name.startswith(".")
-
-    data_path = util.get_data_path()
-    if data_path:
-        models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
-        return ", ".join([m for m in models if not exclude_dir(m)])
-    return "-"
-
-
 def print_markdown(data, title=None):
     """Print data in GitHub-flavoured Markdown format for issues etc. 
@@ -84,9 +67,9 @@ def print_markdown(data, title=None): """ markdown = [] for key, value in data.items(): - if isinstance(value, basestring_) and Path(value).exists(): + if isinstance(value, str) and Path(value).exists(): continue - markdown.append("* **{}:** {}".format(key, unicode_(value))) + markdown.append(f"* **{key}:** {value}") if title: - print("\n## {}".format(title)) + print(f"\n## {title}") print("\n{}\n".format("\n".join(markdown))) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 3fa0cc890..babef106c 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - -import plac import math from tqdm import tqdm import numpy @@ -27,32 +23,18 @@ except ImportError: DEFAULT_OOV_PROB = -20 -@plac.annotations( - lang=("Model language", "positional", None, str), - output_dir=("Model output directory", "positional", None, Path), - freqs_loc=("Location of words frequencies file", "option", "f", Path), - jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path), - clusters_loc=("Optional location of brown clusters data", "option", "c", str), - vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str), - prune_vectors=("Optional number of vectors to prune to", "option", "V", int), - vectors_name=( - "Optional name for the word vectors, e.g. en_core_web_lg.vectors", - "option", - "vn", - str, - ), - model_name=("Optional name for the model meta", "option", "mn", str), -) def init_model( - lang, - output_dir, - freqs_loc=None, - clusters_loc=None, - jsonl_loc=None, - vectors_loc=None, - prune_vectors=-1, - vectors_name=None, - model_name=None, + # fmt: off + lang: ("Model language", "positional", None, str), + output_dir: ("Model output directory", "positional", None, Path), + freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None, + clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None, + jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None, + vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None, + prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1, + vectors_name: ("Optional name for the word vectors, e.g. 
en_core_web_lg.vectors", "option", "vn", str) = None, + model_name: ("Optional name for the model meta", "option", "mn", str) = None, + # fmt: on ): """ Create a new model from raw data, like word frequencies, Brown clusters @@ -91,8 +73,7 @@ def init_model( vec_added = len(nlp.vocab.vectors) lex_added = len(nlp.vocab) msg.good( - "Sucessfully compiled vocab", - "{} entries, {} vectors".format(lex_added, vec_added), + "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors", ) if not output_dir.exists(): output_dir.mkdir() @@ -177,9 +158,9 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None): nlp.vocab.vectors.add(lex.orth, row=lex.rank) else: if vectors_loc: - with msg.loading("Reading vectors from {}".format(vectors_loc)): + with msg.loading(f"Reading vectors from {vectors_loc}"): vectors_data, vector_keys = read_vectors(vectors_loc) - msg.good("Loaded vectors from {}".format(vectors_loc)) + msg.good(f"Loaded vectors from {vectors_loc}") else: vectors_data, vector_keys = (None, None) if vector_keys is not None: @@ -190,7 +171,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None): if vectors_data is not None: nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) if name is None: - nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"] + nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors" else: nlp.vocab.vectors.name = name nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name @@ -236,7 +217,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): word = literal_eval(key) except SyntaxError: # Take odd strings literally. - word = literal_eval("'%s'" % key) + word = literal_eval(f"'{key}'") smooth_count = counts.smoother(int(freq)) probs[word] = math.log(smooth_count) - log_total oov_prob = math.log(counts.smoother(0)) - log_total diff --git a/spacy/cli/link.py b/spacy/cli/link.py deleted file mode 100644 index 8117829b5..000000000 --- a/spacy/cli/link.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import plac -from pathlib import Path -from wasabi import msg - -from ..compat import symlink_to, path2str -from .. import util - - -@plac.annotations( - origin=("package name or local path to model", "positional", None, str), - link_name=("name of shortuct link to create", "positional", None, str), - force=("force overwriting of existing link", "flag", "f", bool), -) -def link(origin, link_name, force=False, model_path=None): - """ - Create a symlink for models within the spacy/data directory. Accepts - either the name of a pip package, or the local path to the model data - directory. Linking models allows loading them via spacy.load(link_name). - """ - if util.is_package(origin): - model_path = util.get_package_path(origin) - else: - model_path = Path(origin) if model_path is None else Path(model_path) - if not model_path.exists(): - msg.fail( - "Can't locate model data", - "The data should be located in {}".format(path2str(model_path)), - exits=1, - ) - data_path = util.get_data_path() - if not data_path or not data_path.exists(): - spacy_loc = Path(__file__).parent.parent - msg.fail( - "Can't find the spaCy data path to create model symlink", - "Make sure a directory `/data` exists within your spaCy " - "installation and try again. 
The data directory should be located " - "here:".format(path=spacy_loc), - exits=1, - ) - link_path = util.get_data_path() / link_name - if link_path.is_symlink() and not force: - msg.fail( - "Link '{}' already exists".format(link_name), - "To overwrite an existing link, use the --force flag", - exits=1, - ) - elif link_path.is_symlink(): # does a symlink exist? - # NB: It's important to check for is_symlink here and not for exists, - # because invalid/outdated symlinks would return False otherwise. - link_path.unlink() - elif link_path.exists(): # does it exist otherwise? - # NB: Check this last because valid symlinks also "exist". - msg.fail( - "Can't overwrite symlink '{}'".format(link_name), - "This can happen if your data directory contains a directory or " - "file of the same name.", - exits=1, - ) - details = "%s --> %s" % (path2str(model_path), path2str(link_path)) - try: - symlink_to(link_path, model_path) - except: # noqa: E722 - # This is quite dirty, but just making sure other errors are caught. - msg.fail( - "Couldn't link model to '{}'".format(link_name), - "Creating a symlink in spacy/data failed. Make sure you have the " - "required permissions and try re-running the command as admin, or " - "use a virtualenv. You can still import the model as a module and " - "call its load() method, or create the symlink manually.", - ) - msg.text(details) - raise - msg.good("Linking successful", details) - msg.text("You can now load the model via spacy.load('{}')".format(link_name)) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8ed92259c..8e27e44d0 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,25 +1,21 @@ -# coding: utf8 -from __future__ import unicode_literals - -import plac import shutil from pathlib import Path from wasabi import msg, get_raw_input import srsly -from ..compat import path2str from .. import util from .. import about -@plac.annotations( - input_dir=("Directory with model data", "positional", None, str), - output_dir=("Output parent directory", "positional", None, str), - meta_path=("Path to meta.json", "option", "m", str), - create_meta=("Create meta.json, even if one exists", "flag", "c", bool), - force=("Force overwriting existing model in output directory", "flag", "f", bool), -) -def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False): +def package( + # fmt: off + input_dir: ("Directory with model data", "positional", None, str), + output_dir: ("Output parent directory", "positional", None, str), + meta_path: ("Path to meta.json", "option", "m", str) = None, + create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False, + force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False, + # fmt: on +): """ Generate Python package for model data, including meta and required installation files. 
A new directory will be created in the specified @@ -47,7 +43,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals for key in ("lang", "name", "version"): if key not in meta or meta[key] == "": msg.fail( - "No '{}' setting found in meta.json".format(key), + f"No '{key}' setting found in meta.json", "This setting is required to build your package.", exits=1, ) @@ -58,22 +54,21 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals if package_path.exists(): if force: - shutil.rmtree(path2str(package_path)) + shutil.rmtree(str(package_path)) else: msg.fail( "Package directory already exists", "Please delete the directory and try again, or use the " - "`--force` flag to overwrite existing " - "directories.".format(path=path2str(package_path)), + "`--force` flag to overwrite existing directories.", exits=1, ) Path.mkdir(package_path, parents=True) - shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) + shutil.copytree(str(input_path), str(package_path / model_name_v)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(package_path / "__init__.py", TEMPLATE_INIT) - msg.good("Successfully created package '{}'".format(model_name_v), main_path) + msg.good(f"Successfully created package '{model_name_v}'", main_path) msg.text("To build the package, run `python setup.py sdist` in this directory.") @@ -88,7 +83,7 @@ def generate_meta(model_path, existing_meta, msg): ("lang", "Model language", meta.get("lang", "en")), ("name", "Model name", meta.get("name", "model")), ("version", "Model version", meta.get("version", "0.0.0")), - ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__), + ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"), ("description", "Model description", meta.get("description", False)), ("author", "Author", meta.get("author", False)), ("email", "Author email", meta.get("email", False)), @@ -118,9 +113,6 @@ def generate_meta(model_path, existing_meta, msg): TEMPLATE_SETUP = """ #!/usr/bin/env python -# coding: utf8 -from __future__ import unicode_literals - import io import json from os import path, walk @@ -190,9 +182,6 @@ include meta.json TEMPLATE_INIT = """ -# coding: utf8 -from __future__ import unicode_literals - from pathlib import Path from spacy.util import load_model_from_init_py, get_model_meta diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index c1aade2b2..690e3107d 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -1,107 +1,50 @@ -# coding: utf8 -from __future__ import print_function, unicode_literals - -import plac import random import numpy import time import re from collections import Counter from pathlib import Path -from thinc.v2v import Affine, Maxout -from thinc.misc import LayerNorm as LN -from thinc.neural.util import prefer_gpu +from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu +from thinc.api import CosineDistance, L2Distance from wasabi import msg import srsly +from ..gold import Example from ..errors import Errors from ..tokens import Doc from ..attrs import ID, HEAD -from .._ml import Tok2Vec, flatten, chain, create_default_optimizer -from .._ml import masked_language_model, get_cossim_loss +from ..ml.component_models import Tok2Vec +from ..ml.component_models import masked_language_model from .. 
import util +from ..util import create_default_optimizer from .train import _load_pretrained_tok2vec -@plac.annotations( - texts_loc=( - "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the " - "key 'tokens'", - "positional", - None, - str, - ), - vectors_model=("Name or path to spaCy model with vectors to learn from"), - output_dir=("Directory to write models to on each epoch", "positional", None, str), - width=("Width of CNN layers", "option", "cw", int), - depth=("Depth of CNN layers", "option", "cd", int), - cnn_window=("Window size for CNN layers", "option", "cW", int), - cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int), - use_chars=("Whether to use character-based embedding", "flag", "chr", bool), - sa_depth=("Depth of self-attention layers", "option", "sa", int), - bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int), - embed_rows=("Number of embedding rows", "option", "er", int), - loss_func=( - "Loss function to use for the objective. Either 'L2' or 'cosine'", - "option", - "L", - str, - ), - use_vectors=("Whether to use the static vectors as input features", "flag", "uv"), - dropout=("Dropout rate", "option", "d", float), - batch_size=("Number of words per training batch", "option", "bs", int), - max_length=( - "Max words per example. Longer examples are discarded", - "option", - "xw", - int, - ), - min_length=( - "Min words per example. Shorter examples are discarded", - "option", - "nw", - int, - ), - seed=("Seed for random number generators", "option", "s", int), - n_iter=("Number of iterations to pretrain", "option", "i", int), - n_save_every=("Save model every X batches.", "option", "se", int), - init_tok2vec=( - "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", - "option", - "t2v", - Path, - ), - epoch_start=( - "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been " - "renamed. Prevents unintended overwriting of existing weight files.", - "option", - "es", - int, - ), -) def pretrain( - texts_loc, - vectors_model, - output_dir, - width=96, - depth=4, - bilstm_depth=0, - cnn_pieces=3, - sa_depth=0, - use_chars=False, - cnn_window=1, - embed_rows=2000, - loss_func="cosine", - use_vectors=False, - dropout=0.2, - n_iter=1000, - batch_size=3000, - max_length=500, - min_length=5, - seed=0, - n_save_every=None, - init_tok2vec=None, - epoch_start=None, + # fmt: off + texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), + vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str), + output_dir: ("Directory to write models to on each epoch", "positional", None, str), + width: ("Width of CNN layers", "option", "cw", int) = 96, + conv_depth: ("Depth of CNN layers", "option", "cd", int) = 4, + bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0, + cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3, + sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0, + use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False, + cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1, + embed_rows: ("Number of embedding rows", "option", "er", int) = 2000, + loss_func: ("Loss function to use for the objective. 
Either 'L2' or 'cosine'", "option", "L", str) = "cosine", + use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False, + dropout: ("Dropout rate", "option", "d", float) = 0.2, + n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000, + batch_size: ("Number of words per training batch", "option", "bs", int) = 3000, + max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500, + min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5, + seed: ("Seed for random number generators", "option", "s", int) = 0, + n_save_every: ("Save model every X batches.", "option", "se", int) = None, + init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, + epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None, + # fmt: on ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -132,9 +75,15 @@ def pretrain( msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) + if output_dir.exists() and [p for p in output_dir.iterdir()]: + msg.warn( + "Output directory is not empty", + "It is better to use an empty directory or a new output path; " + "the new directory will then be created for you.", + ) if not output_dir.exists(): output_dir.mkdir() - msg.good("Created output directory") + msg.good(f"Created output directory: {output_dir}") srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") @@ -153,16 +102,16 @@ def pretrain( msg.text("Reading input text from stdin...") texts = srsly.read_jsonl("-") - with msg.loading("Loading model '{}'...".format(vectors_model)): + with msg.loading(f"Loading model '{vectors_model}'..."): nlp = util.load_model(vectors_model) - msg.good("Loaded model '{}'".format(vectors_model)) - pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name + msg.good(f"Loaded model '{vectors_model}'") + pretrained_vectors = None if not use_vectors else nlp.vocab.vectors model = create_pretraining_model( nlp, Tok2Vec( width, embed_rows, - conv_depth=depth, + conv_depth=conv_depth, pretrained_vectors=pretrained_vectors, bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. subword_features=not use_chars, # Set to False for Chinese etc @@ -172,7 +121,7 @@ def pretrain( # Load in pretrained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text("Loaded pretrained tok2vec for: {}".format(components)) + msg.text(f"Loaded pretrained tok2vec for: {components}") # Parse the epoch number from the given weight file model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) if model_name: @@ -181,32 +130,28 @@ def pretrain( else: if not epoch_start: msg.fail( - "You have to use the '--epoch-start' argument when using a renamed weight file for " - "'--init-tok2vec'", + "You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec", exits=True, ) elif epoch_start < 0: msg.fail( - "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid" - % epoch_start, + f"The argument --epoch-start has to be greater than or equal to 0. 
{epoch_start} is invalid", exits=True, ) else: # Without '--init-tok2vec' the '--epoch-start' argument is ignored epoch_start = 0 - optimizer = create_default_optimizer(model.ops) + optimizer = create_default_optimizer() tracker = ProgressTracker(frequency=10000) - msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start) + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) def _save_model(epoch, is_temp=False): is_temp_str = ".temp" if is_temp else "" with model.use_params(optimizer.averages): - with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open( - "wb" - ) as file_: + with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: file_.write(model.tok2vec.to_bytes()) log = { "nr_word": tracker.nr_word, @@ -220,7 +165,9 @@ def pretrain( skip_counter = 0 for epoch in range(epoch_start, n_iter + epoch_start): for batch_id, batch in enumerate( - util.minibatch_by_words(((text, None) for text in texts), size=batch_size) + util.minibatch_by_words( + (Example(doc=text) for text in texts), size=batch_size + ) ): docs, count = make_docs( nlp, @@ -245,7 +192,7 @@ def pretrain( # Reshuffle the texts if texts were loaded from a file random.shuffle(texts) if skip_counter > 0: - msg.warn("Skipped {count} empty values".format(count=str(skip_counter))) + msg.warn(f"Skipped {skip_counter} empty values") msg.good("Successfully finished pretrain") @@ -310,13 +257,14 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): # and look them up all at once. This prevents data copying. ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] + # TODO: this code originally didn't normalize, but shouldn't it use normalize=True? if objective == "L2": - d_target = prediction - target - loss = (d_target ** 2).sum() + distance = L2Distance(normalize=False) elif objective == "cosine": - loss, d_target = get_cossim_loss(prediction, target) + distance = CosineDistance(normalize=False) else: raise ValueError(Errors.E142.format(loss_func=objective)) + d_target, loss = distance(prediction, target) return loss, d_target @@ -328,18 +276,18 @@ def create_pretraining_model(nlp, tok2vec): """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( - LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0) + Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size) ) # This is annoying, but the parser etc have the flatten step after # the tok2vec. To load the weights in cleanly, we need to match # the shape of the models' components exactly. So what we call # "tok2vec" has to be the same set of processes as what the components do. 
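The change below replaces thinc v7's `flatten` helper and ad-hoc attribute assignment with thinc v8 idioms: a `list2array()` layer composed via `chain()`, named sublayer references via `set_ref()`, and shape inference via `initialize()`. A minimal sketch of those idioms, assuming thinc v8's released API (where maxout pieces are passed as `nP`); all dimensions here are illustrative, not the patch's real values:

from thinc.api import Linear, Maxout, chain

# Compose two layers; unset input dims are inferred during initialize().
model = chain(Maxout(nO=300, nP=3, normalize=True), Linear(nO=96))
# Register a named reference to a sublayer, replacing ad-hoc attributes
# like `model.output_layer = ...`.
model.set_ref("output_layer", model.layers[-1])
# Infer the missing dimensions from a sample input, which stands in for
# the old begin_training() call.
model.initialize(X=model.ops.alloc2f(4, 128))
assert model.get_ref("output_layer").get_dim("nI") == 300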
- tok2vec = chain(tok2vec, flatten) + tok2vec = chain(tok2vec, list2array()) model = chain(tok2vec, output_layer) model = masked_language_model(nlp.vocab, model) - model.tok2vec = tok2vec - model.output_layer = output_layer - model.begin_training([nlp.make_doc("Give it a doc to infer shapes")]) + model.set_ref("tok2vec", tok2vec) + model.set_ref("output_layer", output_layer) + model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) return model diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 4ee72fc23..5b7a02212 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals, division, print_function - -import plac import tqdm from pathlib import Path import srsly @@ -9,18 +5,19 @@ import cProfile import pstats import sys import itertools -import thinc.extra.datasets +import ml_datasets from wasabi import msg from ..util import load_model -@plac.annotations( - model=("Model to load", "positional", None, str), - inputs=("Location of input file. '-' for stdin.", "positional", None, str), - n_texts=("Maximum number of texts to use if available", "option", "n", int), -) -def profile(model, inputs=None, n_texts=10000): +def profile( + # fmt: off + model: ("Model to load", "positional", None, str), + inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None, + n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000, + # fmt: on +): """ Profile a spaCy pipeline, to find out which functions take the most time. Input should be formatted as one JSON object per line with a key "text". @@ -32,13 +29,13 @@ def profile(model, inputs=None, n_texts=10000): if inputs is None: n_inputs = 25000 with msg.loading("Loading IMDB dataset via Thinc..."): - imdb_train, _ = thinc.extra.datasets.imdb() + imdb_train, _ = ml_datasets.imdb() inputs, _ = zip(*imdb_train) - msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs)) + msg.info(f"Loaded IMDB dataset and using {n_inputs} examples") inputs = inputs[:n_inputs] - with msg.loading("Loading model '{}'...".format(model)): + with msg.loading(f"Loading model '{model}'..."): nlp = load_model(model) - msg.good("Loaded model '{}'".format(model)) + msg.good(f"Loaded model '{model}'") texts = list(itertools.islice(inputs, n_texts)) cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") s = pstats.Stats("Profile.prof") @@ -60,7 +57,7 @@ def _read_inputs(loc, msg): input_path = Path(loc) if not input_path.exists() or not input_path.is_file(): msg.fail("Not a valid input data file", loc, exits=1) - msg.info("Using data from {}".format(input_path.parts[-1])) + msg.info(f"Using data from {input_path.parts[-1]}") file_ = input_path.open() for line in file_: data = srsly.json_loads(line) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 0a9285863..d8514095b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,11 +1,7 @@ -# coding: utf8 -from __future__ import unicode_literals, division, print_function - -import plac import os import tqdm from pathlib import Path -from thinc.neural._classes.model import Model +from thinc.api import use_ops from timeit import default_timer as timer import shutil import srsly @@ -13,76 +9,53 @@ from wasabi import msg import contextlib import random -from .._ml import create_default_optimizer +from ..util import create_default_optimizer from ..util import use_gpu as set_gpu from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import 
GoldCorpus -from ..compat import path2str from .. import util from .. import about -@plac.annotations( - # fmt: off - lang=("Model language", "positional", None, str), - output_path=("Output directory to store model in", "positional", None, Path), - train_path=("Location of JSON-formatted training data", "positional", None, Path), - dev_path=("Location of JSON-formatted development data", "positional", None, Path), - raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), - base_model=("Name of model to update (optional)", "option", "b", str), - pipeline=("Comma-separated names of pipeline components", "option", "p", str), - replace_components=("Replace components from base model", "flag", "R", bool), - vectors=("Model to load vectors from", "option", "v", str), - n_iter=("Number of iterations", "option", "n", int), - n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int), - n_examples=("Number of examples", "option", "ns", int), - use_gpu=("Use GPU", "option", "g", int), - version=("Model version", "option", "V", str), - meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), - init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path), - parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str), - entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str), - noise_level=("Amount of corruption for data augmentation", "option", "nl", float), - orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float), - eval_beam_widths=("Beam widths to evaluate, e.g. 
4,8", "option", "bw", str), - gold_preproc=("Use gold preprocessing", "flag", "G", bool), - learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool), - textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool), - textcat_arch=("Textcat model architecture", "option", "ta", str), - textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), - verbose=("Display more information for debug", "flag", "VV", bool), - debug=("Run data diagnostics before training", "flag", "D", bool), - # fmt: on -) def train( - lang, - output_path, - train_path, - dev_path, - raw_text=None, - base_model=None, - pipeline="tagger,parser,ner", - replace_components=False, - vectors=None, - n_iter=30, - n_early_stopping=None, - n_examples=0, - use_gpu=-1, - version="0.0.0", - meta_path=None, - init_tok2vec=None, - parser_multitasks="", - entity_multitasks="", - noise_level=0.0, - orth_variant_level=0.0, - eval_beam_widths="", - gold_preproc=False, - learn_tokens=False, - textcat_multilabel=False, - textcat_arch="bow", - textcat_positive_label=None, - verbose=False, - debug=False, + # fmt: off + lang: ("Model language", "positional", None, str), + output_path: ("Output directory to store model in", "positional", None, Path), + train_path: ("Location of JSON-formatted training data", "positional", None, Path), + dev_path: ("Location of JSON-formatted development data", "positional", None, Path), + raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, + base_model: ("Name of model to update (optional)", "option", "b", str) = None, + pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner", + vectors: ("Model to load vectors from", "option", "v", str) = None, + replace_components: ("Replace components from base model", "flag", "R", bool) = False, + width: ("Width of CNN layers of Tok2Vec component", "option", "cw", int) = 96, + conv_depth: ("Depth of CNN layers of Tok2Vec component", "option", "cd", int) = 4, + cnn_window: ("Window size for CNN layers of Tok2Vec component", "option", "cW", int) = 1, + cnn_pieces: ("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int) = 3, + use_chars: ("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool) = False, + bilstm_depth: ("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int) = 0, + embed_rows: ("Number of embedding rows of Tok2Vec component", "option", "er", int) = 2000, + n_iter: ("Number of iterations", "option", "n", int) = 30, + n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None, + n_examples: ("Number of examples", "option", "ns", int) = 0, + use_gpu: ("Use GPU", "option", "g", int) = -1, + version: ("Model version", "option", "V", str) = "0.0.0", + meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None, + init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, + parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "", + entity_multitasks: ("Side objectives for NER CNN, e.g. 
'dep' or 'dep,tag'", "option", "et", str) = "", + noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0, + orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0, + eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "", + gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, + learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False, + textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False, + textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow", + textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None, + tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, + verbose: ("Display more information for debug", "flag", "VV", bool) = False, + debug: ("Run data diagnostics before training", "flag", "D", bool) = False, + # fmt: on ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's @@ -116,7 +89,11 @@ def train( ) if not output_path.exists(): output_path.mkdir() + msg.good(f"Created output directory: {output_path}") + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly @@ -145,28 +122,29 @@ def train( # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. pipeline = [p.strip() for p in pipeline.split(",")] + msg.text(f"Training pipeline: {pipeline}") disabled_pipes = None pipes_added = False - msg.text("Training pipeline: {}".format(pipeline)) + msg.text(f"Training pipeline: {pipeline}") if use_gpu >= 0: activated_gpu = None try: activated_gpu = set_gpu(use_gpu) except Exception as e: - msg.warn("Exception: {}".format(e)) + msg.warn(f"Exception: {e}") if activated_gpu is not None: - msg.text("Using GPU: {}".format(use_gpu)) + msg.text(f"Using GPU: {use_gpu}") else: - msg.warn("Unable to activate GPU: {}".format(use_gpu)) + msg.warn(f"Unable to activate GPU: {use_gpu}") msg.text("Using CPU only") use_gpu = -1 if base_model: - msg.text("Starting with base model '{}'".format(base_model)) + msg.text(f"Starting with base model '{base_model}'") nlp = util.load_model(base_model) if nlp.lang != lang: msg.fail( - "Model language ('{}') doesn't match language specified as " - "`lang` argument ('{}') ".format(nlp.lang, lang), + f"Model language ('{nlp.lang}') doesn't match language " + f"specified as `lang` argument ('{lang}') ", exits=1, ) for pipe in pipeline: @@ -180,11 +158,11 @@ def train( "positive_label": textcat_positive_label, } if pipe not in nlp.pipe_names: - msg.text("Adding component to base model '{}'".format(pipe)) + msg.text(f"Adding component to base model '{pipe}'") nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) pipes_added = True elif replace_components: - msg.text("Replacing component from base model '{}'".format(pipe)) + msg.text(f"Replacing component from base model '{pipe}'") nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg)) pipes_added = True else: @@ -197,17 +175,17 @@ def train( } if base_cfg != pipe_cfg: msg.fail( - "The base textcat model configuration does" - "not match the provided training 
options. " - "Existing cfg: {}, provided cfg: {}".format( - base_cfg, pipe_cfg - ), + f"The base textcat model configuration does " + f"not match the provided training options. " + f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}", exits=1, ) - msg.text("Extending component from base model '{}'".format(pipe)) - disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline]) + msg.text(f"Extending component from base model '{pipe}'") + disabled_pipes = nlp.disable_pipes( + [p for p in nlp.pipe_names if p not in pipeline] + ) else: - msg.text("Starting with blank model '{}'".format(lang)) + msg.text(f"Starting with blank model '{lang}'") lang_cls = util.get_lang_class(lang) nlp = lang_cls() for pipe in pipeline: @@ -223,8 +201,11 @@ def train( pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) + if vectors: - msg.text("Loading vector from model '{}'".format(vectors)) + msg.text(f"Loading vectors from model '{vectors}'") _load_vectors(nlp, vectors) # Multitask objectives @@ -233,49 +214,56 @@ def train( if multitasks: if pipe_name not in pipeline: msg.fail( - "Can't use multitask objective without '{}' in the " - "pipeline".format(pipe_name) + f"Can't use multitask objective without '{pipe_name}' in " + f"the pipeline" ) pipe = nlp.get_pipe(pipe_name) for objective in multitasks.split(","): pipe.add_multitask_objective(objective) # Prepare training corpus - msg.text("Counting training words (limit={})".format(n_examples)) + msg.text(f"Counting training words (limit={n_examples})") corpus = GoldCorpus(train_path, dev_path, limit=n_examples) n_train_words = corpus.count_train() if base_model and not pipes_added: # Start with an existing model, use default optimizer - optimizer = create_default_optimizer(Model.ops) + optimizer = create_default_optimizer() else: # Start with a blank model, call begin_training - optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) - + cfg = {"device": use_gpu} + cfg["conv_depth"] = conv_depth + cfg["token_vector_width"] = width + cfg["bilstm_depth"] = bilstm_depth + cfg["cnn_maxout_pieces"] = cnn_pieces + cfg["embed_size"] = embed_rows + cfg["conv_window"] = cnn_window + cfg["subword_features"] = not use_chars + optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg) nlp._optimizer = None # Load in pretrained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text("Loaded pretrained tok2vec for: {}".format(components)) + msg.text(f"Loaded pretrained tok2vec for: {components}") # Verify textcat config if "textcat" in pipeline: textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) if textcat_positive_label and textcat_positive_label not in textcat_labels: msg.fail( - "The textcat_positive_label (tpl) '{}' does not match any " - "label in the training data.".format(textcat_positive_label), + f"The textcat_positive_label (tpl) '{textcat_positive_label}' " + f"does not match any label in the training data.", exits=1, ) if textcat_positive_label and len(textcat_labels) != 2: msg.fail( - "A textcat_positive_label (tpl) '{}' was provided for training " - "data that does not appear to be a binary classification " - "problem with two labels.".format(textcat_positive_label), + f"A textcat_positive_label (tpl) '{textcat_positive_label}' was " + "provided for training data that does not appear to be a " + "binary classification problem with two labels.", exits=1, 
) - train_docs = corpus.train_docs( + train_data = corpus.train_data( nlp, noise_level=noise_level, gold_preproc=gold_preproc, @@ -285,9 +273,9 @@ def train( train_labels = set() if textcat_multilabel: multilabel_found = False - for text, gold in train_docs: - train_labels.update(gold.cats.keys()) - if list(gold.cats.values()).count(1.0) != 1: + for ex in train_data: + train_labels.update(ex.gold.cats.keys()) + if list(ex.gold.cats.values()).count(1.0) != 1: multilabel_found = True if not multilabel_found and not base_model: msg.warn( @@ -297,9 +285,9 @@ def train( "mutually-exclusive classes." ) if not textcat_multilabel: - for text, gold in train_docs: - train_labels.update(gold.cats.keys()) - if list(gold.cats.values()).count(1.0) != 1 and not base_model: + for ex in train_data: + train_labels.update(ex.gold.cats.keys()) + if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model: msg.warn( "Some textcat training instances do not have exactly " "one positive label. Modifying training options to " @@ -311,20 +299,20 @@ def train( break if base_model and set(textcat_labels) != train_labels: msg.fail( - "Cannot extend textcat model using data with different " - "labels. Base model labels: {}, training data labels: " - "{}.".format(textcat_labels, list(train_labels)), + f"Cannot extend textcat model using data with different " + f"labels. Base model labels: {textcat_labels}, training data " + f"labels: {list(train_labels)}", exits=1, ) if textcat_multilabel: msg.text( - "Textcat evaluation score: ROC AUC score macro-averaged across " - "the labels '{}'".format(", ".join(textcat_labels)) + f"Textcat evaluation score: ROC AUC score macro-averaged across " + f"the labels '{', '.join(textcat_labels)}'" ) elif textcat_positive_label and len(textcat_labels) == 2: msg.text( - "Textcat evaluation score: F1-score for the " - "label '{}'".format(textcat_positive_label) + f"Textcat evaluation score: F1-score for the " + f"label '{textcat_positive_label}'" ) elif len(textcat_labels) > 1: if len(textcat_labels) == 2: @@ -334,8 +322,8 @@ def train( "an evaluation on the positive class." ) msg.text( - "Textcat evaluation score: F1-score macro-averaged across " - "the labels '{}'".format(", ".join(textcat_labels)) + f"Textcat evaluation score: F1-score macro-averaged across " + f"the labels '{', '.join(textcat_labels)}'" ) else: msg.fail( @@ -355,7 +343,7 @@ def train( iter_since_best = 0 best_score = 0.0 for i in range(n_iter): - train_docs = corpus.train_docs( + train_data = corpus.train_dataset( nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, @@ -371,73 +359,82 @@ def train( words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} - for batch in util.minibatch_by_words(train_docs, size=batch_sizes): + for batch in util.minibatch_by_words(train_data, size=batch_sizes): if not batch: continue docs, golds = zip(*batch) - nlp.update( - docs, - golds, - sgd=optimizer, - drop=next(dropout_rates), - losses=losses, - ) + try: + nlp.update( + docs, + golds, + sgd=optimizer, + drop=next(dropout_rates), + losses=losses, + ) + except ValueError as e: + msg.warn("Error during training") + if init_tok2vec: + msg.warn( + "Did you provide the same parameters during 'train' as during 'pretrain'?" + ) + msg.fail(f"Original error message: {e}", exits=1) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. 
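For context, a 'rehearsal' update trains the model against its own initial predictions on raw text, pulling the weights back toward the base model so that updates on the new annotations are less likely to cause catastrophic forgetting. A rough sketch of the call pattern, mirroring the usage in this hunk (the model name is hypothetical, and `resume_training` is assumed to set up the rehearsal targets, as in spaCy v2.x):

import spacy

nlp = spacy.load("en_core_web_sm")  # hypothetical base model
optimizer = nlp.resume_training()   # snapshot current weights as rehearsal target
losses = {}
raw_batch = [nlp.make_doc(text) for text in ["Raw text one.", "Raw text two."]]
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)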
raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) + docs = [ex.doc for ex in batch] if not int(os.environ.get("LOG_FRIENDLY", 0)): pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): util.set_env_log(False) - epoch_model_path = output_path / ("model%d" % i) + epoch_model_path = output_path / f"model{i}" nlp.to_disk(epoch_model_path) nlp_loaded = util.load_model_from_path(epoch_model_path) for beam_width in eval_beam_widths: for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width - dev_docs = list( - corpus.dev_docs( + dev_dataset = list( + corpus.dev_dataset( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, ) ) - nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) + nwords = sum(len(ex.doc) for ex in dev_dataset) start_time = timer() - scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) + scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) end_time = timer() if use_gpu < 0: gpu_wps = None cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) - with Model.use_device("cpu"): + with use_ops("numpy"): nlp_loaded = util.load_model_from_path(epoch_model_path) for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width - dev_docs = list( - corpus.dev_docs( + dev_dataset = list( + corpus.dev_dataset( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, ) ) start_time = timer() - scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) + scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) end_time = timer() cpu_wps = nwords / (end_time - start_time) - acc_loc = output_path / ("model%d" % i) / "accuracy.json" + acc_loc = output_path / f"model{i}" / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names - meta["spacy_version"] = ">=%s" % about.__version__ + meta["spacy_version"] = f">={about.__version__}" if beam_width == 1: meta["speed"] = { "nwords": nwords, @@ -465,10 +462,10 @@ def train( "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } - meta.setdefault("name", "model%d" % i) + meta.setdefault("name", f"model{i}") meta.setdefault("version", version) meta["labels"] = nlp.meta["labels"] - meta_loc = output_path / ("model%d" % i) / "meta.json" + meta_loc = output_path / f"model{i}" / "meta.json" srsly.write_json(meta_loc, meta) util.set_env_log(verbose) @@ -486,8 +483,8 @@ def train( for cat, cat_score in textcats_per_cat.items(): if cat_score.get("roc_auc_score", 0) < 0: msg.warn( - "Textcat ROC AUC score is undefined due to " - "only one value in label '{}'.".format(cat) + f"Textcat ROC AUC score is undefined due to " + f"only one value in label '{cat}'." ) msg.row(progress, **row_settings) # Early stopping @@ -500,14 +497,14 @@ def train( best_score = current_score if iter_since_best >= n_early_stopping: msg.text( - "Early stopping, best iteration " - "is: {}".format(i - iter_since_best) + f"Early stopping, best iteration is: {i - iter_since_best}" ) msg.text( - "Best score = {}; Final iteration " - "score = {}".format(best_score, current_score) + f"Best score = {best_score}; Final iteration score = {current_score}" ) break + except Exception as e: + msg.warn(f"Aborting and saving final best model. 
Encountered exception: {e}") finally: best_pipes = nlp.pipe_names if disabled_pipes: @@ -535,6 +532,8 @@ def _score_for_model(meta): mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3) if "textcat" in pipes: mean_acc.append(acc["textcat_score"]) + if "sentrec" in pipes: + mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3) return sum(mean_acc) / len(mean_acc) @@ -580,12 +579,10 @@ def _collate_best_model(meta, output_path, components): for component in components: bests[component] = _find_best(output_path, component) best_dest = output_path / "model-best" - shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest)) + shutil.copytree(str(output_path / "model-final"), str(best_dest)) for component, best_component_src in bests.items(): - shutil.rmtree(path2str(best_dest / component)) - shutil.copytree( - path2str(best_component_src / component), path2str(best_dest / component) - ) + shutil.rmtree(str(best_dest / component)) + shutil.copytree(str(best_component_src / component), str(best_dest / component)) accs = srsly.read_json(best_component_src / "accuracy.json") for metric in _get_metrics(component): meta["accuracy"][metric] = accs[metric] @@ -608,11 +605,13 @@ def _find_best(experiment_dir, component): def _get_metrics(component): if component == "parser": - return ("las", "uas", "las_per_type", "token_acc") + return ("las", "uas", "las_per_type", "token_acc", "sent_f") elif component == "tagger": return ("tags_acc",) elif component == "ner": return ("ents_f", "ents_p", "ents_r", "ents_per_type") + elif component == "sentrec": + return ("sent_f", "sent_p", "sent_r") elif component == "textcat": return ("textcat_score",) return ("token_acc",) @@ -626,14 +625,21 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths): row_head.extend(["Tag Loss ", " Tag % "]) output_stats.extend(["tag_loss", "tags_acc"]) elif pipe == "parser": - row_head.extend(["Dep Loss ", " UAS ", " LAS "]) - output_stats.extend(["dep_loss", "uas", "las"]) + row_head.extend( + ["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"] + ) + output_stats.extend( + ["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"] + ) elif pipe == "ner": row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "]) output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"]) elif pipe == "textcat": row_head.extend(["Textcat Loss", "Textcat"]) output_stats.extend(["textcat_loss", "textcat_score"]) + elif pipe == "sentrec": + row_head.extend(["Sentrec Loss", "Sent P", "Sent R", "Sent F"]) + output_stats.extend(["sentrec_loss", "sent_p", "sent_r", "sent_f"]) row_head.extend(["Token %", "CPU WPS"]) output_stats.extend(["token_acc", "cpu_wps"]) @@ -643,7 +649,10 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths): if has_beam_widths: row_head.insert(1, "Beam W.") + # Remove duplicates while preserving order + row_head = list(dict.fromkeys(row_head)) + output_stats = list(dict.fromkeys(output_stats)) return row_head, output_stats def _get_progress( @@ -656,6 +665,7 @@ def _get_progress( scores["ner_loss"] = losses.get("ner", 0.0) scores["tag_loss"] = losses.get("tagger", 0.0) scores["textcat_loss"] = losses.get("textcat", 0.0) + scores["sentrec_loss"] = losses.get("sentrec", 0.0) scores["cpu_wps"] = cpu_wps scores["gpu_wps"] = gpu_wps or 0.0 scores.update(dev_scores) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py new file mode 
100644 index 000000000..9150da356 --- /dev/null +++ b/spacy/cli/train_from_config.py @@ -0,0 +1,439 @@ +from typing import Optional, Dict, List, Union, Sequence +import plac +from wasabi import msg +from pathlib import Path +import thinc +import thinc.schedules +from thinc.api import Model +from pydantic import BaseModel, FilePath, StrictInt +import tqdm + +# TODO: relative imports? +import spacy +from spacy.gold import GoldCorpus +from spacy.pipeline.tok2vec import Tok2VecListener +from spacy.ml import component_models +from spacy import util + + +registry = util.registry + +CONFIG_STR = """ +[training] +patience = 10 +eval_frequency = 10 +dropout = 0.2 +init_tok2vec = null +vectors = null +max_epochs = 100 +orth_variant_level = 0.0 +gold_preproc = false +max_length = 0 +use_gpu = 0 +scores = ["ents_p", "ents_r", "ents_f"] +score_weights = {"ents_f": 1.0} +limit = 0 + +[training.batch_size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "transition_based_ner.v1" +nr_feature_tokens = 3 +hidden_width = 64 +maxout_pieces = 3 + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "hash_embed_cnn.v1" +pretrained_vectors = ${nlp:vectors} +width = 128 +depth = 4 +window_size = 1 +embed_size = 10000 +maxout_pieces = 3 +""" + + +class PipelineComponent(BaseModel): + factory: str + model: Model + + class Config: + arbitrary_types_allowed = True + + +class ConfigSchema(BaseModel): + optimizer: Optional["Optimizer"] + + class training(BaseModel): + patience: int = 10 + eval_frequency: int = 100 + dropout: float = 0.2 + init_tok2vec: Optional[FilePath] = None + vectors: Optional[str] = None + max_epochs: int = 100 + orth_variant_level: float = 0.0 + gold_preproc: bool = False + max_length: int = 0 + use_gpu: int = 0 + scores: List[str] = ["ents_p", "ents_r", "ents_f"] + score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0} + limit: int = 0 + batch_size: Union[Sequence[int], int] + + class nlp(BaseModel): + lang: str + vectors: Optional[str] + pipeline: Optional[Dict[str, PipelineComponent]] + + class Config: + extra = "allow" + + +# Of course, these would normally decorate the functions where they're defined. +# But for now... 
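The decorators below use spaCy's `util.registry`, which is backed by the `catalogue` package: functions register under a string name, and `@architectures = "..."` entries in the config resolve back to those functions, with the remaining keys passed as arguments. A toy sketch of the mechanism under that assumption (registry and function names here are made up):

import catalogue

# One registry namespace; spaCy keeps one per kind of registered object.
architectures = catalogue.create("example", "architectures")

@architectures.register("toy_model.v1")
def toy_model(width: int, depth: int = 2):
    return {"width": width, "depth": depth}

# Resolving a config block: pop the "@" key to look up the function, then
# call it with the remaining keys as keyword arguments.
cfg = {"@architectures": "toy_model.v1", "width": 128}
make_model = architectures.get(cfg.pop("@architectures"))
model = make_model(**cfg)  # -> {"width": 128, "depth": 2}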
+@registry.architectures.register("hash_embed_cnn.v1") +def hash_embed_cnn( + pretrained_vectors, width, depth, embed_size, maxout_pieces, window_size +): + return component_models.Tok2Vec( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + conv_depth=depth, + cnn_maxout_pieces=maxout_pieces, + bilstm_depth=0, + window_size=window_size, + ) + + +@registry.architectures.register("hash_embed_bilstm.v1") +def hash_embed_bilstm_v1(pretrained_vectors, width, depth, embed_size): + return component_models.Tok2Vec( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + bilstm_depth=depth, + conv_depth=0, + cnn_maxout_pieces=0, + ) + + +@registry.architectures.register("tagger_model.v1") +def build_tagger_model_v1(tok2vec): + return component_models.build_tagger_model(nr_class=None, tok2vec=tok2vec) + + +@registry.architectures.register("transition_based_parser.v1") +def create_tb_parser_model( + tok2vec: Model, + nr_feature_tokens: StrictInt = 3, + hidden_width: StrictInt = 64, + maxout_pieces: StrictInt = 3, +): + from thinc.api import Linear, chain, list2array, use_ops, zero_init + from spacy.ml._layers import PrecomputableAffine + from spacy.syntax._parser_model import ParserModel + + token_vector_width = tok2vec.get_dim("nO") + tok2vec = chain(tok2vec, list2array()) + tok2vec.set_dim("nO", token_vector_width) + + lower = PrecomputableAffine( + hidden_width, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), nP=maxout_pieces + ) + lower.set_dim("nP", maxout_pieces) + with use_ops("numpy"): + # Initialize weights at zero, as it's a classification layer. + upper = Linear(init_W=zero_init) + return ParserModel(tok2vec, lower, upper) + + +@plac.annotations( + # fmt: off + train_path=("Location of JSON-formatted training data", "positional", None, Path), + dev_path=("Location of JSON-formatted development data", "positional", None, Path), + config_path=("Path to config file", "positional", None, Path), + output_path=("Output directory to store model in", "option", "o", Path), + meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), + raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), + # fmt: on +) +def train_from_config_cli( + train_path, + dev_path, + config_path, + output_path=None, + meta_path=None, + raw_text=None, + debug=False, + verbose=False, +): + """ + Train or update a spaCy model. Requires data to be formatted in spaCy's + JSON format. To convert data from other formats, use the `spacy convert` + command. 
+ """ + if not config_path or not config_path.exists(): + msg.fail("Config file not found", config_path, exits=1) + if not train_path or not train_path.exists(): + msg.fail("Training data not found", train_path, exits=1) + if not dev_path or not dev_path.exists(): + msg.fail("Development data not found", dev_path, exits=1) + if meta_path is not None and not meta_path.exists(): + msg.fail("Can't find model meta.json", meta_path, exits=1) + if output_path is not None and not output_path.exists(): + output_path.mkdir() + + try: + train_from_config( + config_path, + {"train": train_path, "dev": dev_path}, + output_path=output_path, + meta_path=meta_path, + raw_text=raw_text, + ) + except KeyboardInterrupt: + msg.warn("Cancelled.") + + +def train_from_config( + config_path, data_paths, raw_text=None, meta_path=None, output_path=None, +): + msg.info(f"Loading config from: {config_path}") + config = util.load_from_config(config_path, create_objects=True) + use_gpu = config["training"]["use_gpu"] + if use_gpu >= 0: + msg.info("Using GPU") + else: + msg.info("Using CPU") + msg.info("Creating nlp from config") + nlp = create_nlp_from_config(**config["nlp"]) + optimizer = config["optimizer"] + limit = config["training"]["limit"] + msg.info("Loading training corpus") + corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) + msg.info("Initializing the nlp pipeline") + nlp.begin_training(lambda: corpus.train_examples, device=use_gpu) + + train_batches = create_train_batches(nlp, corpus, config["training"]) + evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"]) + + # Create iterator, which yields out info after each optimization step. + msg.info("Start training") + training_step_iterator = train_while_improving( + nlp, + optimizer, + train_batches, + evaluate, + config["training"]["dropout"], + config["training"]["patience"], + config["training"]["eval_frequency"], + ) + + msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") + print_row = setup_printer(config) + + try: + progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False) + for batch, info, is_best_checkpoint in training_step_iterator: + progress.update(1) + if is_best_checkpoint is not None: + progress.close() + print_row(info) + if is_best_checkpoint and output_path is not None: + nlp.to_disk(output_path) + progress = tqdm.tqdm( + total=config["training"]["eval_frequency"], leave=False + ) + finally: + if output_path is not None: + with nlp.use_params(optimizer.averages): + final_model_path = output_path / "model-final" + nlp.to_disk(final_model_path) + msg.good("Saved model to output directory", final_model_path) + # with msg.loading("Creating best model..."): + # best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) + # msg.good("Created best model", best_model_path) + + +def create_nlp_from_config(lang, vectors, pipeline): + lang_class = spacy.util.get_lang_class(lang) + nlp = lang_class() + if vectors is not None: + spacy.cli.train._load_vectors(nlp, vectors) + for name, component_cfg in pipeline.items(): + factory = component_cfg.pop("factory") + component = nlp.create_pipe(factory, config=component_cfg) + nlp.add_pipe(component, name=name) + return nlp + + +def create_train_batches(nlp, corpus, cfg): + while True: + train_examples = corpus.train_dataset( + nlp, + noise_level=0.0, + orth_variant_level=cfg["orth_variant_level"], + gold_preproc=cfg["gold_preproc"], + max_length=cfg["max_length"], + ignore_misaligned=True, + ) + for batch in util.minibatch_by_words(train_examples, size=cfg["batch_size"]): + yield batch + + +def create_evaluation_callback(nlp, optimizer, corpus, cfg): + def evaluate(): + with nlp.use_params(optimizer.averages): + dev_examples = list( + corpus.dev_dataset( + nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True + ) + ) + scorer = nlp.evaluate(dev_examples) + scores = scorer.scores + # Calculate a weighted sum based on score_weights for the main score + weights = cfg["score_weights"] + weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) + return weighted_score, scorer.scores + + return evaluate + + +def train_while_improving( + nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency +): + """Train until an evaluation stops improving. Works as a generator, + with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, + where info is a dict, and is_best_checkpoint is in [True, False, None] -- + None indicating that the iteration was not evaluated as a checkpoint. + The evaluation is conducted by calling the evaluate callback, which should + take no arguments and return a tuple `(main_score, other_scores)`, as + described below. + + Positional arguments: + nlp: The spaCy pipeline to evaluate. + train_data (Iterable[Batch]): A generator of batches, with the training + data. Each batch should be a Sized[Tuple[Input, Annot]]. The training + data iterable needs to take care of iterating over the epochs and + shuffling. + evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation. + The callback should take no arguments and return a tuple + `(main_score, other_scores)`. The main_score should be a float where + higher is better. other_scores can be any object. + + Every iteration, the function yields out a tuple with: + + * batch: A zipped sequence of Tuple[Doc, GoldParse] pairs. + * info: A dict with various information about the last update (see below). 
+ * is_best_checkpoint: A value in None, False, True, indicating whether this + was the best evaluation so far. You should use this to save the model + checkpoints during training. If None, evaluation was not conducted on + that iteration. False means evaluation was conducted, but a previous + evaluation was better. + + The info dict provides the following information: + + step (int): How many steps have been completed. + score (float): The main score from the last evaluation. + other_scores: The other scores from the last evaluation. + losses: The accumulated losses throughout training. + checkpoints: A list of previous results, where each result is a + (score, step) tuple. + """ + if isinstance(dropout, float): + dropouts = thinc.schedules.constant(dropout) + else: + dropouts = dropout + results = [] + losses = {} + for step, batch in enumerate(train_data): + dropout = next(dropouts) + for subbatch in subdivide_batch(batch): + nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) + for name, proc in nlp.pipeline: + if hasattr(proc, "model"): + proc.model.finish_update(optimizer) + optimizer.step_schedules() + if not (step % eval_frequency): + score, other_scores = evaluate() + results.append((score, step)) + is_best_checkpoint = score == max(results)[0] + else: + score, other_scores = (None, None) + is_best_checkpoint = None + info = { + "step": step, + "score": score, + "other_scores": other_scores, + "losses": losses, + "checkpoints": results, + } + yield batch, info, is_best_checkpoint + if is_best_checkpoint is not None: + losses = {} + # Stop if no improvement in `patience` updates + best_score, best_step = max(results) + if (step - best_step) >= patience: + break + + +def subdivide_batch(batch): + return [batch] + + +def setup_printer(config): + score_cols = config["training"]["scores"] + score_widths = [max(len(col), 6) for col in score_cols] + loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]] + loss_widths = [max(len(col), 8) for col in loss_cols] + table_header = ["#"] + loss_cols + score_cols + ["Score"] + table_header = [col.upper() for col in table_header] + table_widths = [6] + loss_widths + score_widths + [6] + table_aligns = ["r" for _ in table_widths] + + msg.row(table_header, widths=table_widths) + msg.row(["-" * width for width in table_widths]) + + def print_row(info): + losses = [ + "{0:.2f}".format(info["losses"].get(col, 0.0)) + for col in config["nlp"]["pipeline"] + ] + scores = [ + "{0:.2f}".format(info["other_scores"].get(col, 0.0)) + for col in config["training"]["scores"] + ] + data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])] + msg.row(data, widths=table_widths, aligns=table_aligns) + + return print_row + + +@registry.architectures.register("tok2vec_tensors.v1") +def tok2vec_tensors_v1(width): + tok2vec = Tok2VecListener("tok2vec", width=width) + return tok2vec diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 93abad6f6..a23ce3453 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -1,14 +1,8 @@ -# coding: utf8 -from __future__ import unicode_literals, print_function - from pathlib import Path import sys import requests -import srsly from wasabi import msg -from ..compat import path2str -from ..util import get_data_path from .. import about @@ -17,51 +11,30 @@ def validate(): Validate that the currently installed version of spaCy is compatible with the installed models. 
Should be run after `pip install -U spacy`. """ - with msg.loading("Loading compatibility table..."): - r = requests.get(about.__compatibility__) - if r.status_code != 200: - msg.fail( - "Server error ({})".format(r.status_code), - "Couldn't fetch compatibility table.", - exits=1, - ) - msg.good("Loaded compatibility table") - compat = r.json()["spacy"] - version = about.__version__ - version = version.rsplit(".dev", 1)[0] - current_compat = compat.get(version) + model_pkgs, compat = get_model_pkgs() + spacy_version = about.__version__.rsplit(".dev", 1)[0] + current_compat = compat.get(spacy_version, {}) if not current_compat: - msg.fail( - "Can't find spaCy v{} in compatibility table".format(version), - about.__compatibility__, - exits=1, - ) - all_models = set() - for spacy_v, models in dict(compat).items(): - all_models.update(models.keys()) - for model, model_vs in models.items(): - compat[spacy_v][model] = [reformat_version(v) for v in model_vs] - model_links = get_model_links(current_compat) - model_pkgs = get_model_pkgs(current_compat, all_models) - incompat_links = {l for l, d in model_links.items() if not d["compat"]} + msg.warn(f"No compatible models found for v{spacy_version} of spaCy") incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]} - incompat_models.update( - [d["name"] for _, d in model_links.items() if not d["compat"]] - ) na_models = [m for m in incompat_models if m not in current_compat] update_models = [m for m in incompat_models if m in current_compat] spacy_dir = Path(__file__).parent.parent - msg.divider("Installed models (spaCy v{})".format(about.__version__)) - msg.info("spaCy installation: {}".format(path2str(spacy_dir))) + msg.divider(f"Installed models (spaCy v{about.__version__})") + msg.info(f"spaCy installation: {spacy_dir}") - if model_links or model_pkgs: - header = ("TYPE", "NAME", "MODEL", "VERSION", "") + if model_pkgs: + header = ("NAME", "VERSION", "") rows = [] for name, data in model_pkgs.items(): - rows.append(get_model_row(current_compat, name, data, msg)) - for name, data in model_links.items(): - rows.append(get_model_row(current_compat, name, data, msg, "link")) + if data["compat"]: + comp = msg.text("", color="green", icon="good", no_print=True) + version = msg.text(data["version"], color="green", no_print=True) + else: + version = msg.text(data["version"], color="red", no_print=True) + comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" + rows.append((data["name"], version, comp)) msg.table(rows, header=header) else: msg.text("No models found in your current environment.", exits=0) @@ -71,44 +44,32 @@ def validate(): cmd = "python -m spacy download {}" print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") if na_models: - msg.text( - "The following models are not available for spaCy " - "v{}: {}".format(about.__version__, ", ".join(na_models)) + msg.warn( + f"The following models are not available for spaCy v{about.__version__}:", + ", ".join(na_models), ) - if incompat_links: - msg.text( - "You may also want to overwrite the incompatible links using the " - "`python -m spacy link` command with `--force`, or remove them " - "from the data directory. 
" - "Data path: {path}".format(path=path2str(get_data_path())) - ) - if incompat_models or incompat_links: + if incompat_models: sys.exit(1) -def get_model_links(compat): - links = {} - data_path = get_data_path() - if data_path: - models = [p for p in data_path.iterdir() if is_model_path(p)] - for model in models: - meta_path = Path(model) / "meta.json" - if not meta_path.exists(): - continue - meta = srsly.read_json(meta_path) - link = model.parts[-1] - name = meta["lang"] + "_" + meta["name"] - links[link] = { - "name": name, - "version": meta["version"], - "compat": is_compat(compat, name, meta["version"]), - } - return links - - -def get_model_pkgs(compat, all_models): +def get_model_pkgs(): import pkg_resources + with msg.loading("Loading compatibility table..."): + r = requests.get(about.__compatibility__) + if r.status_code != 200: + msg.fail( + f"Server error ({r.status_code})", + "Couldn't fetch compatibility table.", + exits=1, + ) + msg.good("Loaded compatibility table") + compat = r.json()["spacy"] + all_models = set() + for spacy_v, models in dict(compat).items(): + all_models.update(models.keys()) + for model, model_vs in models.items(): + compat[spacy_v][model] = [reformat_version(v) for v in model_vs] pkgs = {} for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): package = pkg_name.replace("-", "_") @@ -117,29 +78,9 @@ def get_model_pkgs(compat, all_models): pkgs[pkg_name] = { "name": package, "version": version, - "compat": is_compat(compat, package, version), + "compat": package in compat and version in compat[package], } - return pkgs - - -def get_model_row(compat, name, data, msg, model_type="package"): - if data["compat"]: - comp = msg.text("", color="green", icon="good", no_print=True) - version = msg.text(data["version"], color="green", no_print=True) - else: - version = msg.text(data["version"], color="red", no_print=True) - comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0]) - return (model_type, name, data["name"], version, comp) - - -def is_model_path(model_path): - exclude = ["cache", "pycache", "__pycache__"] - name = model_path.parts[-1] - return model_path.is_dir() and name not in exclude and not name.startswith(".") - - -def is_compat(compat, name, version): - return name in compat and version in compat[name] + return pkgs, compat def reformat_version(version): diff --git a/spacy/compat.py b/spacy/compat.py index 0ea31c6b3..d8377633f 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -1,4 +1,3 @@ -# coding: utf8 """ Helpers for Python and platform compatibility. To distinguish them from the builtin functions, replacement functions are suffixed with an underscore, @@ -6,15 +5,9 @@ e.g. `unicode_`. 
DOCS: https://spacy.io/api/top-level#compat """ -from __future__ import unicode_literals - -import os import sys -import itertools -import ast -import types -from thinc.neural.util import copy_array +from thinc.util import copy_array try: import cPickle as pickle @@ -36,91 +29,23 @@ try: except ImportError: cupy = None -try: - from thinc.neural.optimizers import Optimizer # noqa: F401 -except ImportError: - from thinc.neural.optimizers import Adam as Optimizer # noqa: F401 +from thinc.api import Optimizer # noqa: F401 pickle = pickle copy_reg = copy_reg CudaStream = CudaStream cupy = cupy copy_array = copy_array -izip = getattr(itertools, "izip", zip) is_windows = sys.platform.startswith("win") is_linux = sys.platform.startswith("linux") is_osx = sys.platform == "darwin" -# See: https://github.com/benjaminp/six/blob/master/six.py -is_python2 = sys.version_info[0] == 2 -is_python3 = sys.version_info[0] == 3 -is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5) -if is_python2: - bytes_ = str - unicode_ = unicode # noqa: F821 - basestring_ = basestring # noqa: F821 - input_ = raw_input # noqa: F821 - path2str = lambda path: str(path).decode("utf8") - class_types = (type, types.ClassType) - -elif is_python3: - bytes_ = bytes - unicode_ = str - basestring_ = str - input_ = input - path2str = lambda path: str(path) - class_types = (type, types.ClassType) if is_python_pre_3_5 else type - - -def b_to_str(b_str): - """Convert a bytes object to a string. - - b_str (bytes): The object to convert. - RETURNS (unicode): The converted string. - """ - if is_python2: - return b_str - # Important: if no encoding is set, string becomes "b'...'" - return str(b_str, encoding="utf8") - - -def symlink_to(orig, dest): - """Create a symlink. Used for model shortcut links. - - orig (unicode / Path): The origin path. - dest (unicode / Path): The destination path of the symlink. - """ - if is_windows: - import subprocess - - subprocess.check_call( - ["mklink", "/d", path2str(orig), path2str(dest)], shell=True - ) - else: - orig.symlink_to(dest) - - -def symlink_remove(link): - """Remove a symlink. Used for model shortcut links. - - link (unicode / Path): The path to the symlink. - """ - # https://stackoverflow.com/q/26554135/6400719 - if os.path.isdir(path2str(link)) and is_windows: - # this should only be on Py2.7 and windows - os.rmdir(path2str(link)) - else: - os.unlink(path2str(link)) - - -def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): +def is_config(windows=None, linux=None, osx=None, **kwargs): """Check if a specific configuration of Python version and operating system matches the user's setup. Mostly used to display targeted error messages. - python2 (bool): spaCy is executed with Python 2.x. - python3 (bool): spaCy is executed with Python 3.x. windows (bool): spaCy is executed on Windows. linux (bool): spaCy is executed on Linux. osx (bool): spaCy is executed on OS X or macOS. @@ -129,53 +54,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): DOCS: https://spacy.io/api/top-level#compat.is_config """ return ( - python2 in (None, is_python2) - and python3 in (None, is_python3) - and windows in (None, is_windows) + windows in (None, is_windows) and linux in (None, is_linux) and osx in (None, is_osx) ) - - -def import_file(name, loc): - """Import module from a file. Used to load models from a directory. - - name (unicode): Name of module to load. - loc (unicode / Path): Path to the file. - RETURNS: The loaded module. 
- """ - loc = path2str(loc) - if is_python_pre_3_5: - import imp - - return imp.load_source(name, loc) - else: - import importlib.util - - spec = importlib.util.spec_from_file_location(name, str(loc)) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def unescape_unicode(string): - """Python2.7's re module chokes when compiling patterns that have ranges - between escaped unicode codepoints if the two codepoints are unrecognised - in the unicode database. For instance: - - re.compile('[\\uAA77-\\uAA79]').findall("hello") - - Ends up matching every character (on Python 2). This problem doesn't occur - if we're dealing with unicode literals. - """ - if string is None: - return string - # We only want to unescape the unicode, so we first must protect the other - # backslashes. - string = string.replace("\\", "\\\\") - # Now we remove that protection for the unicode. - string = string.replace("\\\\u", "\\u") - string = string.replace("\\\\U", "\\U") - # Now we unescape by evaling the string with the AST. This can't execute - # code -- it only does the representational level. - return ast.literal_eval("u'''" + string + "'''") diff --git a/spacy/data/__init__.py b/spacy/data/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index c17b80aef..e4a8ad666 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -1,15 +1,11 @@ -# coding: utf8 """ spaCy's built in visualization suite for dependencies and named entities. DOCS: https://spacy.io/api/top-level#displacy USAGE: https://spacy.io/usage/visualizers """ -from __future__ import unicode_literals - from .render import DependencyRenderer, EntityRenderer from ..tokens import Doc, Span -from ..compat import b_to_str from ..errors import Errors, Warnings, user_warning from ..util import is_in_jupyter @@ -93,20 +89,20 @@ def serve( render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server(host, port, app) - print("\nUsing the '{}' visualizer".format(style)) - print("Serving on http://{}:{} ...\n".format(host, port)) + print(f"\nUsing the '{style}' visualizer") + print(f"Serving on http://{host}:{port} ...\n") try: httpd.serve_forever() except KeyboardInterrupt: - print("Shutting down server on port {}.".format(port)) + print(f"Shutting down server on port {port}.") finally: httpd.server_close() def app(environ, start_response): # Headers and status need to be bytes in Python 2, see #1227 - headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))] - start_response(b_to_str(b"200 OK"), headers) + headers = [("Content-type", "text/html; charset=utf-8")] + start_response("200 OK", headers) res = _html["parsed"].encode(encoding="utf-8") return [res] diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index d6e33437b..7ca1eebb7 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import uuid from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS @@ -55,7 +52,7 @@ class DependencyRenderer(object): settings = p.get("settings", {}) self.direction = settings.get("direction", DEFAULT_DIR) self.lang = settings.get("lang", DEFAULT_LANG) - render_id = "{}-{}".format(id_prefix, i) + render_id = f"{id_prefix}-{i}" svg = self.render_svg(render_id, p["words"], p["arcs"]) rendered.append(svg) if page: diff --git 
a/spacy/displacy/templates.py b/spacy/displacy/templates.py index ade75d1d6..a721ce480 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Setting explicit height and max-width: none on the SVG is required for # Jupyter to render it properly in a cell diff --git a/spacy/errors.py b/spacy/errors.py index 2f0a8a2ad..7a4953cce 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import os import warnings import inspect @@ -12,7 +9,7 @@ def add_codes(err_cls): class ErrorsWithCodes(object): def __getattribute__(self, code): msg = getattr(err_cls, code) - return "[{code}] {msg}".format(code=code, msg=msg) + return f"[{code}] {msg}" return ErrorsWithCodes() @@ -97,8 +94,6 @@ class Warnings(object): "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. " "If this is surprising, make sure you have the spacy-lookups-data " "package installed.") - W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " - "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " @@ -107,7 +102,9 @@ class Warnings(object): W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") - + W028 = ("Skipping unsupported morphological feature(s): {feature}. " + "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " + "string \"Field1=Value1,Value2|Field2=Value3\".") @add_codes @@ -227,13 +224,8 @@ class Errors(object): E047 = ("Can't assign a value to unregistered extension attribute " "'{name}'. Did you forget to call the `set_extension` method?") E048 = ("Can't import language {lang} from spacy.lang: {err}") - E049 = ("Can't find spaCy data directory: '{path}'. Check your " - "installation and permissions, or use spacy.util.set_data_path " - "to customise the location if necessary.") - E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut " - "link, a Python package or a valid path to a data directory.") - E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure " - "it points to a valid package (not just a data directory).") + E050 = ("Can't find model '{name}'. It doesn't seem to be a Python " + "package or a valid path to a data directory.") E052 = ("Can't find model directory: {path}") E053 = ("Could not read meta.json from {path}") E054 = ("No valid '{setting}' setting found in model meta.json.") @@ -424,8 +416,6 @@ class Errors(object): E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") E135 = ("If you meant to replace a built-in component, use `create_pipe`: " "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`") - E136 = ("This additional feature requires the jsonschema library to be " - "installed:\npip install jsonschema") E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure " "to provide a valid JSON object as input with either the `text` " "or `tokens` key. 
For more info, see the docs:\n" @@ -541,6 +531,15 @@ class Errors(object): E188 = ("Could not match the gold entity links to entities in the doc - " "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") + # TODO: fix numbering after merging develop into master + E996 = ("Could not parse {file}: {msg}") + E997 = ("Tokenizer special cases are not allowed to modify the text. " + "This would map '{chunk}' to '{orth}' given token attributes " + "'{token_attrs}'.") + E998 = ("Can only create GoldParse objects from Example objects without a " + "Doc if get_gold_parses() is called with a Vocab object.") + E999 = ("Encountered an unexpected format for the dictionary holding " + "gold annotations: {gold_dict}") @add_codes @@ -566,10 +565,10 @@ class MatchPatternError(ValueError): errors (dict): Validation errors (sequence of strings) mapped to pattern ID, i.e. the index of the added pattern. """ - msg = "Invalid token patterns for matcher rule '{}'\n".format(key) + msg = f"Invalid token patterns for matcher rule '{key}'\n" for pattern_idx, error_msgs in errors.items(): - pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs]) - msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors) + pattern_errors = "\n".join([f"- {e}" for e in error_msgs]) + msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n" ValueError.__init__(self, msg) diff --git a/spacy/glossary.py b/spacy/glossary.py index 44a8277da..938a575cd 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - def explain(term): """Get a description for a given POS tag, dependency label or entity type. diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 20a25a939..aea691130 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -1,6 +1,6 @@ from cymem.cymem cimport Pool -from .structs cimport TokenC +from .tokens import Doc from .typedefs cimport attr_t from .syntax.transition_system cimport Transition @@ -19,23 +19,49 @@ cdef class GoldParse: cdef Pool mem cdef GoldParseC c + cdef readonly TokenAnnotation orig cdef int length cdef public int loss cdef public list words cdef public list tags - cdef public list morphology + cdef public list pos + cdef public list morphs + cdef public list lemmas + cdef public list sent_starts cdef public list heads cdef public list labels cdef public dict orths cdef public list ner - cdef public list ents cdef public dict brackets - cdef public object cats + cdef public dict cats cdef public dict links cdef readonly list cand_to_gold cdef readonly list gold_to_cand - cdef readonly list orig_annot +cdef class TokenAnnotation: + cdef public list ids + cdef public list words + cdef public list tags + cdef public list pos + cdef public list morphs + cdef public list lemmas + cdef public list heads + cdef public list deps + cdef public list entities + cdef public list sent_starts + cdef public list brackets + + +cdef class DocAnnotation: + cdef public object cats + cdef public object links + + +cdef class Example: + cdef public object doc + cdef public TokenAnnotation token_annotation + cdef public DocAnnotation doc_annotation + cdef public object goldparse diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 3884e1cba..eca801176 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,7 +1,4 @@ # cython: profile=True -# coding: utf8 -from __future__ import unicode_literals, print_function - import re import random import numpy @@ -14,14 +11,9 @@ import srsly from .syntax import 
nonproj from .tokens import Doc, Span from .errors import Errors, AlignmentError, user_warning, Warnings -from .compat import path2str from . import util -from .util import minibatch, itershuffle - -from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek -USE_NEW_ALIGN = False punct_re = re.compile(r"\W") @@ -54,78 +46,15 @@ def tags_to_entities(tags): return entities -def merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_cats = {} - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) - for b in brackets) - m_cats.update(cats) - i += len(ids) - return [(m_deps, (m_cats, m_brackets))] - - -_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")] - - def _normalize_for_alignment(tokens): tokens = [w.replace(" ", "").lower() for w in tokens] output = [] for token in tokens: token = token.replace(" ", "").lower() - for before, after in _ALIGNMENT_NORM_MAP: - token = token.replace(before, after) output.append(token) return output -def _align_before_v2_2_2(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations, using the Levenshtein - algorithm. The alignment is case-insensitive. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. - * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - from . import _align - if tokens_a == tokens_b: - alignment = numpy.arange(len(tokens_a)) - return 0, alignment, alignment, {}, {} - tokens_a = [w.replace(" ", "").lower() for w in tokens_a] - tokens_b = [w.replace(" ", "").lower() for w in tokens_b] - cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b) - i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a], - [len(w) for w in tokens_b]) - for i, j in list(i2j_multi.items()): - if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j: - i2j[i] = j - i2j_multi.pop(i) - for j, i in list(j2i_multi.items()): - if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i: - j2i[j] = i - j2i_multi.pop(j) - return cost, i2j, j2i, i2j_multi, j2i_multi - - def align(tokens_a, tokens_b): """Calculate alignment tables between two tokenizations. @@ -144,13 +73,13 @@ def align(tokens_a, tokens_b): * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other direction. 
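To make the return contract above concrete, here is a small usage sketch mirroring the tokenization-alignment example from the spaCy docs; the expected values follow from the algorithm below (`'` and `s` have no one-to-one counterpart, so they land in `a2b_multi`):

```python
from spacy.gold import align

other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]

cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
print(cost)       # 3: number of misaligned tokens
print(list(a2b))  # [0, 1, 2, 3, -1, -1, 5, 6]
print(a2b_multi)  # {4: 4, 5: 4}: "'" and "s" both align to "'s"
```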
""" - if not USE_NEW_ALIGN: - return _align_before_v2_2_2(tokens_a, tokens_b) tokens_a = _normalize_for_alignment(tokens_a) tokens_b = _normalize_for_alignment(tokens_b) cost = 0 a2b = numpy.empty(len(tokens_a), dtype="i") b2a = numpy.empty(len(tokens_b), dtype="i") + a2b.fill(-1) + b2a.fill(-1) a2b_multi = {} b2a_multi = {} i = 0 @@ -160,7 +89,6 @@ def align(tokens_a, tokens_b): while i < len(tokens_a) and j < len(tokens_b): a = tokens_a[i][offset_a:] b = tokens_b[j][offset_b:] - a2b[i] = b2a[j] = -1 if a == b: if offset_a == offset_b == 0: a2b[i] = j @@ -211,30 +139,32 @@ class GoldCorpus(object): def __init__(self, train, dev, gold_preproc=False, limit=None): """Create a GoldCorpus. - train_path (unicode or Path): File or directory of training data. - dev_path (unicode or Path): File or directory of development data. + train (unicode or Path): File or directory of training data. + dev (unicode or Path): File or directory of development data. RETURNS (GoldCorpus): The newly created object. """ self.limit = limit if isinstance(train, str) or isinstance(train, Path): - train = self.read_tuples(self.walk_corpus(train)) - dev = self.read_tuples(self.walk_corpus(dev)) + train = self.read_examples(self.walk_corpus(train)) + dev = self.read_examples(self.walk_corpus(dev)) # Write temp directory with one doc per file, so we can shuffle and stream self.tmp_dir = Path(tempfile.mkdtemp()) self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) def __del__(self): - shutil.rmtree(path2str(self.tmp_dir)) + shutil.rmtree(self.tmp_dir) @staticmethod - def write_msgpack(directory, doc_tuples, limit=0): + def write_msgpack(directory, examples, limit=0): if not directory.exists(): directory.mkdir() n = 0 - for i, doc_tuple in enumerate(doc_tuples): - srsly.write_msgpack(directory / "{}.msg".format(i), [doc_tuple]) - n += len(doc_tuple[1]) + for i, example in enumerate(examples): + ex_dict = example.to_dict() + text = example.text + srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) + n += 1 if limit and n >= limit: break @@ -259,128 +189,163 @@ class GoldCorpus(object): return locs @staticmethod - def read_tuples(locs, limit=0): + def read_examples(locs, limit=0): + """ Yield training examples """ i = 0 for loc in locs: loc = util.ensure_path(loc) - if loc.parts[-1].endswith("json"): - gold_tuples = read_json_file(loc) - elif loc.parts[-1].endswith("jsonl"): + file_name = loc.parts[-1] + if file_name.endswith("json"): + examples = read_json_file(loc) + elif file_name.endswith("jsonl"): gold_tuples = srsly.read_jsonl(loc) first_gold_tuple = next(gold_tuples) gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) # TODO: proper format checks with schemas if isinstance(first_gold_tuple, dict): - gold_tuples = read_json_object(gold_tuples) - elif loc.parts[-1].endswith("msg"): - gold_tuples = srsly.read_msgpack(loc) + if first_gold_tuple.get("paragraphs", None): + examples = read_json_object(gold_tuples) + elif first_gold_tuple.get("doc_annotation", None): + examples = [] + for ex_dict in gold_tuples: + doc = ex_dict.get("doc", None) + if doc is None: + doc = ex_dict.get("text", None) + examples.append(Example.from_dict(ex_dict, doc=doc)) + + elif file_name.endswith("msg"): + text, ex_dict = srsly.read_msgpack(loc) + examples = [Example.from_dict(ex_dict, doc=text)] else: supported = ("json", "jsonl", "msg") - raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported)) - for item in gold_tuples: - 
yield item - i += len(item[1]) - if limit and i >= limit: - return + raise ValueError(Errors.E124.format(path=loc, formats=supported)) + try: + for example in examples: + yield example + i += 1 + if limit and i >= limit: + return + except KeyError as e: + msg = "Missing key {}".format(e) + raise KeyError(Errors.E996.format(file=file_name, msg=msg)) + except UnboundLocalError as e: + msg = "Unexpected document structure" + raise ValueError(Errors.E996.format(file=file_name, msg=msg)) @property - def dev_tuples(self): + def dev_examples(self): locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_tuples(locs, limit=self.limit) + yield from self.read_examples(locs, limit=self.limit) @property - def train_tuples(self): + def train_examples(self): locs = (self.tmp_dir / "train").iterdir() - yield from self.read_tuples(locs, limit=self.limit) + yield from self.read_examples(locs, limit=self.limit) def count_train(self): + """Returns count of words in train examples""" n = 0 i = 0 - for raw_text, paragraph_tuples in self.train_tuples: - for sent_tuples, brackets in paragraph_tuples: - n += len(sent_tuples[1]) - if self.limit and i >= self.limit: - break - i += 1 + for example in self.train_examples: + n += len(example.token_annotation.words) + if self.limit and i >= self.limit: + break + i += 1 return n - def train_docs(self, nlp, gold_preproc=False, max_length=None, + def train_dataset(self, nlp, gold_preproc=False, max_length=None, noise_level=0.0, orth_variant_level=0.0, ignore_misaligned=False): locs = list((self.tmp_dir / 'train').iterdir()) random.shuffle(locs) - train_tuples = self.read_tuples(locs, limit=self.limit) - gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, + train_examples = self.read_examples(locs, limit=self.limit) + gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc, max_length=max_length, noise_level=noise_level, orth_variant_level=orth_variant_level, make_projective=True, ignore_misaligned=ignore_misaligned) - yield from gold_docs + yield from gold_examples - def train_docs_without_preprocessing(self, nlp, gold_preproc=False): - gold_docs = self.iter_gold_docs(nlp, self.train_tuples, gold_preproc=gold_preproc) - yield from gold_docs + def train_dataset_without_preprocessing(self, nlp, gold_preproc=False, + ignore_misaligned=False): + examples = self.iter_gold_docs(nlp, self.train_examples, + gold_preproc=gold_preproc, + ignore_misaligned=ignore_misaligned) + yield from examples - def dev_docs(self, nlp, gold_preproc=False, ignore_misaligned=False): - gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned) - yield from gold_docs + def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): + examples = self.iter_gold_docs(nlp, self.dev_examples, + gold_preproc=gold_preproc, + ignore_misaligned=ignore_misaligned) + yield from examples @classmethod - def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None, - noise_level=0.0, orth_variant_level=0.0, make_projective=False, - ignore_misaligned=False): - for raw_text, paragraph_tuples in tuples: + def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None, + noise_level=0.0, orth_variant_level=0.0, + make_projective=False, ignore_misaligned=False): + """ Setting gold_preproc will result in creating a doc per sentence """ + for example in examples: if gold_preproc: - raw_text = None + example.doc = None + split_examples = example.split_sents() + example_golds = [] + for split_example in 
split_examples: + split_example_docs = cls._make_docs(nlp, split_example, + gold_preproc, noise_level=noise_level, + orth_variant_level=orth_variant_level) + split_example_golds = cls._make_golds(split_example_docs, + vocab=nlp.vocab, make_projective=make_projective, + ignore_misaligned=ignore_misaligned) + example_golds.extend(split_example_golds) else: - paragraph_tuples = merge_sents(paragraph_tuples) - docs, paragraph_tuples = cls._make_docs(nlp, raw_text, - paragraph_tuples, gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - golds = cls._make_golds(docs, paragraph_tuples, make_projective, - ignore_misaligned=ignore_misaligned) - for doc, gold in zip(docs, golds): - if gold is not None: - if (not max_length) or len(doc) < max_length: - yield doc, gold + example_docs = cls._make_docs(nlp, example, + gold_preproc, noise_level=noise_level, + orth_variant_level=orth_variant_level) + example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned) + for ex in example_golds: + if ex.goldparse is not None: + if (not max_length) or len(ex.doc) < max_length: + yield ex @classmethod - def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0): - if raw_text is not None: - raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level) - raw_text = add_noise(raw_text, noise_level) - return [nlp.make_doc(raw_text)], paragraph_tuples + def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0): + var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level) + # gold_preproc is not used ?! + if example.text is not None: + var_text = add_noise(var_example.text, noise_level) + var_doc = nlp.make_doc(var_text) + var_example.doc = var_doc else: - docs = [] - raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level) - return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) - for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples - + var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level)) + var_example.doc = var_doc + return [var_example] @classmethod - def _make_golds(cls, docs, paragraph_tuples, make_projective, ignore_misaligned=False): - if len(docs) != len(paragraph_tuples): - n_annots = len(paragraph_tuples) - raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots)) - golds = [] - for doc, (sent_tuples, (cats, brackets)) in zip(docs, paragraph_tuples): - try: - gold = GoldParse.from_annot_tuples(doc, sent_tuples, cats=cats, - make_projective=make_projective) - except AlignmentError: - if ignore_misaligned: - gold = None - else: - raise - golds.append(gold) - return golds + def _make_golds(cls, examples, vocab=None, make_projective=False, + ignore_misaligned=False): + filtered_examples = [] + for example in examples: + gold_parses = example.get_gold_parses(vocab=vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned) + assert len(gold_parses) == 1 + doc, gold = gold_parses[0] + if doc: + assert doc == example.doc + example.goldparse = gold + filtered_examples.append(example) + return filtered_examples -def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): +def make_orth_variants(nlp, example, orth_variant_level=0.0): if random.random() >= orth_variant_level: - 
return raw, paragraph_tuples + return example + if not example.token_annotation: + return example + raw = example.text if random.random() >= 0.5: lower = True if raw is not None: @@ -388,9 +353,15 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): ndsv = nlp.Defaults.single_orth_variants ndpv = nlp.Defaults.paired_orth_variants # modify words in paragraph_tuples - variant_paragraph_tuples = [] - for sent_tuples, brackets in paragraph_tuples: - ids, words, tags, heads, labels, ner = sent_tuples + variant_example = Example(doc=raw) + token_annotation = example.token_annotation + words = token_annotation.words + tags = token_annotation.tags + if not words or not tags: + # add the unmodified annotation + token_dict = token_annotation.to_dict() + variant_example.set_token_annotation(**token_dict) + else: if lower: words = [w.lower() for w in words] # single variants @@ -419,7 +390,10 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): pair_idx = pair.index(words[word_idx]) words[word_idx] = punct_choices[punct_idx][pair_idx] - variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets)) + token_dict = token_annotation.to_dict() + token_dict["words"] = words + token_dict["tags"] = tags + variant_example.set_token_annotation(**token_dict) # modify raw to match variant_paragraph_tuples if raw is not None: variants = [] @@ -437,33 +411,32 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): variant_raw += raw[raw_idx] raw_idx += 1 - for sent_tuples, brackets in variant_paragraph_tuples: - ids, words, tags, heads, labels, ner = sent_tuples - for word in words: - match_found = False - # add identical word - if word not in variants and raw[raw_idx:].startswith(word): - variant_raw += word - raw_idx += len(word) - match_found = True - # add variant word - else: - for variant in variants: - if not match_found and \ - raw[raw_idx:].startswith(variant): - raw_idx += len(variant) - variant_raw += word - match_found = True - # something went wrong, abort - # (add a warning message?) - if not match_found: - return raw, paragraph_tuples - # add following whitespace - while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): - variant_raw += raw[raw_idx] - raw_idx += 1 - return variant_raw, variant_paragraph_tuples - return raw, variant_paragraph_tuples + for word in variant_example.token_annotation.words: + match_found = False + # add identical word + if word not in variants and raw[raw_idx:].startswith(word): + variant_raw += word + raw_idx += len(word) + match_found = True + # add variant word + else: + for variant in variants: + if not match_found and \ + raw[raw_idx:].startswith(variant): + raw_idx += len(variant) + variant_raw += word + match_found = True + # something went wrong, abort + # (add a warning message?) + if not match_found: + return example + # add following whitespace + while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): + variant_raw += raw[raw_idx] + raw_idx += 1 + variant_example.doc = variant_raw + return variant_example + return variant_example def add_noise(orig, noise_level): @@ -488,52 +461,70 @@ def _corrupt(c, noise_level): def read_json_object(json_corpus_section): """Take a list of JSON-formatted documents (e.g. from an already loaded - training data file) and yield tuples in the GoldParse format. + training data file) and yield annotations in the GoldParse format. json_corpus_section (list): The data. 
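For orientation, a minimal document in spaCy's JSON training format, which `read_json_file` and `json_to_examples` below consume, might look like this (annotation values invented; `head` is a relative offset, matching what `docs_to_json` emits):

```python
json_doc = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "I like London.",
            "cats": [{"label": "TRAVEL", "value": 1.0}],
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                        {"id": 1, "orth": "like", "tag": "VBP", "head": 0, "dep": "ROOT", "ner": "O"},
                        {"id": 2, "orth": "London", "tag": "NNP", "head": -1, "dep": "dobj", "ner": "U-GPE"},
                        {"id": 3, "orth": ".", "tag": ".", "head": -2, "dep": "punct", "ner": "O"},
                    ],
                    "brackets": [],
                }
            ],
        }
    ],
}
```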
- YIELDS (tuple): The reformatted data. + YIELDS (Example): The reformatted data - one training example per paragraph """ for json_doc in json_corpus_section: - tuple_doc = json_to_tuple(json_doc) - for tuple_paragraph in tuple_doc: - yield tuple_paragraph + examples = json_to_examples(json_doc) + for ex in examples: + yield ex -def json_to_tuple(doc): - """Convert an item in the JSON-formatted training data to the tuple format +def json_to_examples(doc): + """Convert an item in the JSON-formatted training data to the format used by GoldParse. doc (dict): One entry in the training data. - YIELDS (tuple): The reformatted data. + YIELDS (Example): The reformatted data - one training example per paragraph """ paragraphs = [] for paragraph in doc["paragraphs"]: - sents = [] - cats = {} - for cat in paragraph.get("cats", {}): - cats[cat["label"]] = cat["value"] + example = Example(doc=paragraph.get("raw", None)) + words = [] + ids = [] + tags = [] + pos = [] + morphs = [] + lemmas = [] + heads = [] + labels = [] + ner = [] + sent_starts = [] + brackets = [] for sent in paragraph["sentences"]: - words = [] - ids = [] - tags = [] - heads = [] - labels = [] - ner = [] + sent_start_i = len(words) for i, token in enumerate(sent["tokens"]): words.append(token["orth"]) - ids.append(i) + ids.append(token.get('id', sent_start_i + i)) tags.append(token.get('tag', "-")) - heads.append(token.get("head", 0) + i) + pos.append(token.get("pos", "")) + morphs.append(token.get("morph", "")) + lemmas.append(token.get("lemma", "")) + heads.append(token.get("head", 0) + sent_start_i + i) labels.append(token.get("dep", "")) # Ensure ROOT label is case-insensitive if labels[-1].lower() == "root": labels[-1] = "ROOT" ner.append(token.get("ner", "-")) - sents.append([ - [ids, words, tags, heads, labels, ner], - [cats, sent.get("brackets", [])]]) - if sents: - yield [paragraph.get("raw", None), sents] + if i == 0: + sent_starts.append(1) + else: + sent_starts.append(0) + if "brackets" in sent: + brackets.extend((b["first"] + sent_start_i, + b["last"] + sent_start_i, b["label"]) + for b in sent["brackets"]) + cats = {} + for cat in paragraph.get("cats", {}): + cats[cat["label"]] = cat["value"] + example.set_token_annotation(ids=ids, words=words, tags=tags, + pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, + deps=labels, entities=ner, sent_starts=sent_starts, + brackets=brackets) + example.set_doc_annotation(cats=cats) + yield example def read_json_file(loc, docs_filter=None, limit=None): @@ -545,8 +536,8 @@ def read_json_file(loc, docs_filter=None, limit=None): for doc in _json_iterate(loc): if docs_filter is not None and not docs_filter(doc): continue - for json_tuple in json_to_tuple(doc): - yield json_tuple + for json_data in json_to_examples(doc): + yield json_data def _json_iterate(loc): @@ -639,30 +630,334 @@ def _consume_ent(tags): else: start = "B-" + label end = "L-" + label - middle = ["I-%s" % label for _ in range(1, length - 1)] + middle = [f"I-{label}" for _ in range(1, length - 1)] return [start] + middle + [end] +cdef class TokenAnnotation: + def __init__(self, ids=None, words=None, tags=None, pos=None, morphs=None, + lemmas=None, heads=None, deps=None, entities=None, sent_starts=None, + brackets=None): + self.ids = ids if ids else [] + self.words = words if words else [] + self.tags = tags if tags else [] + self.pos = pos if pos else [] + self.morphs = morphs if morphs else [] + self.lemmas = lemmas if lemmas else [] + self.heads = heads if heads else [] + self.deps = deps if deps else [] + 
self.entities = entities if entities else [] + self.sent_starts = sent_starts if sent_starts else [] + self.brackets = brackets if brackets else [] + + @classmethod + def from_dict(cls, token_dict): + return cls(ids=token_dict.get("ids", None), + words=token_dict.get("words", None), + tags=token_dict.get("tags", None), + pos=token_dict.get("pos", None), + morphs=token_dict.get("morphs", None), + lemmas=token_dict.get("lemmas", None), + heads=token_dict.get("heads", None), + deps=token_dict.get("deps", None), + entities=token_dict.get("entities", None), + sent_starts=token_dict.get("sent_starts", None), + brackets=token_dict.get("brackets", None)) + + def to_dict(self): + return {"ids": self.ids, + "words": self.words, + "tags": self.tags, + "pos": self.pos, + "morphs": self.morphs, + "lemmas": self.lemmas, + "heads": self.heads, + "deps": self.deps, + "entities": self.entities, + "sent_starts": self.sent_starts, + "brackets": self.brackets} + + def get_id(self, i): + return self.ids[i] if i < len(self.ids) else i + + def get_word(self, i): + return self.words[i] if i < len(self.words) else "" + + def get_tag(self, i): + return self.tags[i] if i < len(self.tags) else "-" + + def get_pos(self, i): + return self.pos[i] if i < len(self.pos) else "" + + def get_morph(self, i): + return self.morphs[i] if i < len(self.morphs) else "" + + def get_lemma(self, i): + return self.lemmas[i] if i < len(self.lemmas) else "" + + def get_head(self, i): + return self.heads[i] if i < len(self.heads) else i + + def get_dep(self, i): + return self.deps[i] if i < len(self.deps) else "" + + def get_entity(self, i): + return self.entities[i] if i < len(self.entities) else "-" + + def get_sent_start(self, i): + return self.sent_starts[i] if i < len(self.sent_starts) else None + + +cdef class DocAnnotation: + def __init__(self, cats=None, links=None): + self.cats = cats if cats else {} + self.links = links if links else {} + + @classmethod + def from_dict(cls, doc_dict): + return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None)) + + def to_dict(self): + return {"cats": self.cats, "links": self.links} + + +cdef class Example: + def __init__(self, doc_annotation=None, token_annotation=None, doc=None, + goldparse=None): + """ Doc can either be text, or an actual Doc """ + self.doc = doc + self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() + self.token_annotation = token_annotation if token_annotation else TokenAnnotation() + self.goldparse = goldparse + + @classmethod + def from_gold(cls, goldparse, doc=None): + doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links) + token_annotation = goldparse.get_token_annotation() + return cls(doc_annotation, token_annotation, doc) + + @classmethod + def from_dict(cls, example_dict, doc=None): + token_dict = example_dict["token_annotation"] + token_annotation = TokenAnnotation.from_dict(token_dict) + doc_dict = example_dict["doc_annotation"] + doc_annotation = DocAnnotation.from_dict(doc_dict) + return cls(doc_annotation, token_annotation, doc) + + def to_dict(self): + """ Note that this method does NOT export the doc, only the annotations ! 
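Because `to_dict()` drops the doc, `GoldCorpus.write_msgpack` stores the text alongside the annotation dict and `read_examples` reattaches it via `Example.from_dict(ex_dict, doc=text)`. A round-trip sketch under that convention (sample text and tags invented):

```python
from spacy.gold import Example

example = Example(doc="I like London.")
example.set_token_annotation(words=["I", "like", "London", "."],
                             tags=["PRP", "VBP", "NNP", "."])
ex_dict = example.to_dict()                      # annotations only, no doc
restored = Example.from_dict(ex_dict, doc=example.text)
assert restored.token_annotation.words == ["I", "like", "London", "."]
```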
""" + token_dict = self.token_annotation.to_dict() + doc_dict = self.doc_annotation.to_dict() + return {"token_annotation": token_dict, "doc_annotation": doc_dict} + + @property + def text(self): + if self.doc is None: + return None + if isinstance(self.doc, Doc): + return self.doc.text + return self.doc + + @property + def gold(self): + if self.goldparse is None: + doc, gold = self.get_gold_parses()[0] + self.goldparse = gold + return self.goldparse + + def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, + morphs=None, lemmas=None, heads=None, deps=None, + entities=None, sent_starts=None, brackets=None): + self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, + pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, + deps=deps, entities=entities, + sent_starts=sent_starts, brackets=brackets) + + def set_doc_annotation(self, cats=None, links=None): + if cats: + self.doc_annotation.cats = cats + if links: + self.doc_annotation.links = links + + def split_sents(self): + """ Split the token annotations into multiple Examples based on + sent_starts and return a list of the new Examples""" + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] + s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] + s_brackets = [] + sent_start_i = 0 + t = self.token_annotation + split_examples = [] + for i in range(len(t.words)): + if i > 0 and t.sent_starts[i] == 1: + s_example.set_token_annotation(ids=s_ids, + words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, + lemmas=s_lemmas, heads=s_heads, deps=s_deps, + entities=s_ents, sent_starts=s_sent_starts, + brackets=s_brackets) + split_examples.append(s_example) + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] + s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] + s_sent_starts, s_brackets = [], [] + sent_start_i = i + s_ids.append(t.get_id(i)) + s_words.append(t.get_word(i)) + s_tags.append(t.get_tag(i)) + s_pos.append(t.get_pos(i)) + s_morphs.append(t.get_morph(i)) + s_lemmas.append(t.get_lemma(i)) + s_heads.append(t.get_head(i) - sent_start_i) + s_deps.append(t.get_dep(i)) + s_ents.append(t.get_entity(i)) + s_sent_starts.append(t.get_sent_start(i)) + s_brackets.extend((b[0] - sent_start_i, + b[1] - sent_start_i, b[2]) + for b in t.brackets if b[0] == i) + i += 1 + s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, + pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, + deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, + brackets=s_brackets) + split_examples.append(s_example) + return split_examples + + + def get_gold_parses(self, merge=True, vocab=None, make_projective=False, + ignore_misaligned=False): + """Return a list of (doc, GoldParse) objects. 
+ If merge is set to True, keep all Token annotations as one big list.""" + d = self.doc_annotation + # merge == do not modify Example + if merge: + t = self.token_annotation + doc = self.doc + if not self.doc: + if not vocab: + raise ValueError(Errors.E998) + doc = Doc(vocab, words=t.words) + try: + gp = GoldParse.from_annotation(doc, d, t, + make_projective=make_projective) + except AlignmentError: + if ignore_misaligned: + gp = None + else: + raise + return [(doc, gp)] + # not merging: one GoldParse per sentence, defining docs with the words + # from each sentence + else: + parses = [] + split_examples = self.split_sents() + for split_example in split_examples: + if not vocab: + raise ValueError(Errors.E998) + split_doc = Doc(vocab, words=split_example.token_annotation.words) + try: + gp = GoldParse.from_annotation(split_doc, d, + split_example.token_annotation, + make_projective=make_projective) + except AlignmentError: + if ignore_misaligned: + gp = None + else: + raise + if gp is not None: + parses.append((split_doc, gp)) + return parses + + @classmethod + def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False): + """ + Return a list of Example objects, from a variety of input formats. + make_doc needs to be provided when the examples contain text strings and keep_raw_text=False + """ + if isinstance(examples, Example): + return [examples] + if isinstance(examples, tuple): + examples = [examples] + converted_examples = [] + for ex in examples: + # convert string to Doc to Example + if isinstance(ex, str): + if keep_raw_text: + converted_examples.append(Example(doc=ex)) + else: + doc = make_doc(ex) + converted_examples.append(Example(doc=doc)) + # convert Doc to Example + elif isinstance(ex, Doc): + converted_examples.append(Example(doc=ex)) + # convert tuples to Example + elif isinstance(ex, tuple) and len(ex) == 2: + doc, gold = ex + gold_dict = {} + # convert string to Doc + if isinstance(doc, str) and not keep_raw_text: + doc = make_doc(doc) + # convert dict to GoldParse + if isinstance(gold, dict): + gold_dict = gold + if doc is not None or gold.get("words", None) is not None: + gold = GoldParse(doc, **gold) + else: + gold = None + if gold is not None: + converted_examples.append(Example.from_gold(goldparse=gold, doc=doc)) + else: + raise ValueError(Errors.E999.format(gold_dict=gold_dict)) + else: + converted_examples.append(ex) + return converted_examples + + cdef class GoldParse: """Collection for training annotations. 
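The `to_example_objects` helper above normalizes the heterogeneous inputs that training code has historically accepted. A usage sketch, assuming a blank English pipeline (texts and entity offsets invented):

```python
import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("She lives in Berlin.")

examples = Example.to_example_objects(
    [
        "A raw text string.",                     # str -> Doc -> Example
        doc,                                      # Doc -> Example(doc=doc)
        (doc, {"entities": [(13, 19, "GPE")]}),   # (Doc, dict) -> GoldParse
    ],
    make_doc=nlp.make_doc,
)
assert all(isinstance(ex, Example) for ex in examples)
```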
DOCS: https://spacy.io/api/goldparse """ @classmethod - def from_annot_tuples(cls, doc, annot_tuples, cats=None, make_projective=False): - _, words, tags, heads, deps, entities = annot_tuples - return cls(doc, words=words, tags=tags, heads=heads, deps=deps, - entities=entities, cats=cats, + def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): + return cls(doc, words=token_annotation.words, + tags=token_annotation.tags, + pos=token_annotation.pos, + morphs=token_annotation.morphs, + lemmas=token_annotation.lemmas, + heads=token_annotation.heads, + deps=token_annotation.deps, + entities=token_annotation.entities, + sent_starts=token_annotation.sent_starts, + cats=doc_annotation.cats, + links=doc_annotation.links, make_projective=make_projective) - def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None, - heads=None, deps=None, entities=None, make_projective=False, - cats=None, links=None, **_): + def get_token_annotation(self): + ids = None + if self.words: + ids = list(range(len(self.words))) + + return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, + pos=self.pos, morphs=self.morphs, + lemmas=self.lemmas, heads=self.heads, + deps=self.labels, entities=self.ner, + sent_starts=self.sent_starts) + + def __init__(self, doc, words=None, tags=None, pos=None, morphs=None, + lemmas=None, heads=None, deps=None, entities=None, + sent_starts=None, make_projective=False, cats=None, + links=None): """Create a GoldParse. The fields will not be initialized if len(doc) is zero. doc (Doc): The document the annotations refer to. words (iterable): A sequence of unicode word strings. tags (iterable): A sequence of strings, representing tag annotations. + pos (iterable): A sequence of strings, representing UPOS annotations. + morphs (iterable): A sequence of strings, representing morph + annotations. + lemmas (iterable): A sequence of strings, representing lemma + annotations. heads (iterable): A sequence of integers, representing syntactic head offsets. deps (iterable): A sequence of strings, representing the syntactic @@ -670,6 +965,8 @@ cdef class GoldParse: entities (iterable): A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. + sent_starts (iterable): A sequence of sentence position tags, 1 for + the first word in a sentence, 0 for all others. cats (dict): Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the @@ -692,25 +989,26 @@ cdef class GoldParse: self.length = len(doc) self.cats = {} if cats is None else dict(cats) - self.links = links - - # orig_annot is used as an iterator in `nlp.evalate` even if self.length == 0, - # so set a empty list to avoid error. - # if self.lenght > 0, this is modified latter. 
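The extended docstring above reflects the new per-token fields (`pos`, `morphs`, `lemmas`, `sent_starts`). A construction sketch with invented annotations; the word list must line up with the doc's own tokenization:

```python
import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
doc = nlp.make_doc("She lives in Berlin.")
gold = GoldParse(
    doc,
    words=["She", "lives", "in", "Berlin", "."],
    tags=["PRP", "VBZ", "IN", "NNP", "."],
    pos=["PRON", "VERB", "ADP", "PROPN", "PUNCT"],
    heads=[1, 1, 1, 2, 1],                 # absolute token indices
    deps=["nsubj", "ROOT", "prep", "pobj", "punct"],
    entities=["O", "O", "O", "U-GPE", "O"],
    sent_starts=[1, 0, 0, 0, 0],
)
```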
- self.orig_annot = [] + self.links = {} if links is None else dict(links) # avoid allocating memory if the doc does not contain any tokens if self.length > 0: - if words is None: + if not words: words = [token.text for token in doc] - if tags is None: + if not tags: tags = [None for _ in words] - if heads is None: + if not pos: + pos = [None for _ in words] + if not morphs: + morphs = [None for _ in words] + if not lemmas: + lemmas = [None for _ in words] + if not heads: heads = [None for _ in words] - if deps is None: + if not deps: deps = [None for _ in words] - if morphology is None: - morphology = [None for _ in words] + if not sent_starts: + sent_starts = [None for _ in words] if entities is None: entities = ["-" for _ in words] elif len(entities) == 0: @@ -719,7 +1017,7 @@ cdef class GoldParse: # Translate the None values to '-', to make processing easier. # See Issue #2603 entities = [(ent if ent is not None else "-") for ent in entities] - if not isinstance(entities[0], basestring): + if not isinstance(entities[0], str): # Assume we have entities specified by character offset. entities = biluo_tags_from_offsets(doc, entities) @@ -733,10 +1031,13 @@ cdef class GoldParse: self.words = [None] * len(doc) self.tags = [None] * len(doc) + self.pos = [None] * len(doc) + self.morphs = [None] * len(doc) + self.lemmas = [None] * len(doc) self.heads = [None] * len(doc) self.labels = [None] * len(doc) self.ner = [None] * len(doc) - self.morphology = [None] * len(doc) + self.sent_starts = [None] * len(doc) # This needs to be done before we align the words if make_projective and heads is not None and deps is not None: @@ -754,22 +1055,30 @@ cdef class GoldParse: self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] - annot_tuples = (range(len(words)), words, tags, heads, deps, entities) - self.orig_annot = list(zip(*annot_tuples)) + self.orig = TokenAnnotation(ids=list(range(len(words))), + words=words, tags=tags, pos=pos, morphs=morphs, + lemmas=lemmas, heads=heads, deps=deps, entities=entities, + sent_starts=sent_starts, brackets=[]) for i, gold_i in enumerate(self.cand_to_gold): if doc[i].text.isspace(): self.words[i] = doc[i].text self.tags[i] = "_SP" + self.pos[i] = "SPACE" + self.morphs[i] = None + self.lemmas[i] = None self.heads[i] = None self.labels[i] = None self.ner[i] = None - self.morphology[i] = set() + self.sent_starts[i] = 0 if gold_i is None: if i in i2j_multi: self.words[i] = words[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]] - self.morphology[i] = morphology[i2j_multi[i]] + self.pos[i] = pos[i2j_multi[i]] + self.morphs[i] = morphs[i2j_multi[i]] + self.lemmas[i] = lemmas[i2j_multi[i]] + self.sent_starts[i] = sent_starts[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) is_first = i2j_multi[i] != i2j_multi.get(i-1) # Set next word in multi-token span as head, until last @@ -808,7 +1117,10 @@ cdef class GoldParse: else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] - self.morphology[i] = morphology[gold_i] + self.pos[i] = pos[gold_i] + self.morphs[i] = morphs[gold_i] + self.lemmas[i] = lemmas[gold_i] + self.sent_starts[i] = sent_starts[gold_i] if heads[gold_i] is None: self.heads[i] = None else: @@ -828,7 +1140,7 @@ cdef class GoldParse: cycle = nonproj.contains_cycle(self.heads) if cycle is not None: raise ValueError(Errors.E069.format(cycle=cycle, - cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), + cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in 
cycle]), doc_tokens=" ".join(words[:50]))) def __len__(self): @@ -845,21 +1157,6 @@ cdef class GoldParse: """ return not nonproj.is_nonproj_tree(self.heads) - property sent_starts: - def __get__(self): - return [self.c.sent_start[i] for i in range(self.length)] - - def __set__(self, sent_starts): - for gold_i, is_sent_start in enumerate(sent_starts): - i = self.gold_to_cand[gold_i] - if i is not None: - if is_sent_start in (1, True): - self.c.sent_start[i] = 1 - elif is_sent_start in (-1, False): - self.c.sent_start[i] = -1 - else: - self.c.sent_start[i] = 0 - def docs_to_json(docs, id=0, ner_missing_tag="O"): """Convert a list of Doc objects into the JSON-serializable format used by @@ -886,6 +1183,9 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"): json_token = {"id": token.i, "orth": token.text} if doc.is_tagged: json_token["tag"] = token.tag_ + json_token["pos"] = token.pos_ + json_token["morph"] = token.morph_ + json_token["lemma"] = token.lemma_ if doc.is_parsed: json_token["head"] = token.head.i-token.i json_token["dep"] = token.dep_ @@ -943,12 +1243,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): # Only interested if the tokenization is correct if start_token is not None and end_token is not None: if start_token == end_token: - biluo[start_token] = "U-%s" % label + biluo[start_token] = f"U-{label}" else: - biluo[start_token] = "B-%s" % label + biluo[start_token] = f"B-{label}" for i in range(start_token+1, end_token): - biluo[i] = "I-%s" % label - biluo[end_token] = "L-%s" % label + biluo[i] = f"I-{label}" + biluo[end_token] = f"L-{label}" # Now distinguish the O cases from ones where we miss the tokenization entity_chars = set() for start_char, end_char, label in entities: diff --git a/spacy/kb.pxd b/spacy/kb.pxd index d5aa382b1..518ce0f4e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t from libc.stdio cimport FILE -from spacy.vocab cimport Vocab +from .vocab cimport Vocab from .typedefs cimport hash_t from .structs cimport KBEntryC, AliasC @@ -113,7 +113,7 @@ cdef class KnowledgeBase: return new_index cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil: - """ + """ Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. 
https://github.com/explosion/preshed/issues/17 @@ -169,4 +169,3 @@ cdef class Reader: cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 cdef int _read(self, void* value, size_t size) except -1 - diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 63eb41b42..64fbb1e29 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,22 +1,17 @@ # cython: infer_types=True # cython: profile=True -# coding: utf8 -from spacy.errors import Errors, Warnings, user_warning - from pathlib import Path from cymem.cymem cimport Pool from preshed.maps cimport PreshMap - from cpython.exc cimport PyErr_SetFromErrno - from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t - -from .typedefs cimport hash_t - from os import path from libcpp.vector cimport vector +from .typedefs cimport hash_t +from .errors import Errors, Warnings, user_warning + cdef class Candidate: """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved @@ -447,7 +442,7 @@ cdef class KnowledgeBase: cdef class Writer: def __init__(self, object loc): if path.exists(loc): - assert not path.isdir(loc), "%s is directory." % loc + assert not path.isdir(loc), f"{loc} is directory" if isinstance(loc, Path): loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc @@ -584,5 +579,3 @@ cdef class Reader: cdef int _read(self, void* value, size_t size) except -1: status = fread(value, size, 1, self._fp) return status - - diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py index 90ea324f0..0da123419 100644 --- a/spacy/lang/af/__init__.py +++ b/spacy/lang/af/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py index 2b3bcc019..4b5a04a5e 100644 --- a/spacy/lang/af/stop_words.py +++ b/spacy/lang/af/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-af STOP_WORDS = set( diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py index c120703f6..6a1a8af3a 100644 --- a/spacy/lang/ar/__init__.py +++ b/spacy/lang/ar/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/ar/examples.py b/spacy/lang/ar/examples.py index 2a10f4fcc..a51bb9ded 100644 --- a/spacy/lang/ar/examples.py +++ b/spacy/lang/ar/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
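The recurring deletions of `# coding: utf8` and `from __future__ import unicode_literals` across the `lang` modules are safe because both are defaults on Python 3: source files are read as UTF-8, and string literals are unicode. A quick illustration:

```python
# On Python 3 these hold without any header or __future__ import:
s = "søster"
assert isinstance(s, str)                    # literals are unicode str
assert "ø".encode("utf-8") == b"\xc3\xb8"    # UTF-8 is the source default
```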
diff --git a/spacy/lang/ar/lex_attrs.py b/spacy/lang/ar/lex_attrs.py index 19e7aef8a..54ad7a8c3 100644 --- a/spacy/lang/ar/lex_attrs.py +++ b/spacy/lang/ar/lex_attrs.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from ...attrs import LIKE_NUM _num_words = set( diff --git a/spacy/lang/ar/punctuation.py b/spacy/lang/ar/punctuation.py index 6625c5475..f30204c02 100644 --- a/spacy/lang/ar/punctuation.py +++ b/spacy/lang/ar/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER diff --git a/spacy/lang/ar/stop_words.py b/spacy/lang/ar/stop_words.py index de2fc7443..f4da54dda 100644 --- a/spacy/lang/ar/stop_words.py +++ b/spacy/lang/ar/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ من diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py index 030daecd5..a11f3b43a 100644 --- a/spacy/lang/ar/tokenizer_exceptions.py +++ b/spacy/lang/ar/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py index 9b4c647e3..437feb9ed 100644 --- a/spacy/lang/bg/__init__.py +++ b/spacy/lang/bg/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/bg/examples.py b/spacy/lang/bg/examples.py index b08b8926d..a6d40da1a 100644 --- a/spacy/lang/bg/examples.py +++ b/spacy/lang/bg/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py index e7c65cbc2..aae7692a2 100644 --- a/spacy/lang/bg/stop_words.py +++ b/spacy/lang/bg/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/Alir3z4/stop-words STOP_WORDS = set( diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index e70232552..901676554 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .tag_map import TAG_MAP diff --git a/spacy/lang/bn/examples.py b/spacy/lang/bn/examples.py index 2d5bdb238..c3be4c556 100644 --- a/spacy/lang/bn/examples.py +++ b/spacy/lang/bn/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
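For context, the `lang/*/__init__.py` files touched here all follow the same pattern; a condensed sketch of that structure (Bulgarian shown, mirroring the imports visible in the hunks above rather than quoting the full file):

```python
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG


class BulgarianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "bg"
    stop_words = STOP_WORDS


class Bulgarian(Language):
    lang = "bg"
    Defaults = BulgarianDefaults


__all__ = ["Bulgarian"]
```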
diff --git a/spacy/lang/bn/morph_rules.py b/spacy/lang/bn/morph_rules.py index 21a76c7e6..44d6108e9 100644 --- a/spacy/lang/bn/morph_rules.py +++ b/spacy/lang/bn/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py index f624b4ba4..becfe8d2a 100644 --- a/spacy/lang/bn/punctuation.py +++ b/spacy/lang/bn/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py index 6c9967df8..bf38e3254 100644 --- a/spacy/lang/bn/stop_words.py +++ b/spacy/lang/bn/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে diff --git a/spacy/lang/bn/tag_map.py b/spacy/lang/bn/tag_map.py index 1efb35858..bc4c5ef6b 100644 --- a/spacy/lang/bn/tag_map.py +++ b/spacy/lang/bn/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM @@ -14,8 +11,8 @@ TAG_MAP = { '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, ":": {POS: PUNCT}, - "৳": {POS: SYM, "Other": {"SymType": "currency"}}, - "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "৳": {POS: SYM, "SymType": "currency"}, + "#": {POS: SYM, "SymType": "numbersign"}, "AFX": {POS: ADJ, "Hyph": "yes"}, "CC": {POS: CONJ, "ConjType": "coor"}, "CD": {POS: NUM, "NumType": "card"}, diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py index 32acb1730..18e313a25 100644 --- a/spacy/lang/bn/tokenizer_exceptions.py +++ b/spacy/lang/bn/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding=utf-8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 6d4c00a6b..a1ff2f2df 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/ca/examples.py b/spacy/lang/ca/examples.py index 3020ee707..ae6aa3e24 100644 --- a/spacy/lang/ca/examples.py +++ b/spacy/lang/ca/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
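One substantive change hides among the header deletions: in the Bengali tag map, features such as `SymType` move out of the `"Other"` wrapper and onto the tag entry itself. A before/after sketch of a single entry:

```python
from spacy.symbols import POS, SYM

# before: feature nested under the "Other" wrapper
old_entry = {"#": {POS: SYM, "Other": {"SymType": "numbersign"}}}
# after: feature set directly on the tag map entry
new_entry = {"#": {POS: SYM, "SymType": "numbersign"}}
```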
diff --git a/spacy/lang/ca/lex_attrs.py b/spacy/lang/ca/lex_attrs.py index 6314efa92..be8b7a6ea 100644 --- a/spacy/lang/ca/lex_attrs.py +++ b/spacy/lang/ca/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py index 4439376c8..d50b75589 100644 --- a/spacy/lang/ca/punctuation.py +++ b/spacy/lang/ca/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_INFIXES from ..char_classes import ALPHA diff --git a/spacy/lang/ca/stop_words.py b/spacy/lang/ca/stop_words.py index a803db2a5..1a87b2f9d 100644 --- a/spacy/lang/ca/stop_words.py +++ b/spacy/lang/ca/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò diff --git a/spacy/lang/ca/tag_map.py b/spacy/lang/ca/tag_map.py deleted file mode 100644 index 472e772ef..000000000 --- a/spacy/lang/ca/tag_map.py +++ /dev/null @@ -1,28 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ -from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ - - -TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "CCONJ": {POS: CCONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB}, - "PART": {POS: PART}, - "SP": {POS: SPACE}, -} diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index d95e5e626..b4ae61a2d 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA @@ -33,9 +30,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] for h in range(1, 12 + 1): for period in ["a.m.", "am"]: - _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] for period in ["p.m.", "pm"]: - _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index bd0f7e437..b8094319f 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - split_chars = lambda char: list(char.strip().split(" ")) merge_chars = lambda char: char.strip().replace(" ", "|") group_chars = lambda char: char.strip().replace(" ", "") diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py index 5b1397ba2..a27e3339d 100644 --- a/spacy/lang/cs/__init__.py +++ b/spacy/lang/cs/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py index 59d3c102e..70aab030b 100644 --- a/spacy/lang/cs/stop_words.py +++ b/spacy/lang/cs/stop_words.py @@ 
-1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/Alir3z4/stop-words STOP_WORDS = set( diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index ac8c04954..6d1e33986 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -1,13 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .morph_rules import MORPH_RULES -from ..tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -27,7 +23,6 @@ class DanishDefaults(Language.Defaults): morph_rules = MORPH_RULES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py index b535191a1..80b2b925b 100644 --- a/spacy/lang/da/examples.py +++ b/spacy/lang/da/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/da/lex_attrs.py b/spacy/lang/da/lex_attrs.py index 9fefc1eba..403af686c 100644 --- a/spacy/lang/da/lex_attrs.py +++ b/spacy/lang/da/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/da/morph_rules.py b/spacy/lang/da/morph_rules.py index 7ffe2ac6f..06704f482 100644 --- a/spacy/lang/da/morph_rules.py +++ b/spacy/lang/da/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA # Source: Danish Universal Dependencies and http://fjern-uv.dk/pronom.php diff --git a/spacy/lang/da/norm_exceptions.py b/spacy/lang/da/norm_exceptions.py index dbffdb88b..c689500f4 100644 --- a/spacy/lang/da/norm_exceptions.py +++ b/spacy/lang/da/norm_exceptions.py @@ -1,10 +1,7 @@ -# coding: utf8 """ Special-case rules for normalizing tokens to improve the model's predictions. For example 'mysterium' vs 'mysterie' and similar. """ -from __future__ import unicode_literals - # Sources: # 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/ diff --git a/spacy/lang/da/punctuation.py b/spacy/lang/da/punctuation.py index b6b852c55..e050ab7aa 100644 --- a/spacy/lang/da/punctuation.py +++ b/spacy/lang/da/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/da/stop_words.py b/spacy/lang/da/stop_words.py index 48de0c7ca..05b2084dd 100644 --- a/spacy/lang/da/stop_words.py +++ b/spacy/lang/da/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Source: Handpicked by Jens Dahl Møllerhøj. STOP_WORDS = set( diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index d669fb981..c8ea9cbf5 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -1,11 +1,7 @@ -# encoding: utf8 """ Tokenizer Exceptions. Source: https://forkortelse.dk/ and various others. 
""" - -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT @@ -563,7 +559,7 @@ for exc_data in [ # Dates for h in range(1, 31 + 1): for period in ["."]: - _exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}."}] _custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]} _exc.update(_custom_base_exc) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1412f033a..8478b6f23 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py index 0c64a693a..735d1c316 100644 --- a/spacy/lang/de/examples.py +++ b/spacy/lang/de/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/de/norm_exceptions.py b/spacy/lang/de/norm_exceptions.py index 3dbd4c7e3..6ad5b62a7 100644 --- a/spacy/lang/de/norm_exceptions.py +++ b/spacy/lang/de/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Here we only want to include the absolute most common words. Otherwise, # this list would get impossibly long for German – especially considering the # old vs. new spelling rules, and all possible cases. diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index 7dfa61bd4..72f7e1022 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py index cf3204d5e..f52687eb9 100644 --- a/spacy/lang/de/stop_words.py +++ b/spacy/lang/de/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ á a ab aber ach acht achte achten achter achtes ag alle allein allem allen @@ -22,14 +18,14 @@ dort drei drin dritte dritten dritter drittes du durch durchaus dürfen dürft durfte durften eben ebenso ehrlich eigen eigene eigenen eigener eigenes ein einander eine -einem einen einer eines einigeeinigen einiger einiges einmal einmaleins elf en +einem einen einer eines einige einigen einiger einiges einmal einmaleins elf en ende endlich entweder er erst erste ersten erster erstes es etwa etwas euch früher fünf fünfte fünften fünfter fünftes für gab ganz ganze ganzen ganzer ganzes gar gedurft gegen gegenüber gehabt gehen geht gekannt gekonnt gemacht gemocht gemusst genug gerade gern gesagt geschweige -gewesen gewollt geworden gibt ging gleich gott gross groß grosse große grossen +gewesen gewollt geworden gibt ging gleich gross groß grosse große grossen großen grosser großer grosses großes gut gute guter gutes habe haben habt hast hat hatte hätte hatten hätten heisst heißt her heute hier @@ -47,9 +43,8 @@ kleines kommen kommt können könnt konnte könnte konnten kurz lang lange leicht leider lieber los machen macht machte mag magst man manche manchem manchen mancher manches mehr -mein meine meinem meinen meiner meines mensch menschen mich mir mit mittel -mochte möchte mochten mögen möglich mögt morgen muss muß müssen 
musst müsst -musste mussten +mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten +mögen möglich mögt morgen muss muß müssen musst müsst musste mussten na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter neuntes nicht nichts nie niemand niemandem niemanden noch nun nur diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index 89d784a0c..410d2f0b4 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py index c169501a9..ca7ec61f1 100644 --- a/spacy/lang/de/tag_map.py +++ b/spacy/lang/de/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, VERB diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 5b09a0b89..3dd8507bc 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 6d551cc4e..95920a68f 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ..tag_map import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/el/examples.py b/spacy/lang/el/examples.py index 521e7b30d..62515c07a 100644 --- a/spacy/lang/el/examples.py +++ b/spacy/lang/el/examples.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.el.examples import sentences diff --git a/spacy/lang/el/get_pos_from_wiktionary.py b/spacy/lang/el/get_pos_from_wiktionary.py index f41833974..369973cc0 100644 --- a/spacy/lang/el/get_pos_from_wiktionary.py +++ b/spacy/lang/el/get_pos_from_wiktionary.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - def get_pos_from_wiktionary(): import re from gensim.corpora.wikicorpus import extract_pages diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py index 6f5b3999b..cf3a7fe97 100644 --- a/spacy/lang/el/lemmatizer.py +++ b/spacy/lang/el/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer diff --git a/spacy/lang/el/lex_attrs.py b/spacy/lang/el/lex_attrs.py index cf32fe12c..5c8f96848 100644 --- a/spacy/lang/el/lex_attrs.py +++ b/spacy/lang/el/lex_attrs.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/el/norm_exceptions.py b/spacy/lang/el/norm_exceptions.py index d4384ff3c..aa774c19b 100644 --- a/spacy/lang/el/norm_exceptions.py +++ b/spacy/lang/el/norm_exceptions.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # These exceptions are used to add NORM values based on a token's ORTH value. # Norms are only set if no alternative is provided in the tokenizer exceptions. 
diff --git a/spacy/lang/el/punctuation.py b/spacy/lang/el/punctuation.py index fbf773f4d..2d5690407 100644 --- a/spacy/lang/el/punctuation.py +++ b/spacy/lang/el/punctuation.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS from ..char_classes import CONCAT_QUOTES, CURRENCY diff --git a/spacy/lang/el/stop_words.py b/spacy/lang/el/stop_words.py index f13c47ec2..7c436219f 100644 --- a/spacy/lang/el/stop_words.py +++ b/spacy/lang/el/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Stop words # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0 STOP_WORDS = set( diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 5dfd44f07..988a36c80 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/el/tag_map_fine.py b/spacy/lang/el/tag_map_fine.py index b346299bc..f37f84c57 100644 --- a/spacy/lang/el/tag_map_fine.py +++ b/spacy/lang/el/tag_map_fine.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, PRON, AUX @@ -659,7 +656,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Plur", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFePlGe": { POS: DET, @@ -667,7 +664,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Plur", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFePlNm": { POS: DET, @@ -675,7 +672,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Plur", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFeSgAc": { POS: DET, @@ -683,7 +680,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFeSgDa": { POS: DET, @@ -691,7 +688,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Dat", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFeSgGe": { POS: DET, @@ -699,7 +696,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFeSgNm": { POS: DET, @@ -707,7 +704,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaPlAc": { POS: DET, @@ -715,7 +712,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Plur", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaPlGe": { POS: DET, @@ -723,7 +720,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Plur", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaPlNm": { POS: DET, @@ -731,7 +728,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Plur", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaSgAc": { POS: DET, @@ -739,7 +736,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaSgDa": { POS: DET, @@ -747,7 +744,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Dat", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaSgGe": { POS: DET, @@ -755,7 
+752,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaSgNm": { POS: DET, @@ -763,7 +760,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNePlAc": { POS: DET, @@ -771,7 +768,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Plur", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNePlDa": { POS: DET, @@ -779,7 +776,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Plur", "Case": "Dat", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNePlGe": { POS: DET, @@ -787,7 +784,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Plur", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNePlNm": { POS: DET, @@ -795,7 +792,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Plur", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNeSgAc": { POS: DET, @@ -803,7 +800,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNeSgDa": { POS: DET, @@ -811,7 +808,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Dat", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNeSgGe": { POS: DET, @@ -819,7 +816,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNeSgNm": { POS: DET, @@ -827,7 +824,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtIdFeSgAc": { POS: DET, @@ -835,7 +832,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdFeSgDa": { POS: DET, @@ -843,7 +840,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Dat", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdFeSgGe": { POS: DET, @@ -851,7 +848,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdFeSgNm": { POS: DET, @@ -859,7 +856,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdMaSgAc": { POS: DET, @@ -867,7 +864,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdMaSgGe": { POS: DET, @@ -875,7 +872,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdMaSgNm": { POS: DET, @@ -883,7 +880,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdNeSgAc": { POS: DET, @@ -891,7 +888,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdNeSgGe": { POS: DET, @@ -899,7 +896,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdNeSgNm": { POS: DET, @@ -907,7 +904,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "CjCo": {POS: CCONJ}, "CjSb": {POS: SCONJ}, diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py index a3c36542e..112fd991b 100644 --- a/spacy/lang/el/tokenizer_exceptions.py +++ b/spacy/lang/el/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from 
__future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM @@ -134,14 +131,14 @@ _exc.update(_other_exc) for h in range(1, 12 + 1): for period in ["π.μ.", "πμ"]: - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, + _exc[f"{h}{period}"] = [ + {ORTH: f"{h}"}, {ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."}, ] for period in ["μ.μ.", "μμ"]: - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, + _exc[f"{h}{period}"] = [ + {ORTH: f"{h}"}, {ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."}, ] diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index fca4e01e7..fa01e2b60 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .tag_map import TAG_MAP diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py index 946289c7c..2cca9e05f 100644 --- a/spacy/lang/en/examples.py +++ b/spacy/lang/en/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/en/lex_attrs.py b/spacy/lang/en/lex_attrs.py index f92d41139..96fb4c9fa 100644 --- a/spacy/lang/en/lex_attrs.py +++ b/spacy/lang/en/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index 5ed4eac59..aa3e6ce57 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA # Several entries here look pretty suspicious. 
These will get the POS SCONJ diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py index a2cf58b8a..4125cd37b 100644 --- a/spacy/lang/en/norm_exceptions.py +++ b/spacy/lang/en/norm_exceptions.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - _exc = { # Slang and abbreviations "cos": "because", diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index 3505b13bf..1ca5cbc16 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Stop words STOP_WORDS = set( """ diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index ed665ef29..86695cf6f 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index ecb3103cc..2078798f7 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index c45197771..3e8075ec4 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA @@ -331,13 +328,13 @@ for exc_data in [ for h in range(1, 12 + 1): for period in ["a.m.", "am"]: - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, + _exc[f"{h}{period}"] = [ + {ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}, ] for period in ["p.m.", "pm"]: - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, + _exc[f"{h}{period}"] = [ + {ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}, ] diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 80cc1727c..060bd8fc6 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py index 0e31b56af..a1db41a16 100644 --- a/spacy/lang/es/examples.py +++ b/spacy/lang/es/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py index 03ada1f43..d2a3c891a 100644 --- a/spacy/lang/es/lex_attrs.py +++ b/spacy/lang/es/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py index 20e929b48..004df4fca 100644 --- a/spacy/lang/es/stop_words.py +++ b/spacy/lang/es/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 6a78d86f7..e998cd1d6 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON, VERB, AUX diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py index 7a7c9d549..1748162c0 100644 --- a/spacy/lang/es/tag_map.py +++ b/spacy/lang/es/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 9109d658b..5c7fcb15d 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA @@ -31,9 +28,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] for h in range(1, 12 + 1): for period in ["a.m.", "am"]: - _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] for period in ["p.m.", "pm"]: - _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] for orth in [ diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py index d84c081ef..e0b0a8a87 100644 --- a/spacy/lang/et/__init__.py +++ b/spacy/lang/et/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py index 15070db5f..e1da1f14d 100644 --- a/spacy/lang/et/stop_words.py +++ b/spacy/lang/et/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-et STOP_WORDS = set( diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 9d85f814a..aa02855e9 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...language import Language from ...attrs import LANG, NORM from ...util import update_exc, add_lookups diff --git a/spacy/lang/fa/examples.py b/spacy/lang/fa/examples.py index 3f65a366d..9c6fb0345 100644 --- a/spacy/lang/fa/examples.py +++ b/spacy/lang/fa/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py index 5d0ff944d..62094c6de 100644 --- a/spacy/lang/fa/generate_verbs_exc.py +++ b/spacy/lang/fa/generate_verbs_exc.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - verb_roots = """ #هست آخت#آهنج diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py index dbea66b68..99b8e2787 100644 --- a/spacy/lang/fa/lex_attrs.py +++ b/spacy/lang/fa/lex_attrs.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from ...attrs import LIKE_NUM diff --git a/spacy/lang/fa/punctuation.py b/spacy/lang/fa/punctuation.py index 33aa46ae2..4b258c13d 100644 --- a/spacy/lang/fa/punctuation.py +++ b/spacy/lang/fa/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER diff --git a/spacy/lang/fa/stop_words.py b/spacy/lang/fa/stop_words.py index 682fb7a71..f462f2e7a 100644 --- a/spacy/lang/fa/stop_words.py +++ b/spacy/lang/fa/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Stop words from HAZM package STOP_WORDS = set( """ diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index ed665ef29..86695cf6f 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/fa/tag_map.py b/spacy/lang/fa/tag_map.py index b9043adf0..f1f106915 100644 --- a/spacy/lang/fa/tag_map.py +++ b/spacy/lang/fa/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import PRON, NOUN, PART, INTJ, AUX diff --git a/spacy/lang/fa/tokenizer_exceptions.py b/spacy/lang/fa/tokenizer_exceptions.py index b3f8dcbf5..db9e3f6fc 100644 --- a/spacy/lang/fa/tokenizer_exceptions.py +++ b/spacy/lang/fa/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, TAG, NORM diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 45d2f886f..db58ad3ba 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/fi/examples.py b/spacy/lang/fi/examples.py index 88be248a6..930fac273 100644 --- a/spacy/lang/fi/examples.py +++ b/spacy/lang/fi/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
>>> from spacy.lang.fi.examples import sentences diff --git a/spacy/lang/fi/lex_attrs.py b/spacy/lang/fi/lex_attrs.py index e960b55eb..4d500cead 100644 --- a/spacy/lang/fi/lex_attrs.py +++ b/spacy/lang/fi/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py index a85c0b228..6e14dde38 100644 --- a/spacy/lang/fi/punctuation.py +++ b/spacy/lang/fi/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/fi/stop_words.py b/spacy/lang/fi/stop_words.py index e8e39ec6f..8e8dcfa56 100644 --- a/spacy/lang/fi/stop_words.py +++ b/spacy/lang/fi/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt # Reformatted with some minor corrections STOP_WORDS = set( diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index 5469e345e..b166bf420 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index f56c8688a..dc45e538c 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .tag_map import TAG_MAP diff --git a/spacy/lang/fr/_tokenizer_exceptions_list.py b/spacy/lang/fr/_tokenizer_exceptions_list.py index c9fcfff2d..7f908dac8 100644 --- a/spacy/lang/fr/_tokenizer_exceptions_list.py +++ b/spacy/lang/fr/_tokenizer_exceptions_list.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - FR_BASE_EXCEPTIONS = [ "(+)-amphétamine", "(5R,6S)-7,8-didehydro-4,5-époxy-3-méthoxy-N-méthylmorphinan-6-ol", diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py index a874c22fc..a74a62204 100644 --- a/spacy/lang/fr/examples.py +++ b/spacy/lang/fr/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index 79f4dd28d..fe128df1f 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -1,10 +1,6 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP from ...symbols import SCONJ, CCONJ -from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos class FrenchLemmatizer(Lemmatizer): @@ -85,13 +81,13 @@ class FrenchLemmatizer(Lemmatizer): return True elif univ_pos == "adj" and morphology.get("Degree") == "pos": return True - elif VerbForm_inf in morphology: + elif "VerbForm=inf" in morphology: return True - elif VerbForm_none in morphology: + elif "VerbForm=none" in morphology: return True - elif Number_sing in morphology: + elif "Number=sing" in morphology: return True - elif Degree_pos in morphology: + elif "Degree=pos" in morphology: return True else: return False diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py index e3ccd9fdd..da98c6e37 100644 --- a/spacy/lang/fr/lex_attrs.py +++ b/spacy/lang/fr/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index 1422b4194..5f42e7f25 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_INFIXES from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index ae8432043..a331f3c0f 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 4712d34d9..96636b0b7 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/fr/tag_map.py b/spacy/lang/fr/tag_map.py index 93b43c2ec..2b1b20c52 100644 --- a/spacy/lang/fr/tag_map.py +++ b/spacy/lang/fr/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 4b3b2c908..4e2e7fb18 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import re from .punctuation import ELISION, HYPHENS @@ -70,7 +67,7 @@ for verb, verb_lemma in [ ]: for orth in [verb, verb.title()]: for pronoun in ["elle", "il", "on"]: - token = "{}-t-{}".format(orth, pronoun) + token = f"{orth}-t-{pronoun}" _exc[token] = [ {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, {LEMMA: "t", ORTH: "-t"}, @@ -79,7 +76,7 @@ for verb, verb_lemma in [ for verb, verb_lemma in [("est", "être")]: for orth in [verb, verb.title()]: - token = "{}-ce".format(orth) + 
token = f"{orth}-ce" _exc[token] = [ {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, {LEMMA: "ce", ORTH: "-ce"}, @@ -88,7 +85,7 @@ for verb, verb_lemma in [("est", "être")]: for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]: for orth in [pre, pre.title()]: - _exc["%sest-ce" % orth] = [ + _exc[f"{orth}est-ce"] = [ {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"}, {LEMMA: "être", ORTH: "est", TAG: "VERB"}, {LEMMA: "ce", ORTH: "-ce"}, diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 42b4d0d18..4c3d219c7 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -1,8 +1,6 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS +from .tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language @@ -16,6 +14,7 @@ class IrishDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) + tag_map = TAG_MAP class Irish(Language): diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py index 2133f0d22..d606da975 100644 --- a/spacy/lang/ga/irish_morphology_helpers.py +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # fmt: off consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"] broad_vowels = ["a", "á", "o", "ó", "u", "ú"] diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py index d8f705b59..4ef052ca5 100644 --- a/spacy/lang/ga/stop_words.py +++ b/spacy/lang/ga/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ a ach ag agus an aon ar arna as diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py index 1d8284014..efcaf5d1f 100644 --- a/spacy/lang/ga/tag_map.py +++ b/spacy/lang/ga/tag_map.py @@ -1,29 +1,26 @@ -# coding: utf8 -from __future__ import unicode_literals - # fmt: off TAG_MAP = { - "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "len"}, "ADJ__Case=Gen|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "fem", "Number": "sing"}, "ADJ__Case=Gen|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing"}, - "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "strong"}}, - "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "weak"}}, - "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "NounType": "strong"}, + "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "NounType": "weak"}, + "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "len"}, + 
"ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "len"}, "ADJ__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, "ADJ__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, "ADJ__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, "ADJ__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, - "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "notslender"}}, - "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "slender"}}, - "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Other": {"Form": "len"}}, + "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "NounType": "notslender"}, + "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "NounType": "slender"}, + "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Form": "len"}, "ADJ__Degree=Cmp,Sup": {"pos": "ADJ", "Degree": "cmp|sup"}, - "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "ecl"}}, - "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "hpref"}}, - "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "len"}}, + "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Form": "ecl"}, + "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Form": "hpref"}, + "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Form": "len"}, "ADJ__Degree=Pos": {"pos": "ADJ", "Degree": "pos"}, "ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"}, - "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}}, + "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Form": "len"}, "ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, "ADJ__Gender=Masc|Number=Sing|Case=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, "ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, @@ -32,9 +29,9 @@ TAG_MAP = { "ADJ___": {"pos": "ADJ"}, "ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"}, "ADP__Foreign=Yes": {"pos": "ADP", "Foreign": "yes"}, - "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Other": {"Form": "len"}}, - "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, - "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, "Other": {"Form": "len"}}, + "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Form": "len"}, + "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Form": "len"}, + "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, "Form": "len"}, "ADP__Gender=Fem|Number=Sing|Person=3": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3}, "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, 
"ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"}, @@ -60,41 +57,41 @@ TAG_MAP = { "ADP__Person=3|Poss=Yes": {"pos": "ADP", "Person": 3, "Poss": "yes"}, "ADP___": {"pos": "ADP"}, "ADP__Poss=Yes": {"pos": "ADP", "Poss": "yes"}, - "ADP__PrepForm=Cmpd": {"pos": "ADP", "Other": {"PrepForm": "cmpd"}}, + "ADP__PrepForm=Cmpd": {"pos": "ADP", "PrepForm": "cmpd"}, "ADP__PronType=Art": {"pos": "ADP", "PronType": "art"}, - "ADV__Form=Len": {"pos": "ADV", "Other": {"Form": "len"}}, + "ADV__Form=Len": {"pos": "ADV", "Form": "len"}, "ADV___": {"pos": "ADV"}, "ADV__PronType=Int": {"pos": "ADV", "PronType": "int"}, - "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, - "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, - "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, - "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, - "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Form": "vf", "VerbForm": "cop"}}, - "AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"VerbForm": "cop"}}, - "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Other": {"Mood": "int", "VerbForm": "cop"}}, - "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Mood": "int", "VerbForm": "cop"}}, - "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, - "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, - "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"PartType": "comp", "VerbForm": "cop"}}, - "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, - "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Form": "vf", "VerbForm": "cop"}, + "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Form": "vf", "VerbForm": "cop"}, + "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Form": "vf", "VerbForm": "cop"}, + "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Form": "vf", "VerbForm": "cop"}, + "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Form": "vf", "VerbForm": "cop"}, + 
"AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "VerbForm": "cop"}, + "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Mood": "int", "VerbForm": "cop"}, + "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Mood": "int", "VerbForm": "cop"}, + "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Mood": "int", "VerbForm": "cop"}, + "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Mood": "int", "VerbForm": "cop"}, + "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "PartType": "comp", "VerbForm": "cop"}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "VerbForm": "cop"}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "VerbForm": "cop"}, + "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "VerbForm": "cop"}, + "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "VerbForm": "cop"}, "AUX___": {"pos": "AUX"}, - "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "Other": {"VerbForm": "cop"}}, - "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, - "AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"VerbForm": "cop"}}, - "AUX__VerbForm=Cop": {"pos": "AUX", "Other": {"VerbForm": "cop"}}, + "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "VerbForm": "cop"}, + "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "VerbForm": "cop"}, + "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "VerbForm": "cop"}, + "AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "VerbForm": "cop"}, + "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "VerbForm": "cop"}, + "AUX__VerbForm=Cop": {"pos": "AUX", "VerbForm": "cop"}, "CCONJ___": {"pos": "CCONJ"}, "DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, - "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Other": {"Form": "ecl"}}, + "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Form": "ecl"}, "DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, "DET__Definite=Def|Number=Plur|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "plur", "PronType": "art"}, "DET__Definite=Def|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "sing", "PronType": "art"}, "DET__Definite=Def": {"pos": "DET", "Definite": "def"}, - "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Other": {"Form": "hpref"}}, + "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Form": "hpref"}, 
"DET__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, "DET__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"}, "DET__Number=Plur|Person=1|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 1, "Poss": "yes"}, @@ -106,33 +103,33 @@ TAG_MAP = { "DET__PronType=Dem": {"pos": "DET", "PronType": "dem"}, "DET__PronType=Ind": {"pos": "DET", "PronType": "ind"}, "NOUN__Case=Dat|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Definite": "ind", "Gender": "fem", "Number": "sing"}, - "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Form": "len"}, "NOUN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing"}, "NOUN__Case=Dat|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "masc", "Number": "sing"}, - "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "NounType": "strong"}, "NOUN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, - "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, - "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "NounType": "strong"}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "NounType": "weak"}, "NOUN__Case=Gen|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "sing"}, "NOUN__Case=Gen|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "ind", "Gender": "fem", "Number": "sing"}, - "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, - "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, - "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", 
"Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "weak"}}, - "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, - "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "strong"}}, - "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "weak"}}, - "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, - "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Other": {"Form": "len"}}, - "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, - "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Form": "ecl", "NounType": "strong"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "ecl", "NounType": "strong"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "ecl", "NounType": "weak"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "hpref"}, + "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "len"}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "len", "NounType": "strong"}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "len", "NounType": "weak"}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "len"}, + "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Form": "len"}, + "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "NounType": "strong"}, + "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "NounType": "weak"}, "NOUN__Case=Gen|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur"}, 
"NOUN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing"}, - "NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, - "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "NounType": "strong"}, + "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "NounType": "weak"}, "NOUN__Case=Gen|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur"}, "NOUN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing"}, "NOUN__Case=Gen|Number=Sing": {"pos": "NOUN", "Case": "gen", "Number": "sing"}, @@ -143,79 +140,79 @@ TAG_MAP = { "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"}, "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, "NOUN__Case=NomAcc|Definite=Ind|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "ind", "Gender": "masc", "Number": "plur"}, - "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, - "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}}, - "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "emp"}}, - "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "hpref"}}, - "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, - "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "hpref"}}, - "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, - "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "len"}}, - "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, - "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + 
"NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Form": "ecl"}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Form": "ecl"}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "emp"}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Form": "hpref"}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "hpref"}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Form": "hpref"}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "hpref"}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Form": "len"}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "len"}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Form": "len"}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "len"}, "NOUN__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, "NOUN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, "NOUN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, "NOUN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, "NOUN__Case=Voc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Definite": "def", "Gender": "masc", "Number": "plur"}, - "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, - "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Form": "len"}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Form": "len"}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Form": "len"}, "NOUN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing"}, "NOUN__Degree=Pos": {"pos": "NOUN", "Degree": "pos"}, "NOUN__Foreign=Yes": {"pos": "NOUN", "Foreign": "yes"}, - 
"NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Form=Ecl|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "ecl"}}, - "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Other": {"Form": "ecl"}}, - "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "hpref"}}, - "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "len"}}, - "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "len"}}, + "NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Form": "ecl"}, + "NOUN__Form=Ecl|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Form": "ecl"}, + "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Form": "ecl"}, + "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Form": "hpref"}, + "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Form": "len"}, + "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Form": "len"}, "NOUN__Gender=Fem|Number=Sing": {"pos": "NOUN", "Gender": "fem", "Number": "sing"}, - "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "Other": {"PartType": "comp"}}, + "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "PartType": "comp"}, "NOUN__Number=Sing": {"pos": "NOUN", "Number": "sing"}, "NOUN___": {"pos": "NOUN"}, "NOUN__Reflex=Yes": {"pos": "NOUN", "Reflex": "yes"}, "NOUN__VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf"}, "NOUN__VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun"}, "NUM__Definite=Def|NumType=Card": {"pos": "NUM", "Definite": "def", "NumType": "card"}, - "NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "ecl"}}, - "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "ecl"}}, - "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "hpref"}}, - "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "len"}}, - "NUM__Form=Len|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "len"}}, + "NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Form": "ecl"}, + "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Form": "ecl"}, + "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Form": "hpref"}, + "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Form": "len"}, + "NUM__Form=Len|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Form": "len"}, "NUM__NumType=Card": {"pos": "NUM", "NumType": "card"}, "NUM__NumType=Ord": {"pos": "NUM", "NumType": "ord"}, "NUM___": {"pos": "NUM"}, - "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"Form": "ecl", "PartType": "vb"}}, - "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "Other": {"PartType": "vb"}}, - "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "Other": {"PartType": "vb"}}, - "PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"Mood": "int", "PartType": "vb"}}, - "PART__PartType=Ad": {"pos": "PART", "Other": {"PartType": "ad"}}, - "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "cmpl"}}, - "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "cmpl"}}, - "PART__PartType=Cmpl": {"pos": "PART", "Other": {"PartType": 
"cmpl"}}, - "PART__PartType=Comp": {"pos": "PART", "Other": {"PartType": "comp"}}, - "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "cop"}}, - "PART__PartType=Deg": {"pos": "PART", "Other": {"PartType": "deg"}}, + "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Form": "ecl", "PartType": "vb"}, + "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "PartType": "vb"}, + "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "PartType": "vb"}, + "PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Mood": "int", "PartType": "vb"}, + "PART__PartType=Ad": {"pos": "PART", "PartType": "ad"}, + "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "PartType": "cmpl"}, + "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "PartType": "cmpl"}, + "PART__PartType=Cmpl": {"pos": "PART", "PartType": "cmpl"}, + "PART__PartType=Comp": {"pos": "PART", "PartType": "comp"}, + "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "PartType": "cop"}, + "PART__PartType=Deg": {"pos": "PART", "PartType": "deg"}, "PART__PartType=Inf": {"pos": "PART", "PartType": "inf"}, - "PART__PartType=Num": {"pos": "PART", "Other": {"PartType": "num"}}, - "PART__PartType=Pat": {"pos": "PART", "Other": {"PartType": "pat"}}, - "PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb": {"pos": "PART", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "Other": {"PartType": "vb"}}, - "PART__PartType=Voc": {"pos": "PART", "Other": {"PartType": "voc"}}, + "PART__PartType=Num": {"pos": "PART", "PartType": "num"}, + "PART__PartType=Pat": {"pos": "PART", "PartType": "pat"}, + "PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "PartType": "vb"}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "PartType": "vb"}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "PartType": "vb"}, + "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "PartType": "vb"}, + "PART__PartType=Vb": {"pos": "PART", "PartType": "vb"}, + "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "PartType": "vb"}, + "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "PartType": "vb"}, + "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "PartType": "vb"}, + "PART__PartType=Voc": {"pos": "PART", "PartType": "voc"}, "PART___": {"pos": "PART"}, "PART__PronType=Rel": {"pos": "PART", "PronType": "rel"}, - 
"PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, - "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Other": {"Form": "len"}}, + "PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Form": "len"}, + "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Form": "len"}, "PRON__Gender=Fem|Number=Sing|Person=3": {"pos": "PRON", "Gender": "fem", "Number": "sing", "Person": 3}, "PRON__Gender=Masc|Number=Sing|Person=3": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3}, "PRON__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"}, @@ -235,103 +232,103 @@ TAG_MAP = { "PRON__PronType=Ind": {"pos": "PRON", "PronType": "ind"}, "PRON__PronType=Int": {"pos": "PRON", "PronType": "int"}, "PRON__Reflex=Yes": {"pos": "PRON", "Reflex": "yes"}, - "PROPN__Abbr=Yes": {"pos": "PROPN", "Other": {"Abbr": "yes"}}, + "PROPN__Abbr=Yes": {"pos": "PROPN", "Abbr": "yes"}, "PROPN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "dat", "Gender": "fem", "Number": "sing"}, "PROPN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, - "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, - "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}}, - "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, - "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Other": {"Form": "len"}}, - "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, - "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Form": "ecl"}, + "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "ecl"}, + "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "hpref"}, + "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "len"}, + "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Form": "len"}, + "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "len"}, + "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Form": "len"}, "PROPN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing"}, "PROPN__Case=Gen|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem"}, - "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": 
"weak"}}, + "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "NounType": "weak"}, "PROPN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing"}, "PROPN__Case=Gen|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc"}, "PROPN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"}, "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"}, "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, - "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, - "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, - "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, - "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "ecl"}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "ecl"}, + "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "hpref"}, + "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "len"}, + "PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "len"}, "PROPN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, "PROPN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, "PROPN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, "PROPN__Case=NomAcc|Gender=Masc": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc"}, - "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Other": {"Form": "len"}}, + "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Form": "len"}, "PROPN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "voc", "Gender": "masc", "Number": "sing"}, "PROPN__Gender=Masc|Number=Sing": {"pos": "PROPN", "Gender": "masc", "Number": "sing"}, "PROPN___": {"pos": "PROPN"}, "PUNCT___": {"pos": "PUNCT"}, "SCONJ___": {"pos": "SCONJ"}, - "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "Other": {"VerbForm": "cop"}}, - "SYM__Abbr=Yes": {"pos": "SYM", "Other": {"Abbr": "yes"}}, + "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", 
"Tense": "past", "VerbForm": "cop"}, + "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "VerbForm": "cop"}, + "SYM__Abbr=Yes": {"pos": "SYM", "Abbr": "yes"}, "VERB__Case=NomAcc|Gender=Masc|Mood=Ind|Number=Sing|Tense=Pres": {"pos": "VERB", "Case": "nom|acc", "Gender": "masc", "Mood": "ind", "Number": "sing", "Tense": "pres"}, - "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, + "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Dialect": "munster", "Form": "len"}, "VERB__Foreign=Yes": {"pos": "VERB", "Foreign": "yes"}, - "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl", "Voice": "auto"}}, - "VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, - "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl", "Voice": "auto"}}, - "VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl": {"pos": "VERB", "Other": {"Form": "ecl"}}, - "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}}, - 
"VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Other": {"Form": "emp"}}, - "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}}, - "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, - 
"VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, - "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Other": {"Form": "len"}}, - "VERB__Form=Len": {"pos": "VERB", "Other": {"Form": "len"}}, + "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Form": "ecl", "Voice": "auto"}, + "VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "ecl", "Voice": "auto"}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "ecl", "Voice": "auto"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "ecl", "Voice": "auto"}, + 
"VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl": {"pos": "VERB", "Form": "ecl"}, + "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Form": "emp"}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Form": "emp"}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Form": "emp"}, + "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Form": "len"}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Form": "len"}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Form": "len"}, + "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Form": "len"}, + "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Form": "len"}, + "VERB__Form=Len|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Form": "len"}, + 
"VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Form": "len"}, + "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Form": "len"}, + "VERB__Form=Len": {"pos": "VERB", "Form": "len"}, "VERB__Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3}, "VERB__Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1}, "VERB__Mood=Cnd": {"pos": "VERB", "Mood": "cnd"}, - "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Voice": "auto"}}, + "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Voice": "auto"}, "VERB__Mood=Imp|Number=Plur|Person=1|Polarity=Neg": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1, "Polarity": "neg"}, "VERB__Mood=Imp|Number=Plur|Person=1": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1}, "VERB__Mood=Imp|Number=Plur|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 2}, @@ -341,28 +338,28 @@ TAG_MAP = { "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres"}, "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past"}, "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres"}, - "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Voice": "auto"}, "VERB__Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres"}, "VERB__Mood=Ind|PronType=Rel|Tense=Fut": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "fut"}, "VERB__Mood=Ind|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "pres"}, "VERB__Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut"}, - "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Voice": "auto"}, "VERB__Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past"}, - "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Voice": "auto"}, 
"VERB__Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres"}, - "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Voice": "auto"}, "VERB___": {"pos": "VERB"}, - "X__Abbr=Yes": {"pos": "X", "Other": {"Abbr": "yes"}}, + "X__Abbr=Yes": {"pos": "X", "Abbr": "yes"}, "X__Case=NomAcc|Foreign=Yes|Gender=Fem|Number=Sing": {"pos": "X", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Foreign": "yes"}, - "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Other": {"Dialect": "ulster"}}, - "X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, - "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", "Number": "sing", "Person": 2, "Polarity": "neg", "Other": {"Dialect": "munster"}}, - "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Voice": "auto"}}, - "X__Dialect=Munster": {"pos": "X", "Other": {"Dialect": "munster"}}, - "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Other": {"Dialect": "munster"}}, - "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"Dialect": "ulster"}}, - "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Other": {"Dialect": "ulster", "PartType": "vb"}}, - "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}}, + "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Dialect": "ulster"}, + "X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Dialect": "munster", "Form": "len"}, + "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", "Number": "sing", "Person": 2, "Polarity": "neg", "Dialect": "munster"}, + "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Dialect": "munster", "Voice": "auto"}, + "X__Dialect=Munster": {"pos": "X", "Dialect": "munster"}, + "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Dialect": "munster"}, + "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Dialect": "ulster"}, + "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Dialect": "ulster", "PartType": "vb"}, + "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Dialect": "ulster", "VerbForm": "cop"}, "X__Foreign=Yes": {"pos": "X", "Foreign": "yes"}, "X___": {"pos": "X"} } diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index c0e53f522..0c587c67e 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index 411cdf107..0d324f64c 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals 
-
 from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py
index 34cd157ae..d54d2a145 100644
--- a/spacy/lang/he/examples.py
+++ b/spacy/lang/he/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/he/stop_words.py b/spacy/lang/he/stop_words.py
index a01ec4246..2745460a7 100644
--- a/spacy/lang/he/stop_words.py
+++ b/spacy/lang/he/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
 """
 אני
diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py
index b0d45ddf3..9a96de95c 100644
--- a/spacy/lang/hi/__init__.py
+++ b/spacy/lang/hi/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/hi/examples.py b/spacy/lang/hi/examples.py
index 1dd182532..ecb0b328c 100644
--- a/spacy/lang/hi/examples.py
+++ b/spacy/lang/hi/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py
index 12666d96a..20a8c2975 100644
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..norm_exceptions import BASE_NORMS
 from ...attrs import NORM, LIKE_NUM
diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py
index efad18c84..475b07da1 100644
--- a/spacy/lang/hi/stop_words.py
+++ b/spacy/lang/hi/stop_words.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
 STOP_WORDS = set(
diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py
index 539b164d7..fbc66ece0 100644
--- a/spacy/lang/hr/__init__.py
+++ b/spacy/lang/hr/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
diff --git a/spacy/lang/hr/examples.py b/spacy/lang/hr/examples.py
index dc52ce4f0..b28fb63c2 100644
--- a/spacy/lang/hr/examples.py
+++ b/spacy/lang/hr/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py
index 408b802c5..dd10f792d 100644
--- a/spacy/lang/hr/stop_words.py
+++ b/spacy/lang/hr/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
 # Source: https://github.com/stopwords-iso/stopwords-hr
 STOP_WORDS = set(
 """
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index a331adc5b..df3fe4a44 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
diff --git a/spacy/lang/hu/examples.py b/spacy/lang/hu/examples.py
index 3267887fe..711a438bd 100644
--- a/spacy/lang/hu/examples.py
+++ b/spacy/lang/hu/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index bc043486f..1fea6d510 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
 from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
diff --git a/spacy/lang/hu/stop_words.py b/spacy/lang/hu/stop_words.py
index c9a217dd6..e39a26d35 100644
--- a/spacy/lang/hu/stop_words.py
+++ b/spacy/lang/hu/stop_words.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
 """
 a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben
diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py
index c18a2cec2..cc5eede17 100644
--- a/spacy/lang/hu/tokenizer_exceptions.py
+++ b/spacy/lang/hu/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import re
 from ..punctuation import ALPHA_LOWER, CURRENCY
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index ea8e355ac..89f874abe 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
diff --git a/spacy/lang/id/_tokenizer_exceptions_list.py b/spacy/lang/id/_tokenizer_exceptions_list.py
index fec878d5a..a0b35fa1a 100644
--- a/spacy/lang/id/_tokenizer_exceptions_list.py
+++ b/spacy/lang/id/_tokenizer_exceptions_list.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 ID_BASE_EXCEPTIONS = set(
 """
 aba-aba
diff --git a/spacy/lang/id/examples.py b/spacy/lang/id/examples.py
index 56ac9165e..1069232ff 100644
--- a/spacy/lang/id/examples.py
+++ b/spacy/lang/id/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py
index 1d4584ae3..3167f4659 100644
--- a/spacy/lang/id/lex_attrs.py
+++ b/spacy/lang/id/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import unicodedata
 from .punctuation import LIST_CURRENCY
diff --git a/spacy/lang/id/norm_exceptions.py b/spacy/lang/id/norm_exceptions.py
index 09ac6a6d3..63d2081e9 100644
--- a/spacy/lang/id/norm_exceptions.py
+++ b/spacy/lang/id/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 # Daftar kosakata yang sering salah dieja
 # https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
 _exc = {
diff --git a/spacy/lang/id/punctuation.py b/spacy/lang/id/punctuation.py
index e4794d42b..f6c2387d8 100644
--- a/spacy/lang/id/punctuation.py
+++ b/spacy/lang/id/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units
diff --git a/spacy/lang/id/stop_words.py b/spacy/lang/id/stop_words.py
index 0a9f91947..b1bfaea79 100644
--- a/spacy/lang/id/stop_words.py
+++ b/spacy/lang/id/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
 """
 ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index 4712d34d9..96636b0b7 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import NOUN, PROPN, PRON
diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py
index 16391a840..3bd08e96a 100644
--- a/spacy/lang/id/tag_map.py
+++ b/spacy/lang/id/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
 from ...symbols import NOUN, PRON, AUX, SCONJ, INTJ, PART, PROPN
diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py
index 86fe611bf..5259bddf8 100644
--- a/spacy/lang/id/tokenizer_exceptions.py
+++ b/spacy/lang/id/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM
diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py
index 18e41432d..cdcfd6e71 100644
--- a/spacy/lang/is/__init__.py
+++ b/spacy/lang/is/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language
 from ...attrs import LANG
diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py
index e4ae0498b..917fb6df4 100644
--- a/spacy/lang/is/stop_words.py
+++ b/spacy/lang/is/stop_words.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 # Source: https://github.com/Xangis/extra-stopwords
 STOP_WORDS = set(
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index 90763eda5..4b223582b 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py
index af66b7eca..506721276 100644
--- a/spacy/lang/it/examples.py
+++ b/spacy/lang/it/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py
index 4fa931fde..0b8405cc0 100644
--- a/spacy/lang/it/punctuation.py
+++ b/spacy/lang/it/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..punctuation import TOKENIZER_INFIXES
 from ..char_classes import ALPHA
diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py
index 84233d381..e97613912 100644
--- a/spacy/lang/it/stop_words.py
+++ b/spacy/lang/it/stop_words.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
 """
 a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
diff --git a/spacy/lang/it/tag_map.py b/spacy/lang/it/tag_map.py
index 798c45d80..ce0e1d9ee 100644
--- a/spacy/lang/it/tag_map.py
+++ b/spacy/lang/it/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB
 from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ
diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py
index 62f568c5c..f1cfba2c0 100644
--- a/spacy/lang/it/tokenizer_exceptions.py
+++ b/spacy/lang/it/tokenizer_exceptions.py
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
 from ...symbols import ORTH, LEMMA
 _exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 22590043f..d1ce651d7 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals, print_function
-
 import re
 from collections import namedtuple
diff --git a/spacy/lang/ja/examples.py b/spacy/lang/ja/examples.py
index e00001ed5..c3a011862 100644
--- a/spacy/lang/ja/examples.py
+++ b/spacy/lang/ja/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ja/stop_words.py b/spacy/lang/ja/stop_words.py
index bb232a2d2..98560d7e2 100644
--- a/spacy/lang/ja/stop_words.py
+++ b/spacy/lang/ja/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 # This list was created by taking the top 2000 words from a Wikipedia dump and
 # filtering out everything that wasn't hiragana. ー (one) was also added.
 # Considered keeping some non-hiragana words but too many place names were
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
index 4ff0a35ee..d922cd22b 100644
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
 from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE
diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py
index c86354248..ef3b10f81 100644
--- a/spacy/lang/kn/__init__.py
+++ b/spacy/lang/kn/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language
 from ...attrs import LANG
diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py
index 652341e73..dba9740af 100644
--- a/spacy/lang/kn/stop_words.py
+++ b/spacy/lang/kn/stop_words.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
 """
 ಹಲವು
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index ec79a95ab..4ecdfbc58 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals, print_function
-
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from ...attrs import LANG
diff --git a/spacy/lang/ko/examples.py b/spacy/lang/ko/examples.py
index 7885ad801..cc0a66c0a 100644
--- a/spacy/lang/ko/examples.py
+++ b/spacy/lang/ko/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ko/lex_attrs.py b/spacy/lang/ko/lex_attrs.py
index 1904a0ece..ac5bc7e48 100644
--- a/spacy/lang/ko/lex_attrs.py
+++ b/spacy/lang/ko/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py
index 676dca1b4..3eba9fc82 100644
--- a/spacy/lang/ko/stop_words.py
+++ b/spacy/lang/ko/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
 """
 이
diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py
index 57317c969..26a8c56b9 100644
--- a/spacy/lang/ko/tag_map.py
+++ b/spacy/lang/ko/tag_map.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON
 from ...symbols import VERB, ADV, PROPN, NUM, DET
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index 4fcfaddb4..afcf77f33 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
diff --git a/spacy/lang/lb/examples.py b/spacy/lang/lb/examples.py
index 3cbba31d9..a7a10489c 100644
--- a/spacy/lang/lb/examples.py
+++ b/spacy/lang/lb/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py
index e38c74974..d2d50d9dc 100644
--- a/spacy/lang/lb/lex_attrs.py
+++ b/spacy/lang/lb/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
diff --git a/spacy/lang/lb/norm_exceptions.py b/spacy/lang/lb/norm_exceptions.py
index 7063e6863..afc384228 100644
--- a/spacy/lang/lb/norm_exceptions.py
+++ b/spacy/lang/lb/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 # TODO
 # norm execptions: find a possibility to deal with the zillions of spelling
 # variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py
index 1571e13d7..4886b316c 100644
--- a/spacy/lang/lb/punctuation.py
+++ b/spacy/lang/lb/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 ELISION = " ' ’ ".strip().replace(" ", "")
diff --git a/spacy/lang/lb/stop_words.py b/spacy/lang/lb/stop_words.py
index 41e6f79d2..8f22ea6e6 100644
--- a/spacy/lang/lb/stop_words.py
+++ b/spacy/lang/lb/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
 """
 a
diff --git a/spacy/lang/lb/tag_map.py b/spacy/lang/lb/tag_map.py
index 424a83bb4..cd2e8b93c 100644
--- a/spacy/lang/lb/tag_map.py
+++ b/spacy/lang/lb/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB
 from ...symbols import NOUN, PART, SPACE, AUX
diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py
index b32daa58c..ebf624281 100644
--- a/spacy/lang/lb/tokenizer_exceptions.py
+++ b/spacy/lang/lb/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import ORTH, LEMMA, NORM
 # TODO
diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 7c0ed8a04..339290d4a 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import unicodedata
 import re
diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py
index 7919a4858..0f096a5b7 100644
--- a/spacy/lang/lt/__init__.py
+++ b/spacy/lang/lt/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/lt/examples.py b/spacy/lang/lt/examples.py
index 99dbe9d4d..eaf941f1a 100644
--- a/spacy/lang/lt/examples.py
+++ b/spacy/lang/lt/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/lt/lex_attrs.py b/spacy/lang/lt/lex_attrs.py index 81879948f..28894a59b 100644 --- a/spacy/lang/lt/lex_attrs.py +++ b/spacy/lang/lt/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = { diff --git a/spacy/lang/lt/morph_rules.py b/spacy/lang/lt/morph_rules.py index 3bf26d9d8..f7bfd3cc6 100644 --- a/spacy/lang/lt/morph_rules.py +++ b/spacy/lang/lt/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA diff --git a/spacy/lang/lt/stop_words.py b/spacy/lang/lt/stop_words.py index fed05d80d..8c11b3f7b 100644 --- a/spacy/lang/lt/stop_words.py +++ b/spacy/lang/lt/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = { "a", "abejais", diff --git a/spacy/lang/lt/tag_map.py b/spacy/lang/lt/tag_map.py index 6ea4f8ae0..f08db535f 100644 --- a/spacy/lang/lt/tag_map.py +++ b/spacy/lang/lt/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, ADJ, ADP, ADV, CONJ, INTJ, NOUN, NUM, PART from ...symbols import PRON, PROPN, PUNCT, SYM, VERB, X diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py index fcf807278..e4b53e5b7 100644 --- a/spacy/lang/lt/tokenizer_exceptions.py +++ b/spacy/lang/lt/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH _exc = {} diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py index bb8c0763b..dd8919b73 100644 --- a/spacy/lang/lv/__init__.py +++ b/spacy/lang/lv/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py index 075ad6347..2685c2430 100644 --- a/spacy/lang/lv/stop_words.py +++ b/spacy/lang/lv/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-lv STOP_WORDS = set( diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py index fd95f9354..eb52a3935 100644 --- a/spacy/lang/mr/__init__.py +++ b/spacy/lang/mr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py index 0b0cd035d..9b0cee951 100644 --- a/spacy/lang/mr/stop_words.py +++ b/spacy/lang/mr/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json STOP_WORDS = set( """ diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 086761f82..3120951a2 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py index c15426ded..b1a63ad74 100644 --- a/spacy/lang/nb/examples.py +++ b/spacy/lang/nb/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import 
unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/nb/morph_rules.py b/spacy/lang/nb/morph_rules.py index e20814535..e96b9fd6b 100644 --- a/spacy/lang/nb/morph_rules.py +++ b/spacy/lang/nb/morph_rules.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA # This dict includes all the PRON and DET tag combinations found in the @@ -198,7 +195,7 @@ MORPH_RULES = { "seg": { LEMMA: PRON_LEMMA, "Person": "Three", - "Number": ("Sing", "Plur"), + "Number": "Sing,Plur", "Reflex": "Yes", } }, @@ -251,7 +248,7 @@ MORPH_RULES = { }, "deres": { LEMMA: "deres", - "Person": ("Two", "Three"), + "Person": "Two,Three", "Number": "Sing", "Poss": "Yes", "Gender": "Masc", @@ -312,7 +309,7 @@ MORPH_RULES = { }, "deres": { LEMMA: "deres", - "Person": ("Two", "Three"), + "Person": "Two,Three", "Number": "Sing", "Poss": "Yes", "Gender": "Fem", @@ -373,7 +370,7 @@ MORPH_RULES = { }, "deres": { LEMMA: "deres", - "Person": ("Two", "Three"), + "Person": "Two,Three", "Number": "Sing", "Poss": "Yes", "Gender": "Neut", @@ -403,7 +400,7 @@ MORPH_RULES = { "våre": {LEMMA: "vår", "Person": "One", "Number": "Plur", "Poss": "Yes"}, "deres": { LEMMA: "deres", - "Person": ("Two", "Three"), + "Person": "Two,Three", "Number": "Plur", "Poss": "Yes", }, @@ -451,21 +448,21 @@ MORPH_RULES = { "PronType": "Prs", "Number": "Sing", "Person": "Three", - "Gender": ("Fem", "Masc"), + "Gender": "Fem,Masc", }, "den": { LEMMA: PRON_LEMMA, "PronType": "Prs", "Number": "Sing", "Person": "Three", - "Gender": ("Fem", "Masc"), + "Gender": "Fem,Masc", }, "ingen": { LEMMA: PRON_LEMMA, "PronType": "Prs", "Number": "Sing", "Person": "Three", - "Gender": ("Fem", "Masc"), + "Gender": "Fem,Masc", "Polarity": "Neg", }, }, @@ -478,7 +475,7 @@ MORPH_RULES = { LEMMA: PRON_LEMMA, "PronType": "Prs", "Number": "Sing", - "Case": ("Gen", "Nom"), + "Case": "Gen,Nom", } }, "PRON__Animacy=Anim|Case=Gen|Number=Sing|PronType=Prs": { diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index b49aa9838..5d5800ae3 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index caa2012e7..fd65dd788 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ alle allerede alt and andre annen annet at av diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 4712d34d9..96636b0b7 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/nb/tag_map.py b/spacy/lang/nb/tag_map.py index ca0ece265..a67586ed9 100644 --- a/spacy/lang/nb/tag_map.py +++ b/spacy/lang/nb/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X from ...symbols import VERB, NOUN, PROPN, PART, INTJ, PRON, AUX diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index 
92ac09841..ef6dcf264 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 074fd9133..c12b08d77 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP diff --git a/spacy/lang/nl/examples.py b/spacy/lang/nl/examples.py index a459760f4..8c8c50c60 100644 --- a/spacy/lang/nl/examples.py +++ b/spacy/lang/nl/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index 9a92bee44..e7501ec52 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py index 69343b589..f1acaefeb 100644 --- a/spacy/lang/nl/lex_attrs.py +++ b/spacy/lang/nl/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/nl/punctuation.py b/spacy/lang/nl/punctuation.py index a48ecc044..3f3be61f8 100644 --- a/spacy/lang/nl/punctuation.py +++ b/spacy/lang/nl/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/nl/stop_words.py b/spacy/lang/nl/stop_words.py index 44551f2d4..a2c6198e7 100644 --- a/spacy/lang/nl/stop_words.py +++ b/spacy/lang/nl/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # The original stop words list (added in f46ffe3) was taken from # http://www.damienvanholten.com/downloads/dutch-stop-words.txt # and consisted of about 100 tokens. diff --git a/spacy/lang/nl/tag_map.py b/spacy/lang/nl/tag_map.py index 4fde5d39f..5bd7747c6 100644 --- a/spacy/lang/nl/tag_map.py +++ b/spacy/lang/nl/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, SPACE, PRON, CONJ diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py index dbdd104f3..12ab8aef5 100644 --- a/spacy/lang/nl/tokenizer_exceptions.py +++ b/spacy/lang/nl/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH # Extensive list of both common and uncommon dutch abbreviations copied from diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py index 341967a78..f35f613b1 100644 --- a/spacy/lang/norm_exceptions.py +++ b/spacy/lang/norm_exceptions.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # These exceptions are used to add NORM values based on a token's ORTH value. # Individual languages can also add their own exceptions and overwrite them - # for example, British vs. American spelling in English. 
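As the `norm_exceptions.py` comment above says, NORM exceptions are plain mappings from a token's ORTH to the NORM value it should receive, and individual languages can overwrite the base table. A small sketch of the shape such a table takes (entries are hypothetical):

```python
# Hypothetical entries: keys are a token's ORTH, values the NORM to assign.
# A language-specific table can overwrite entries from this base table.
NORM_EXCEPTIONS = {
    "colour": "color",    # British -> American spelling
    "realise": "realize",
}
```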
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 702a19063..a03ead1ff 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP diff --git a/spacy/lang/pl/_tokenizer_exceptions_list.py b/spacy/lang/pl/_tokenizer_exceptions_list.py index 839eccb83..965318442 100644 --- a/spacy/lang/pl/_tokenizer_exceptions_list.py +++ b/spacy/lang/pl/_tokenizer_exceptions_list.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - # The following list consists of: # - exceptions generated from polish_srx_rules [1] # (https://github.com/milekpl/polish_srx_rules) diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py index 14b6c7030..b1ea5880f 100644 --- a/spacy/lang/pl/examples.py +++ b/spacy/lang/pl/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/pl/lex_attrs.py b/spacy/lang/pl/lex_attrs.py index f1379aa50..ce56e28a8 100644 --- a/spacy/lang/pl/lex_attrs.py +++ b/spacy/lang/pl/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py index 4e69a3912..eea28de11 100644 --- a/spacy/lang/pl/punctuation.py +++ b/spacy/lang/pl/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/pl/stop_words.py b/spacy/lang/pl/stop_words.py index 11df67328..075aec391 100644 --- a/spacy/lang/pl/stop_words.py +++ b/spacy/lang/pl/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 - -from __future__ import unicode_literals - # sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl STOP_WORDS = set( diff --git a/spacy/lang/pl/tag_map.py b/spacy/lang/pl/tag_map.py index 5356c26cb..b83ee4d4c 100644 --- a/spacy/lang/pl/tag_map.py +++ b/spacy/lang/pl/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ( POS, ADJ, diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index 9e4814b0f..39f3017ed 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index f786d6542..0557e8b31 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py index b7206ffd7..13f3512cf 100644 --- a/spacy/lang/pt/examples.py +++ b/spacy/lang/pt/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
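The `tokenizer_exceptions.py` modules above all build an `_exc` dict that maps a surface string to a list of token dicts keyed on symbols such as `ORTH` and `LEMMA`. A minimal sketch under that convention, with a made-up Polish entry:

```python
from spacy.symbols import ORTH, LEMMA

_exc = {}

# One abbreviation kept as a single token, with an explicit lemma.
# Hypothetical entry: "itd." expands to "i tak dalej" (Polish for "etc.").
_exc["itd."] = [{ORTH: "itd.", LEMMA: "i tak dalej"}]

TOKENIZER_EXCEPTIONS = _exc
```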
diff --git a/spacy/lang/pt/lex_attrs.py b/spacy/lang/pt/lex_attrs.py index 4ad0eeecb..3c6979ab4 100644 --- a/spacy/lang/pt/lex_attrs.py +++ b/spacy/lang/pt/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/pt/norm_exceptions.py b/spacy/lang/pt/norm_exceptions.py index ea650cb31..e115b0385 100644 --- a/spacy/lang/pt/norm_exceptions.py +++ b/spacy/lang/pt/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # These exceptions are used to add NORM values based on a token's ORTH value. # Individual languages can also add their own exceptions and overwrite them - # for example, British vs. American spelling in English. diff --git a/spacy/lang/pt/punctuation.py b/spacy/lang/pt/punctuation.py index 370e6aaad..08e31f9d0 100644 --- a/spacy/lang/pt/punctuation.py +++ b/spacy/lang/pt/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py index 774b06809..ff45ad3a7 100644 --- a/spacy/lang/pt/stop_words.py +++ b/spacy/lang/pt/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes diff --git a/spacy/lang/pt/tag_map.py b/spacy/lang/pt/tag_map.py index cdc7de57e..dc65998a4 100644 --- a/spacy/lang/pt/tag_map.py +++ b/spacy/lang/pt/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, CCONJ from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py index 5169780e6..2089ea8fa 100644 --- a/spacy/lang/pt/tokenizer_exceptions.py +++ b/spacy/lang/pt/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, NORM diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index ccb72de28..bf7357e48 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index 6c325b74d..e32ae19cb 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS diff --git a/spacy/lang/ro/examples.py b/spacy/lang/ro/examples.py index a372d7cb2..bfa258ffc 100644 --- a/spacy/lang/ro/examples.py +++ b/spacy/lang/ro/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
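The `punctuation.py` modules in this stretch compose language-specific rules from the shared base rules and character classes rather than writing raw regexes from scratch. A hedged sketch of the composition pattern (the added rule is illustrative):

```python
from spacy.lang.char_classes import ALPHA
from spacy.lang.punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES

# Extend the shared infix rules with one extra pattern (illustrative only):
# split on a hyphen between two alphabetic characters, e.g. "guarda-chuva".
TOKENIZER_INFIXES = BASE_TOKENIZER_INFIXES + [
    r"(?<=[{a}])-(?=[{a}])".format(a=ALPHA)
]
```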
diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py index bb8391ad1..0f86f53cd 100644 --- a/spacy/lang/ro/lex_attrs.py +++ b/spacy/lang/ro/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py index b5ba73458..1d90be85d 100644 --- a/spacy/lang/ro/stop_words.py +++ b/spacy/lang/ro/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-ro STOP_WORDS = set( """ diff --git a/spacy/lang/ro/tag_map.py b/spacy/lang/ro/tag_map.py index cb5239809..d6820b4f2 100644 --- a/spacy/lang/ro/tag_map.py +++ b/spacy/lang/ro/tag_map.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from ...symbols import POS, ADJ, ADP, ADV, INTJ, NOUN, NUM, PART from ...symbols import PRON, PROPN, PUNCT, SYM, VERB, X, CCONJ, SCONJ, DET, AUX diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py index a7fb38453..8408ef987 100644 --- a/spacy/lang/ro/tokenizer_exceptions.py +++ b/spacy/lang/ro/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index f34fc5435..d25e8048b 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS diff --git a/spacy/lang/ru/examples.py b/spacy/lang/ru/examples.py index 2db621dac..adb007625 100644 --- a/spacy/lang/ru/examples.py +++ b/spacy/lang/ru/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
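The stop-word modules above share one idiom: a multiline string split into a set, which keeps the word list one-entry-per-line and easy to review in diffs like this one. Sketch of the idiom with illustrative entries:

```python
# Entries are illustrative. One token per line keeps diffs reviewable;
# .split() turns the block into individual words for the set.
STOP_WORDS = set(
    """
и
в
не
""".split()
)
```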
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 96d32f59c..ed0e858f5 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,9 +1,5 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS from ...lemmatizer import Lemmatizer -from ...compat import unicode_ class RussianLemmatizer(Lemmatizer): @@ -85,7 +81,7 @@ class RussianLemmatizer(Lemmatizer): @staticmethod def normalize_univ_pos(univ_pos): - if isinstance(univ_pos, unicode_): + if isinstance(univ_pos, str): return univ_pos.upper() symbols_to_str = { diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py index 448c5b285..7979c7ea6 100644 --- a/spacy/lang/ru/lex_attrs.py +++ b/spacy/lang/ru/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py index 43e08948c..0975bf5b8 100644 --- a/spacy/lang/ru/norm_exceptions.py +++ b/spacy/lang/ru/norm_exceptions.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - _exc = { # Slang "прив": "привет", diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py index 89069b3cf..16cb55ef9 100644 --- a/spacy/lang/ru/stop_words.py +++ b/spacy/lang/ru/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ а diff --git a/spacy/lang/ru/tag_map.py b/spacy/lang/ru/tag_map.py index baf065588..294919811 100644 --- a/spacy/lang/ru/tag_map.py +++ b/spacy/lang/ru/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py index ea7b5b20d..df3169baf 100644 --- a/spacy/lang/ru/tokenizer_exceptions.py +++ b/spacy/lang/ru/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py index a58a63f03..3b065860c 100644 --- a/spacy/lang/si/__init__.py +++ b/spacy/lang/si/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/si/examples.py b/spacy/lang/si/examples.py index 842dfdd7e..b34051d00 100644 --- a/spacy/lang/si/examples.py +++ b/spacy/lang/si/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
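The `RussianLemmatizer` hunk a few hunks up is the one place in this stretch where the Python 2 shim leaked into logic: `spacy.compat.unicode_` existed only so `isinstance` checks worked on both Python 2 `unicode` and Python 3 `str`. With Python 2 gone, the builtin suffices. A reduced sketch of the post-change check (the symbol fallback is paraphrased, not copied from the real method):

```python
from spacy.symbols import ADJ, NOUN, VERB

def normalize_univ_pos(univ_pos):
    # Python 3 only: the builtin str replaces spacy.compat.unicode_.
    if isinstance(univ_pos, str):
        return univ_pos.upper()
    # Paraphrased fallback for symbol IDs -- the real method maps the
    # full set of POS symbols back to their string names.
    symbols_to_str = {ADJ: "ADJ", NOUN: "NOUN", VERB: "VERB"}
    return symbols_to_str.get(univ_pos)
```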
diff --git a/spacy/lang/si/lex_attrs.py b/spacy/lang/si/lex_attrs.py index 5d5f06187..aa061852d 100644 --- a/spacy/lang/si/lex_attrs.py +++ b/spacy/lang/si/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py index 8bbdec6b7..bde662bf7 100644 --- a/spacy/lang/si/stop_words.py +++ b/spacy/lang/si/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ අතර diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py index cb17c0b6d..c7b171de4 100644 --- a/spacy/lang/sk/__init__.py +++ b/spacy/lang/sk/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/sk/examples.py b/spacy/lang/sk/examples.py index 486ea375e..736109a7c 100644 --- a/spacy/lang/sk/examples.py +++ b/spacy/lang/sk/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/sk/lex_attrs.py b/spacy/lang/sk/lex_attrs.py index 3dea4d8f0..0caf62e8e 100644 --- a/spacy/lang/sk/lex_attrs.py +++ b/spacy/lang/sk/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py index 3e78acb10..017e7beef 100644 --- a/spacy/lang/sk/stop_words.py +++ b/spacy/lang/sk/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/Ardevop-sk/stopwords-sk STOP_WORDS = set( diff --git a/spacy/lang/sk/tag_map.py b/spacy/lang/sk/tag_map.py index 015c8cba3..d159a6a51 100644 --- a/spacy/lang/sk/tag_map.py +++ b/spacy/lang/sk/tag_map.py @@ -1,1467 +1,1464 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, AUX, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON +from ...symbols import POS, AUX, ADJ, CCONJ, NUM, ADV, ADP, X, VERB +from ...symbols import NOUN, PART, INTJ, PRON # Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html # fmt: off TAG_MAP = { - "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5x": {POS: ADJ, 
"morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3y": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7y": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4y": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - 
"AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFis1x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AUfp1x": {POS: ADJ, "morph": 
"Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUmp1x": {POS: ADJ, 
"morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns7x": {POS: ADJ, "morph": 
"Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "Dx": {POS: ADV, "morph": "Degree=Pos"}, - "Dy": {POS: ADV, "morph": "Degree=Cmp"}, - "Dz": {POS: ADV, "morph": "Degree=Sup"}, - "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"}, - "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"}, - "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"}, - "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"}, - "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"}, - "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"}, - "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"}, - "Ev3": {POS: ADP, "morph": "AdpType=Voc|Case=Dat"}, - "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"}, - "Ev6": {POS: ADP, "morph": "AdpType=Voc|Case=Loc"}, - "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"}, - "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2z": {POS: VERB, "morph": 
"Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6y": {POS: VERB, "morph": 
"Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2y": {POS: VERB, "morph": 
"Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5y": {POS: VERB, "morph": 
"Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2y": {POS: 
VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - 
"Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3y": {POS: VERB, "morph": 
"Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6y": {POS: 
VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - 
"Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5z": {POS: VERB, "morph": 
"Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "J": {POS: INTJ, "morph": "_"}, - "NAfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfs1": {POS: NUM, 
"morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp5": {POS: NUM, "morph": 
"Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "ND": {POS: NUM, "morph": "MorphPos=Adv"}, - "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp4": 
{POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NNfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp2": {POS: NUM, "morph": 
"Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NSfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis7": {POS: NUM, "morph": 
"Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns5": {POS: NUM, "morph": 
"Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, - "O": {POS: CCONJ, "morph": "_"}, - "OY": {POS: CCONJ, "morph": "Mood=Cnd"}, - "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - 
"PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PD": {POS: PRON, "morph": "MorphPos=Adv|PronType=Prs"}, - "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs5": {POS: PRON, "morph": 
"Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms5": {POS: PRON, "morph": 
"Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PSfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp6": {POS: PRON, "morph": 
"Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip7": {POS: PRON, "morph": 
"Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns6": {POS: PRON, "morph": 
"Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "Q": {POS: X, "morph": "Hyph=Yes"}, - "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"}, - "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms2": {POS: NOUN, "morph": 
"Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs2": {POS: NOUN, "morph": 
"Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp5": {POS: NOUN, "morph": 
"Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - 
"SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, - "T": {POS: PART, "morph": "_"}, - "TY": {POS: PART, "morph": "Mood=Cnd"}, - "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesc+": {POS: VERB, "morph": 
"Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"}, - "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"}, - "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"}, - "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"}, - "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"}, - "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"}, - "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"}, - "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"}, - "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"}, - "VIe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Inf"}, - "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"}, - "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"}, - "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdpc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsb+": {POS: VERB, "morph": 
"Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpah+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, 
- "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsci+": 
{POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepbh-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesaf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesan+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbi+": {POS: VERB, "morph": 
"Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsai+": {POS: VERB, "morph": 
"Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMepa+": {POS: VERB, "morph": 
"Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "W": {POS: X, "morph": "Abbr=Yes"}, - "Y": {POS: AUX, "morph": "Mood=Cnd"}, + "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2y": {POS: ADJ, "morph": 
"Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + 
"AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4y": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + 
"AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7y": {POS: ADJ, 
"morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp6x": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AUfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs4x": {POS: ADJ, "morph": 
"Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms2x": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "Dx": {POS: ADV, "morph": "Degree=Pos"}, + "Dy": {POS: ADV, "morph": "Degree=Cmp"}, + "Dz": {POS: ADV, "morph": "Degree=Sup"}, + "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"}, + "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"}, + "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"}, + "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"}, + "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"}, + "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"}, + "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"}, + "Ev3": {POS: ADP, "morph": "AdpType=Voc|Case=Dat"}, + "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"}, + "Ev6": {POS: ADP, "morph": "AdpType=Voc|Case=Loc"}, + "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"}, + "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2z": {POS: VERB, 
"morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7x": {POS: VERB, "morph": 
"Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2x": {POS: VERB, "morph": 
"Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6y": {POS: 
VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + 
"Gtfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3x": {POS: 
VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + 
"Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2z": {POS: VERB, "morph": 
"Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "J": {POS: INTJ, "morph": "_"}, + "NAfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAis1": {POS: NUM, "morph": 
"Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "ND": {POS: NUM, "morph": "MorphPos=Adv"}, + "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp6": {POS: NUM, "morph": 
"Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp3": {POS: NUM, "morph": 
"Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NNfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NSfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, + 
"NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUis1": {POS: NUM, "morph": 
"Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, + "O": {POS: CCONJ, "morph": "_"}, + "OY": {POS: CCONJ, "morph": "Mood=Cnd"}, + "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs5": {POS: PRON, 
"morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp3": {POS: PRON, "morph": 
"Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PD": {POS: PRON, "morph": "MorphPos=Adv|PronType=Prs"}, + "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis2g": {POS: PRON, "morph": 
"AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns3": {POS: PRON, "morph": 
"Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PSfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns6": 
{POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + 
"PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "Q": {POS: X, "morph": "Hyph=Yes"}, + "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"}, + "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAip1": {POS: NOUN, 
"morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns5": 
{POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis4": {POS: NOUN, "morph": 
"Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs2": {POS: NOUN, "morph": 
"Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp6": {POS: NOUN, 
"morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, + "T": {POS: PART, "morph": "_"}, + "TY": {POS: PART, "morph": "Mood=Cnd"}, + "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsc+": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"}, + "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"}, + "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"}, + "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"}, + "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"}, + "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"}, + "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"}, + "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"}, + "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"}, + "VIe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Inf"}, + "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"}, + "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"}, + "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdpc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesb+": {POS: VERB, "morph": 
"Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpah+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsai-": {POS: VERB, "morph": 
"Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepbh-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepci-": {POS: VERB, "morph": 
"Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesaf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesan+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescn-": {POS: VERB, 
"morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbm+": {POS: VERB, "morph": 
"Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "W": {POS: X, "morph": "Abbr=Yes"}, + "Y": {POS: AUX, "morph": "Mood=Cnd"}, } diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py index 2d4977bdf..ce46e92dc 100644 --- a/spacy/lang/sl/__init__.py +++ 
b/spacy/lang/sl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index 187e95876..6fb01a183 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-sl # TODO: probably needs to be tidied up – the list seems to have month names in # it, which shouldn't be considered stop words. diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py index 6f33b37c2..034604838 100644 --- a/spacy/lang/sq/__init__.py +++ b/spacy/lang/sq/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/sq/examples.py b/spacy/lang/sq/examples.py index c51a0da39..06ed20fa1 100644 --- a/spacy/lang/sq/examples.py +++ b/spacy/lang/sq/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py index f91861ca1..f2b1a4f4a 100644 --- a/spacy/lang/sq/stop_words.py +++ b/spacy/lang/sq/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/andrixh/index-albanian STOP_WORDS = set( diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index f27b87102..151cc231c 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS diff --git a/spacy/lang/sr/examples.py b/spacy/lang/sr/examples.py index d636220c3..ec7f57ced 100644 --- a/spacy/lang/sr/examples.py +++ b/spacy/lang/sr/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
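Each examples.py module in this diff exposes a module-level `sentences` list for quick sanity checks of a language's tokenization. A minimal smoke test over such a module might look like the following sketch, which assumes only a blank, tokenizer-only pipeline (no trained model):

import spacy
from spacy.lang.sr.examples import sentences

nlp = spacy.blank("sr")  # blank Serbian pipeline: tokenizer only
for text in sentences:
    doc = nlp(text)
    assert len(doc) > 0  # each example sentence should yield tokens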
diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py index c90dc0da7..dc48909bc 100644 --- a/spacy/lang/sr/lex_attrs.py +++ b/spacy/lang/sr/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py index 69f2c3173..723ab84c0 100644 --- a/spacy/lang/sr/norm_exceptions.py +++ b/spacy/lang/sr/norm_exceptions.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - _exc = { # Slang "ћале": "отац", diff --git a/spacy/lang/sr/stop_words.py b/spacy/lang/sr/stop_words.py index 9712327f8..5df5509d2 100644 --- a/spacy/lang/sr/stop_words.py +++ b/spacy/lang/sr/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ а diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py index 8fca346a3..82df15186 100755 --- a/spacy/lang/sr/tokenizer_exceptions.py +++ b/spacy/lang/sr/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 671eefca0..d400eae4d 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py index 58e095195..bc6cd7a54 100644 --- a/spacy/lang/sv/examples.py +++ b/spacy/lang/sv/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
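The sr/norm_exceptions.py hunk above keeps a plain dict from variant spellings to canonical forms (Serbian slang "ћале" mapped to "отац"). As a rough sketch of how such a table is consumed, with a hypothetical `norm` helper that is not part of the spaCy API:

_exc = {"ћале": "отац"}  # slang -> canonical form, as in the hunk above

def norm(string):
    # Fall back to the input when no exception is listed
    return _exc.get(string, string)

assert norm("ћале") == "отац"
assert norm("кућа") == "кућа"  # unlisted words pass through unchanged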
diff --git a/spacy/lang/sv/morph_rules.py b/spacy/lang/sv/morph_rules.py index 77744813f..3ef6aedc5 100644 --- a/spacy/lang/sv/morph_rules.py +++ b/spacy/lang/sv/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA @@ -108,7 +105,7 @@ MORPH_RULES = { "PronType": "Prs", "Person": "Three", "Number": "Plur", - "Case": ("Nom", "Acc"), + "Case": "Nom,Acc", }, "dem": { LEMMA: PRON_LEMMA, @@ -169,7 +166,7 @@ MORPH_RULES = { LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", - "Number": ("Sing", "Plur"), + "Number": "Sing,Plur", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes", @@ -178,7 +175,7 @@ MORPH_RULES = { LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", - "Number": ("Sing", "Plur"), + "Number": "Sing,Plur", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes", @@ -187,7 +184,7 @@ MORPH_RULES = { LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", - "Number": ("Sing", "Plur"), + "Number": "Sing,Plur", "Poss": "Yes", "Reflex": "Yes", }, @@ -275,7 +272,7 @@ MORPH_RULES = { "VBZ": { "är": { "VerbForm": "Fin", - "Person": ("One", "Two", "Three"), + "Person": "One,Two,Three", "Tense": "Pres", "Mood": "Ind", } diff --git a/spacy/lang/sv/stop_words.py b/spacy/lang/sv/stop_words.py index 206abce5a..2422b2a9e 100644 --- a/spacy/lang/sv/stop_words.py +++ b/spacy/lang/sv/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 7a82e6b59..021d5d2f5 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/sv/tag_map.py b/spacy/lang/sv/tag_map.py index 7d4e29030..d4f5b6291 100644 --- a/spacy/lang/sv/tag_map.py +++ b/spacy/lang/sv/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index dd0976aa6..834a088ad 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG _exc = {} diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py index cb23339e6..d7a04afea 100644 --- a/spacy/lang/ta/__init__.py +++ b/spacy/lang/ta/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 3ce3c3544..a53227220 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
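The sv/morph_rules.py hunk above replaces tuple-valued features such as ("Nom", "Acc") with UD-style comma-joined strings ("Nom,Acc", "Sing,Plur", "One,Two,Three"). What that convention implies for code reading these rules, as an illustrative sketch:

# New string convention for multi-valued morphological features
feats = {"Case": "Nom,Acc", "Number": "Sing,Plur"}
for name, value in feats.items():
    options = value.split(",")  # "Nom,Acc" -> ["Nom", "Acc"]
    print(name, "->", options)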
diff --git a/spacy/lang/ta/lex_attrs.py b/spacy/lang/ta/lex_attrs.py index 40158ad7a..f830f4ac9 100644 --- a/spacy/lang/ta/lex_attrs.py +++ b/spacy/lang/ta/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ta/norm_exceptions.py b/spacy/lang/ta/norm_exceptions.py index fbdceb98c..8eaf0aa74 100644 --- a/spacy/lang/ta/norm_exceptions.py +++ b/spacy/lang/ta/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Regional words normal # Sri Lanka - wikipeadia diff --git a/spacy/lang/ta/stop_words.py b/spacy/lang/ta/stop_words.py index 91ebe8fd8..abbff949d 100644 --- a/spacy/lang/ta/stop_words.py +++ b/spacy/lang/ta/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - # Stop words STOP_WORDS = set( diff --git a/spacy/lang/tag_map.py b/spacy/lang/tag_map.py index 3a744f180..5bff905bd 100644 --- a/spacy/lang/tag_map.py +++ b/spacy/lang/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py index a4709177d..424164cc7 100644 --- a/spacy/lang/te/__init__.py +++ b/spacy/lang/te/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/te/examples.py b/spacy/lang/te/examples.py index 815ec8227..cff7d3cb0 100644 --- a/spacy/lang/te/examples.py +++ b/spacy/lang/te/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
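The lex_attrs.py modules in this diff (sr, ta, te, tt, uk, ur, yo, zh) all follow the same pattern: a `_num_words` list plus a `like_num` predicate registered under the LIKE_NUM attribute. A minimal sketch of that pattern, with a placeholder word list rather than any language's real data:

from spacy.attrs import LIKE_NUM

_num_words = ["one", "two", "three"]  # placeholder number words

def like_num(text):
    text = text.replace(",", "").replace(".", "")  # strip digit separators
    if text.isdigit():
        return True
    return text.lower() in _num_words

LEX_ATTRS = {LIKE_NUM: like_num}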
diff --git a/spacy/lang/te/lex_attrs.py b/spacy/lang/te/lex_attrs.py index 6da766dca..ae11827f6 100644 --- a/spacy/lang/te/lex_attrs.py +++ b/spacy/lang/te/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/te/stop_words.py b/spacy/lang/te/stop_words.py index 11e157177..b18dab697 100644 --- a/spacy/lang/te/stop_words.py +++ b/spacy/lang/te/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/Xangis/extra-stopwords (MIT License) STOP_WORDS = set( diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 06970fbd7..950a77818 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py index 047d046c2..bc4e5293e 100644 --- a/spacy/lang/th/lex_attrs.py +++ b/spacy/lang/th/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py index ed1b3e760..b8ddbab16 100644 --- a/spacy/lang/th/norm_exceptions.py +++ b/spacy/lang/th/norm_exceptions.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - _exc = { # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์) "สนุ๊กเกอร์": "สนุกเกอร์", diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py index 119a2f6a0..7fb12d538 100644 --- a/spacy/lang/th/tag_map.py +++ b/spacy/lang/th/tag_map.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, NOUN, PRON, ADJ, ADV, INTJ, PROPN, DET, NUM, AUX, VERB from ...symbols import ADP, CCONJ, PART, PUNCT, SPACE, SCONJ diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py index 4de0f1195..0529b3a99 100644 --- a/spacy/lang/th/tokenizer_exceptions.py +++ b/spacy/lang/th/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py index 30ad93139..f477029f7 100644 --- a/spacy/lang/tl/__init__.py +++ b/spacy/lang/tl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/tl/lex_attrs.py b/spacy/lang/tl/lex_attrs.py index 61dc9d4f3..60bdc923b 100644 --- a/spacy/lang/tl/lex_attrs.py +++ b/spacy/lang/tl/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/tl/stop_words.py b/spacy/lang/tl/stop_words.py index 510b3a418..2560cdaed 100644 --- a/spacy/lang/tl/stop_words.py +++ b/spacy/lang/tl/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ akin diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py index 77e1fb0c6..ea14746c4 100644 --- a/spacy/lang/tl/tokenizer_exceptions.py +++ b/spacy/lang/tl/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import 
unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 2c0fc9cf7..ee58a7b09 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import re from .char_classes import ALPHA_LOWER @@ -37,7 +34,7 @@ URL_PATTERN = ( r"|" # host & domain names # mods: match is case-sensitive, so include [A-Z] - "(?:" + "(?:" # noqa: E131 "(?:" "[A-Za-z0-9\u00a1-\uffff]" "[A-Za-z0-9\u00a1-\uffff_-]{0,62}" @@ -127,7 +124,6 @@ emoticons = set( (-: =) (= -") :] :-] [: diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py index 2553e7c0f..a29d78261 100644 --- a/spacy/lang/tr/__init__.py +++ b/spacy/lang/tr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS diff --git a/spacy/lang/tr/examples.py b/spacy/lang/tr/examples.py index a0464dfe3..dfb324a4e 100644 --- a/spacy/lang/tr/examples.py +++ b/spacy/lang/tr/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.tr.examples import sentences diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py index 93f26fc8e..3dbc1833a 100644 --- a/spacy/lang/tr/lex_attrs.py +++ b/spacy/lang/tr/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py index 65905499a..85dcff6a5 100644 --- a/spacy/lang/tr/stop_words.py +++ b/spacy/lang/tr/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-tr STOP_WORDS = set( """ diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py index f48e035d4..97f524a87 100644 --- a/spacy/lang/tr/tokenizer_exceptions.py +++ b/spacy/lang/tr/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, NORM _exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]} diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py index 3655e6264..80574a70d 100644 --- a/spacy/lang/tt/__init__.py +++ b/spacy/lang/tt/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS diff --git a/spacy/lang/tt/examples.py b/spacy/lang/tt/examples.py index ac668a0c2..723fcdd15 100644 --- a/spacy/lang/tt/examples.py +++ b/spacy/lang/tt/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
>>> from spacy.lang.tt.examples import sentences diff --git a/spacy/lang/tt/lex_attrs.py b/spacy/lang/tt/lex_attrs.py index ad3d6b9eb..a2ae03061 100644 --- a/spacy/lang/tt/lex_attrs.py +++ b/spacy/lang/tt/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/tt/punctuation.py b/spacy/lang/tt/punctuation.py index 9ee66a59e..f644a8ccb 100644 --- a/spacy/lang/tt/punctuation.py +++ b/spacy/lang/tt/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS from ..char_classes import LIST_ELLIPSES, LIST_ICONS diff --git a/spacy/lang/tt/stop_words.py b/spacy/lang/tt/stop_words.py index 9f6e9bb86..44169b757 100644 --- a/spacy/lang/tt/stop_words.py +++ b/spacy/lang/tt/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Tatar stopwords are from https://github.com/aliiae/stopwords-tt STOP_WORDS = set( diff --git a/spacy/lang/tt/tokenizer_exceptions.py b/spacy/lang/tt/tokenizer_exceptions.py index 89f7a990b..efe9e1fc0 100644 --- a/spacy/lang/tt/tokenizer_exceptions.py +++ b/spacy/lang/tt/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM _exc = {} diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index e74ff2d86..51165112a 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/uk/examples.py b/spacy/lang/uk/examples.py index 4f2b034eb..f75d44488 100644 --- a/spacy/lang/uk/examples.py +++ b/spacy/lang/uk/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
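The tokenizer_exceptions.py modules here share one format, shown concretely in the tr/ hunk above: a dict mapping a surface string to the list of token dicts it splits into. Reproduced as a standalone sketch (the entry is copied from that hunk):

from spacy.symbols import ORTH, NORM

# "sağol" is split into two tokens; "ol" carries the norm "olun"
_exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]}
TOKENIZER_EXCEPTIONS = _exc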
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 3eeed5dd4..ff61d711f 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -1,4 +1,3 @@ -# coding: utf8 from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS from ...lemmatizer import Lemmatizer diff --git a/spacy/lang/uk/lex_attrs.py b/spacy/lang/uk/lex_attrs.py index 0ade751d6..510e5b85d 100644 --- a/spacy/lang/uk/lex_attrs.py +++ b/spacy/lang/uk/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/uk/stop_words.py b/spacy/lang/uk/stop_words.py index cdf24dd70..b11d7a044 100644 --- a/spacy/lang/uk/stop_words.py +++ b/spacy/lang/uk/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """а або diff --git a/spacy/lang/uk/tag_map.py b/spacy/lang/uk/tag_map.py deleted file mode 100644 index 472e772ef..000000000 --- a/spacy/lang/uk/tag_map.py +++ /dev/null @@ -1,28 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ -from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ - - -TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "CCONJ": {POS: CCONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB}, - "PART": {POS: PART}, - "SP": {POS: SPACE}, -} diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py index a94d77af3..36f0b2e72 100644 --- a/spacy/lang/uk/tokenizer_exceptions.py +++ b/spacy/lang/uk/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, POS, NORM, NOUN diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py index 6eea0cf3b..c7f65adc3 100644 --- a/spacy/lang/ur/__init__.py +++ b/spacy/lang/ur/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/ur/examples.py b/spacy/lang/ur/examples.py index f47c11600..e55b337be 100644 --- a/spacy/lang/ur/examples.py +++ b/spacy/lang/ur/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
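The uk/tag_map.py file deleted above (like the identical vi/tag_map.py deleted below) did nothing more than map coarse UD tag strings to POS symbols, presumably redundant with the shared defaults. The shape of such a map, as a sketch:

from spacy.symbols import POS, NOUN, VERB, SPACE

TAG_MAP = {
    "NOUN": {POS: NOUN},
    "VERB": {POS: VERB},
    "SP": {POS: SPACE},  # whitespace pseudo-tag
}
assert TAG_MAP["NOUN"][POS] == NOUN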
diff --git a/spacy/lang/ur/lex_attrs.py b/spacy/lang/ur/lex_attrs.py index 12d85be4b..e590ed3e3 100644 --- a/spacy/lang/ur/lex_attrs.py +++ b/spacy/lang/ur/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM # Source https://quizlet.com/4271889/1-100-urdu-number-wordsurdu-numerals-flash-cards/ diff --git a/spacy/lang/ur/punctuation.py b/spacy/lang/ur/punctuation.py index b8b1a1c83..5d35d0a25 100644 --- a/spacy/lang/ur/punctuation.py +++ b/spacy/lang/ur/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/ur/stop_words.py b/spacy/lang/ur/stop_words.py index 73c159d5c..abfa36497 100644 --- a/spacy/lang/ur/stop_words.py +++ b/spacy/lang/ur/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Source: collected from different resource on internet STOP_WORDS = set( """ diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py index 2499d7e3e..d990fd46a 100644 --- a/spacy/lang/ur/tag_map.py +++ b/spacy/lang/ur/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON @@ -13,8 +10,8 @@ TAG_MAP = { '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, ":": {POS: PUNCT}, - "$": {POS: SYM, "Other": {"SymType": "currency"}}, - "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "$": {POS: SYM, "SymType": "currency"}, + "#": {POS: SYM, "SymType": "numbersign"}, "AFX": {POS: ADJ, "Hyph": "yes"}, "CC": {POS: CCONJ, "ConjType": "coor"}, "CD": {POS: NUM, "NumType": "card"}, @@ -55,7 +52,7 @@ TAG_MAP = { "VerbForm": "fin", "Tense": "pres", "Number": "sing", - "Person": 3, + "Person": "3", }, "WDT": {POS: ADJ, "PronType": "int|rel"}, "WP": {POS: NOUN, "PronType": "int|rel"}, diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 425f84e3d..7496763ee 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LANG, NORM from ..norm_exceptions import BASE_NORMS from ...language import Language diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py index b6cd1188a..b3dbf2192 100644 --- a/spacy/lang/vi/lex_attrs.py +++ b/spacy/lang/vi/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/vi/stop_words.py b/spacy/lang/vi/stop_words.py index 13284dc59..1d2ecdf8d 100644 --- a/spacy/lang/vi/stop_words.py +++ b/spacy/lang/vi/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords/vietnamese-stopwords STOP_WORDS = set( """ diff --git a/spacy/lang/vi/tag_map.py b/spacy/lang/vi/tag_map.py deleted file mode 100644 index 472e772ef..000000000 --- a/spacy/lang/vi/tag_map.py +++ /dev/null @@ -1,28 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ -from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ - - -TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": 
{POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "CCONJ": {POS: CCONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB}, - "PART": {POS: PART}, - "SP": {POS: SPACE}, -} diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index 66d8c7917..347c624fd 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py index 38cd5e0cd..8d63c3c20 100644 --- a/spacy/lang/xx/examples.py +++ b/spacy/lang/xx/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py index f227203cc..08e3166e1 100644 --- a/spacy/lang/yo/__init__.py +++ b/spacy/lang/yo/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS diff --git a/spacy/lang/yo/examples.py b/spacy/lang/yo/examples.py index 170ddc803..0a610f125 100644 --- a/spacy/lang/yo/examples.py +++ b/spacy/lang/yo/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/yo/lex_attrs.py b/spacy/lang/yo/lex_attrs.py index a9f1b85f6..ead68ced2 100644 --- a/spacy/lang/yo/lex_attrs.py +++ b/spacy/lang/yo/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import unicodedata from ...attrs import LIKE_NUM diff --git a/spacy/lang/yo/stop_words.py b/spacy/lang/yo/stop_words.py index 53d382ad3..5c7a7fc45 100644 --- a/spacy/lang/yo/stop_words.py +++ b/spacy/lang/yo/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # stop words as whitespace-separated list. # Source: https://raw.githubusercontent.com/dohliam/more-stoplists/master/yo/yo.txt diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 8179b4551..e427dc6d2 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LANG from ...language import Language from ...tokens import Doc diff --git a/spacy/lang/zh/examples.py b/spacy/lang/zh/examples.py index b28215741..8be1336d2 100644 --- a/spacy/lang/zh/examples.py +++ b/spacy/lang/zh/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
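The spacy/language.py diff further below replaces the old (docs, golds) pair in Language.update() with a single batch of `Example` or `Doc` objects, converted internally via Example.to_example_objects(). Under the new signature a training step looks roughly like this sketch, where `nlp` and `train_examples` are assumed to already exist:

losses = {}
# train_examples: a batch of Example (or plain Doc) objects
nlp.update(train_examples, drop=0.2, losses=losses)
print(losses)  # per-component losses accumulated during the update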
diff --git a/spacy/lang/zh/lex_attrs.py b/spacy/lang/zh/lex_attrs.py index 0b29c226e..08c8e3160 100644 --- a/spacy/lang/zh/lex_attrs.py +++ b/spacy/lang/zh/lex_attrs.py @@ -1,8 +1,8 @@ -# coding: utf8 -from __future__ import unicode_literals import re + from ...attrs import LIKE_NUM + _single_num_words = [ "〇", "一", diff --git a/spacy/lang/zh/stop_words.py b/spacy/lang/zh/stop_words.py index 0af4c1859..42ae4a1de 100644 --- a/spacy/lang/zh/stop_words.py +++ b/spacy/lang/zh/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # stop words as whitespace-separated list # Chinese stop words,maybe not enough STOP_WORDS = set( diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py index 41e2d2158..1ff0827be 100644 --- a/spacy/lang/zh/tag_map.py +++ b/spacy/lang/zh/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE diff --git a/spacy/language.py b/spacy/language.py index 5544b6341..1c6014cec 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,15 +1,10 @@ -# coding: utf8 -from __future__ import absolute_import, unicode_literals - import random import itertools -from spacy.util import minibatch import weakref import functools -from collections import OrderedDict from contextlib import contextmanager from copy import copy, deepcopy -from thinc.neural import Model +from thinc.api import get_current_ops import srsly import multiprocessing as mp from itertools import chain, cycle @@ -19,10 +14,9 @@ from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs -from .compat import izip, basestring_, is_python2, class_types -from .gold import GoldParse +from .gold import Example from .scorer import Scorer -from ._ml import link_vectors_to_models, create_default_optimizer +from .util import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES @@ -30,7 +24,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop -from .errors import Errors, Warnings, deprecation_warning, user_warning +from .errors import Errors, Warnings, deprecation_warning from . import util from . import about @@ -188,7 +182,7 @@ class Language(object): self._meta.setdefault("lang", self.lang) self._meta.setdefault("name", "model") self._meta.setdefault("version", "0.0.0") - self._meta.setdefault("spacy_version", ">={}".format(about.__version__)) + self._meta.setdefault("spacy_version", f">={about.__version__}") self._meta.setdefault("description", "") self._meta.setdefault("author", "") self._meta.setdefault("email", "") @@ -231,6 +225,10 @@ class Language(object): def linker(self): return self.get_pipe("entity_linker") + @property + def sentrec(self): + return self.get_pipe("sentrec") + @property def matcher(self): return self.get_pipe("matcher") @@ -261,7 +259,7 @@ class Language(object): RETURNS (dict): Labels keyed by component name. 
""" - labels = OrderedDict() + labels = {} for name, pipe in self.pipeline: if hasattr(pipe, "labels"): labels[name] = list(pipe.labels) @@ -318,7 +316,7 @@ class Language(object): """ if not hasattr(component, "__call__"): msg = Errors.E003.format(component=repr(component), name=name) - if isinstance(component, basestring_) and component in self.factories: + if isinstance(component, str) and component in self.factories: msg += Errors.E004.format(component=component) raise ValueError(msg) if name is None: @@ -370,7 +368,7 @@ class Language(object): raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) if not hasattr(component, "__call__"): msg = Errors.E003.format(component=repr(component), name=name) - if isinstance(component, basestring_) and component in self.factories: + if isinstance(component, str) and component in self.factories: msg += Errors.E135.format(name=name) raise ValueError(msg) self.pipeline[self.pipe_names.index(name)] = (name, component) @@ -409,7 +407,7 @@ class Language(object): def __call__(self, text, disable=[], component_cfg=None): """Apply the pipeline to some text. The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string + and can contain arbitrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. @@ -452,30 +450,10 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def _format_docs_and_golds(self, docs, golds): - """Format golds and docs before update models.""" - expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links") - gold_objs = [] - doc_objs = [] - for doc, gold in zip(docs, golds): - if isinstance(doc, basestring_): - doc = self.make_doc(doc) - if not isinstance(gold, GoldParse): - unexpected = [k for k in gold if k not in expected_keys] - if unexpected: - err = Errors.E151.format(unexp=unexpected, exp=expected_keys) - raise ValueError(err) - gold = GoldParse(doc, **gold) - doc_objs.append(doc) - gold_objs.append(gold) - - return doc_objs, gold_objs - - def update(self, docs, golds, drop=0.0, sgd=None, losses=None, component_cfg=None): + def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None): """Update the models in the pipeline. - docs (iterable): A batch of `Doc` objects. - golds (iterable): A batch of `GoldParse` objects. + examples (iterable): A batch of `Example` or `Doc` objects. drop (float): The dropout rate. sgd (callable): An optimizer. losses (dict): Dictionary to update with the loss, keyed by component. @@ -484,46 +462,41 @@ class Language(object): DOCS: https://spacy.io/api/language#update """ - if len(docs) != len(golds): - raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds))) - if len(docs) == 0: + if len(examples) == 0: return + examples = Example.to_example_objects(examples, make_doc=self.make_doc) + if sgd is None: if self._optimizer is None: - self._optimizer = create_default_optimizer(Model.ops) + self._optimizer = create_default_optimizer() sgd = self._optimizer - # Allow dict of args to GoldParse, instead of GoldParse objects. - docs, golds = self._format_docs_and_golds(docs, golds) - grads = {} - def get_grads(W, dW, key=None): - grads[key] = (W, dW) - - get_grads.alpha = sgd.alpha - get_grads.b1 = sgd.b1 - get_grads.b2 = sgd.b2 - pipes = list(self.pipeline) - random.shuffle(pipes) if component_cfg is None: component_cfg = {} - for name, proc in pipes: + # Determine whether component should set annotations. 
In principle this could be + # read from each component's meta; for now, default to False and let + # callers opt in explicitly. + for name, proc in self.pipeline: + component_cfg.setdefault(name, {}) + component_cfg[name].setdefault("drop", drop) + component_cfg[name].setdefault("set_annotations", False) + for name, proc in self.pipeline: if not hasattr(proc, "update"): continue - grads = {} - kwargs = component_cfg.get(name, {}) - kwargs.setdefault("drop", drop) - proc.update(docs, golds, sgd=get_grads, losses=losses, **kwargs) - for key, (W, dW) in grads.items(): - sgd(W, dW, key=key) + proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) + if sgd is not False: + for name, proc in self.pipeline: + if hasattr(proc, "model"): + proc.model.finish_update(sgd) - def rehearse(self, docs, sgd=None, losses=None, config=None): + def rehearse(self, examples, sgd=None, losses=None, config=None): """Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some data, and update the model so its current predictions are more like the initial ones. This is useful for keeping a pretrained model on-track, even if you're updating it with a smaller set of examples. - docs (iterable): A batch of `Doc` objects. + examples (iterable): A batch of `Doc` objects. drop (float): The dropout rate. sgd (callable): An optimizer. RETURNS (dict): Results from the update. @@ -531,22 +504,18 @@ class Language(object): EXAMPLE: >>> raw_text_batches = minibatch(raw_texts) >>> for labelled_batch in minibatch(zip(train_docs, train_golds)): - >>> docs, golds = zip(*train_docs) - >>> nlp.update(docs, golds) + >>> nlp.update(labelled_batch) >>> raw_batch = [nlp.make_doc(text) for text in next(raw_text_batches)] >>> nlp.rehearse(raw_batch) """ # TODO: document - if len(docs) == 0: + if len(examples) == 0: return + examples = Example.to_example_objects(examples, make_doc=self.make_doc) if sgd is None: if self._optimizer is None: - self._optimizer = create_default_optimizer(Model.ops) + self._optimizer = create_default_optimizer() sgd = self._optimizer - docs = list(docs) - for i, doc in enumerate(docs): - if isinstance(doc, basestring_): - docs[i] = self.make_doc(doc) pipes = list(self.pipeline) random.shuffle(pipes) if config is None: @@ -556,60 +525,64 @@ class Language(object): def get_grads(W, dW, key=None): grads[key] = (W, dW) - get_grads.alpha = sgd.alpha + get_grads.learn_rate = sgd.learn_rate get_grads.b1 = sgd.b1 get_grads.b2 = sgd.b2 for name, proc in pipes: if not hasattr(proc, "rehearse"): continue grads = {} - proc.rehearse(docs, sgd=get_grads, losses=losses, **config.get(name, {})) - for key, (W, dW) in grads.items(): - sgd(W, dW, key=key) + proc.rehearse( + examples, sgd=get_grads, losses=losses, **config.get(name, {}) ) + for key, (W, dW) in grads.items(): + sgd(W, dW, key=key) return losses - def preprocess_gold(self, docs_golds): + def preprocess_gold(self, examples): """Can be called before training to pre-process gold data. By default, it handles nonprojectivity and adds missing tags to the tag map. - docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. - YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. + examples (iterable): `Example` objects. + YIELDS (Example): Pre-processed `Example` objects.
""" for name, proc in self.pipeline: if hasattr(proc, "preprocess_gold"): - docs_golds = proc.preprocess_gold(docs_golds) - for doc, gold in docs_golds: - yield doc, gold + examples = proc.preprocess_gold(examples) + for ex in examples: + yield ex - def begin_training(self, get_gold_tuples=None, sgd=None, component_cfg=None, **cfg): + def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg): """Allocate models, pre-process training data and acquire a trainer and optimizer. Used as a contextmanager. - get_gold_tuples (function): Function returning gold data + get_examples (function): Function returning example training data (TODO: document format change since 3.0) component_cfg (dict): Config parameters for specific components. **cfg: Config parameters. RETURNS: An optimizer. DOCS: https://spacy.io/api/language#begin_training """ - if get_gold_tuples is None: - get_gold_tuples = lambda: [] + # TODO: throw warning when get_gold_tuples is provided instead of get_examples + if get_examples is None: + get_examples = lambda: [] # Populate vocab else: - for _, annots_brackets in get_gold_tuples(): - _ = annots_brackets.pop() - for annots, _ in annots_brackets: - for word in annots[1]: - _ = self.vocab[word] # noqa: F841 + for example in get_examples(): + for word in example.token_annotation.words: + _ = self.vocab[word] # noqa: F841 + if cfg.get("device", -1) >= 0: util.use_gpu(cfg["device"]) if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data) + ops = get_current_ops() + self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) link_vectors_to_models(self.vocab) if self.vocab.vectors.data.shape[1]: cfg["pretrained_vectors"] = self.vocab.vectors.name + cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1] if sgd is None: - sgd = create_default_optimizer(Model.ops) + sgd = create_default_optimizer() self._optimizer = sgd if component_cfg is None: component_cfg = {} @@ -618,11 +591,9 @@ class Language(object): kwargs = component_cfg.get(name, {}) kwargs.update(cfg) proc.begin_training( - get_gold_tuples, - pipeline=self.pipeline, - sgd=self._optimizer, - **kwargs + get_examples, pipeline=self.pipeline, sgd=self._optimizer, **kwargs ) + self._link_components() return self._optimizer def resume_training(self, sgd=None, **cfg): @@ -636,13 +607,14 @@ class Language(object): """ if cfg.get("device", -1) >= 0: util.use_gpu(cfg["device"]) + ops = get_current_ops() if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data) + self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) link_vectors_to_models(self.vocab) if self.vocab.vectors.data.shape[1]: - cfg["pretrained_vectors"] = self.vocab.vectors.name + cfg["pretrained_vectors"] = self.vocab.vectors if sgd is None: - sgd = create_default_optimizer(Model.ops) + sgd = create_default_optimizer() self._optimizer = sgd for name, proc in self.pipeline: if hasattr(proc, "_rehearsal_model"): @@ -650,11 +622,11 @@ class Language(object): return self._optimizer def evaluate( - self, docs_golds, verbose=False, batch_size=256, scorer=None, component_cfg=None + self, examples, verbose=False, batch_size=256, scorer=None, component_cfg=None ): """Evaluate a model's pipeline components. - docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. + examples (iterable): `Example` objects. verbose (bool): Print debugging information. batch_size (int): Batch size to use. scorer (Scorer): Optional `Scorer` to use. 
If not passed in, a new one @@ -665,30 +637,24 @@ class Language(object): DOCS: https://spacy.io/api/language#evaluate """ + examples = Example.to_example_objects(examples, make_doc=self.make_doc) if scorer is None: scorer = Scorer(pipeline=self.pipeline) if component_cfg is None: component_cfg = {} - docs, golds = zip(*docs_golds) - docs = [ - self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs - ] - golds = list(golds) for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) kwargs.setdefault("batch_size", batch_size) if not hasattr(pipe, "pipe"): - docs = _pipe(docs, pipe, kwargs) + examples = _pipe(examples, pipe, kwargs) else: - docs = pipe.pipe(docs, **kwargs) - for doc, gold in zip(docs, golds): - if not isinstance(gold, GoldParse): - gold = GoldParse(doc, **gold) + examples = pipe.pipe(examples, as_example=True, **kwargs) + for ex in examples: if verbose: - print(doc) + print(ex.doc) kwargs = component_cfg.get("scorer", {}) kwargs.setdefault("verbose", verbose) - scorer.score(doc, gold, **kwargs) + scorer.score(ex, **kwargs) return scorer @contextmanager @@ -733,6 +699,7 @@ class Language(object): cleanup=False, component_cfg=None, n_process=1, + as_example=False, ): """Process texts as a stream, and yield `Doc` objects in order. @@ -754,9 +721,6 @@ class Language(object): """ # raw_texts will be used later to stop iterator. texts, raw_texts = itertools.tee(texts) - if is_python2 and n_process != 1: - user_warning(Warnings.W023) - n_process = 1 if n_threads != -1: deprecation_warning(Warnings.W016) if n_process == -1: @@ -771,8 +735,9 @@ class Language(object): disable=disable, n_process=n_process, component_cfg=component_cfg, + as_example=as_example, ) - for doc, context in izip(docs, contexts): + for doc, context in zip(docs, contexts): yield (doc, context) return if component_cfg is None: @@ -842,7 +807,7 @@ class Language(object): *[mp.Pipe(False) for _ in range(n_process)] ) - batch_texts = minibatch(texts, batch_size) + batch_texts = util.minibatch(texts, batch_size) # Sender sends texts to the workers. # This is necessary to properly handle infinite length of texts. # (In this case, all data cannot be sent to the workers at once) @@ -872,6 +837,16 @@ class Language(object): for proc in procs: proc.terminate() + def _link_components(self): + """Register 'listeners' within pipeline components, to allow them to + effectively share weights. + """ + for i, (name1, proc1) in enumerate(self.pipeline): + if hasattr(proc1, "find_listeners"): + for name2, proc2 in self.pipeline[i:]: + if hasattr(proc2, "model"): + proc1.find_listeners(proc2.model) + def to_disk(self, path, exclude=tuple(), disable=None): """Save the current state to a directory. If a model is loaded, this will include the model. 
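`evaluate` follows the same conversion path: it accepts `Example` objects (or anything `Example.to_example_objects` handles), threads them through each component, and scores the results. A usage sketch, with `nlp` and `dev_examples` assumed to exist:

scorer = nlp.evaluate(dev_examples, batch_size=32)
print(scorer.scores)  # aggregated metrics, keyed by score name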
@@ -886,7 +861,7 @@ class Language(object): deprecation_warning(Warnings.W014) exclude = disable path = util.ensure_path(path) - serializers = OrderedDict() + serializers = {} serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( p, exclude=["vocab"] ) @@ -919,7 +894,7 @@ class Language(object): deprecation_warning(Warnings.W014) exclude = disable path = util.ensure_path(path) - deserializers = OrderedDict() + deserializers = {} deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p)) deserializers["vocab"] = lambda p: self.vocab.from_disk( p @@ -940,6 +915,7 @@ class Language(object): exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) self._path = path + self._link_components() return self def to_bytes(self, exclude=tuple(), disable=None, **kwargs): @@ -953,7 +929,7 @@ class Language(object): if disable is not None: deprecation_warning(Warnings.W014) exclude = disable - serializers = OrderedDict() + serializers = {} serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) @@ -978,7 +954,7 @@ class Language(object): if disable is not None: deprecation_warning(Warnings.W014) exclude = disable - deserializers = OrderedDict() + deserializers = {} deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) deserializers["vocab"] = lambda b: self.vocab.from_bytes( b @@ -996,6 +972,7 @@ class Language(object): ) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) util.from_bytes(bytes_data, deserializers, exclude) + self._link_components() return self @@ -1037,7 +1014,7 @@ class component(object): def factory(nlp, **cfg): if hasattr(obj, "from_nlp"): return obj.from_nlp(nlp, **cfg) - elif isinstance(obj, class_types): + elif isinstance(obj, type): return obj() return obj @@ -1053,7 +1030,7 @@ def _fix_pretrained_vectors_name(nlp): elif not nlp.vocab.vectors.size: nlp.vocab.vectors.name = None elif "name" in nlp.meta and "lang" in nlp.meta: - vectors_name = "%s_%s.vectors" % (nlp.meta["lang"], nlp.meta["name"]) + vectors_name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" nlp.vocab.vectors.name = vectors_name else: raise ValueError(Errors.E092) @@ -1096,15 +1073,15 @@ class DisabledPipes(list): self[:] = [] -def _pipe(docs, proc, kwargs): +def _pipe(examples, proc, kwargs): # We added some args for pipe that __call__ doesn't expect. kwargs = dict(kwargs) for arg in ["n_threads", "batch_size"]: if arg in kwargs: kwargs.pop(arg) - for doc in docs: - doc = proc(doc, **kwargs) - yield doc + for ex in examples: + ex = proc(ex, **kwargs) + yield ex def _apply_pipes(make_doc, pipes, reciever, sender): diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d70e4cfc4..3ba86c169 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,8 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - -from collections import OrderedDict - from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .errors import Errors from .lookups import Lookups @@ -160,7 +155,7 @@ class Lemmatizer(object): else: oov_forms.append(form) # Remove duplicates but preserve the ordering of applied "rules" - forms = list(OrderedDict.fromkeys(forms)) + forms = list(dict.fromkeys(forms)) # Put exceptions at the front of the list, so they get priority. # This is a dodgy heuristic -- but it's the best we can do until we get # frequencies on this. 
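The `dict.fromkeys` swap in the lemmatizer works because plain dicts preserve insertion order on Python 3.6+ (guaranteed from 3.7), so the dedupe-while-keeping-order idiom is unchanged:

forms = ["ran", "run", "ran", "runs"]
# Duplicates removed, first-seen order kept, same as OrderedDict.fromkeys:
assert list(dict.fromkeys(forms)) == ["ran", "run", "runs"]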
We can at least prune out problematic exceptions, diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 5c981bc25..5910ebfe1 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,7 +1,4 @@ # cython: embedsignature=True -# coding: utf8 -from __future__ import unicode_literals, print_function - # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray from libc.string cimport memset @@ -9,7 +6,7 @@ cimport numpy as np np.import_array() import numpy -from thinc.neural.util import get_array_module +from thinc.api import get_array_module from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE diff --git a/spacy/lookups.py b/spacy/lookups.py index bf250b4b4..a9d371b79 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -1,9 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - import srsly -from collections import OrderedDict from preshed.bloom import BloomFilter +from collections import OrderedDict from .errors import Errors from .util import SimpleFrozenDict, ensure_path @@ -28,7 +25,7 @@ class Lookups(object): DOCS: https://spacy.io/api/lookups#init """ - self._tables = OrderedDict() + self._tables = {} def __contains__(self, name): """Check if the lookups contain a table of a given name. Delegates to @@ -118,7 +115,7 @@ class Lookups(object): DOCS: https://spacy.io/api/lookups#from_bytes """ - self._tables = OrderedDict() + self._tables = {} for key, value in srsly.msgpack_loads(bytes_data).items(): self._tables[key] = Table(key) self._tables[key].update(value) @@ -254,12 +251,12 @@ class Table(OrderedDict): DOCS: https://spacy.io/api/lookups#table.to_bytes """ - data = [ - ("name", self.name), - ("dict", dict(self.items())), - ("bloom", self.bloom.to_bytes()), - ] - return srsly.msgpack_dumps(OrderedDict(data)) + data = { + "name": self.name, + "dict": dict(self.items()), + "bloom": self.bloom.to_bytes(), + } + return srsly.msgpack_dumps(data) def from_bytes(self, bytes_data): """Load a table from a bytestring. 
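For the same reason, `Table.to_bytes` can serialise a plain dict. A round-trip sketch, with `table` standing in for an existing `Table` instance:

import srsly

payload = srsly.msgpack_loads(table.to_bytes())
assert set(payload) == {"name", "dict", "bloom"}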
diff --git a/spacy/matcher/__init__.py b/spacy/matcher/__init__.py index 91874ed43..286844787 100644 --- a/spacy/matcher/__init__.py +++ b/spacy/matcher/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .matcher import Matcher from .phrasematcher import PhraseMatcher from .dependencymatcher import DependencyMatcher diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py deleted file mode 100644 index 1b10f0dd5..000000000 --- a/spacy/matcher/_schemas.py +++ /dev/null @@ -1,200 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - - -TOKEN_PATTERN_SCHEMA = { - "$schema": "http://json-schema.org/draft-06/schema", - "definitions": { - "string_value": { - "anyOf": [ - {"type": "string"}, - { - "type": "object", - "properties": { - "REGEX": {"type": "string"}, - "IN": {"type": "array", "items": {"type": "string"}}, - "NOT_IN": {"type": "array", "items": {"type": "string"}}, - }, - "additionalProperties": False, - }, - ] - }, - "integer_value": { - "anyOf": [ - {"type": "integer"}, - { - "type": "object", - "properties": { - "REGEX": {"type": "string"}, - "IN": {"type": "array", "items": {"type": "integer"}}, - "NOT_IN": {"type": "array", "items": {"type": "integer"}}, - "==": {"type": "integer"}, - ">=": {"type": "integer"}, - "<=": {"type": "integer"}, - ">": {"type": "integer"}, - "<": {"type": "integer"}, - }, - "additionalProperties": False, - }, - ] - }, - "boolean_value": {"type": "boolean"}, - "underscore_value": { - "anyOf": [ - {"type": ["string", "integer", "number", "array", "boolean", "null"]}, - { - "type": "object", - "properties": { - "REGEX": {"type": "string"}, - "IN": { - "type": "array", - "items": {"type": ["string", "integer"]}, - }, - "NOT_IN": { - "type": "array", - "items": {"type": ["string", "integer"]}, - }, - "==": {"type": "integer"}, - ">=": {"type": "integer"}, - "<=": {"type": "integer"}, - ">": {"type": "integer"}, - "<": {"type": "integer"}, - }, - "additionalProperties": False, - }, - ] - }, - }, - "type": "array", - "items": { - "type": "object", - "properties": { - "ORTH": { - "title": "Verbatim token text", - "$ref": "#/definitions/string_value", - }, - "TEXT": { - "title": "Verbatim token text (spaCy v2.1+)", - "$ref": "#/definitions/string_value", - }, - "LOWER": { - "title": "Lowercase form of token text", - "$ref": "#/definitions/string_value", - }, - "POS": { - "title": "Coarse-grained part-of-speech tag", - "$ref": "#/definitions/string_value", - }, - "TAG": { - "title": "Fine-grained part-of-speech tag", - "$ref": "#/definitions/string_value", - }, - "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"}, - "LEMMA": { - "title": "Lemma (base form)", - "$ref": "#/definitions/string_value", - }, - "SHAPE": { - "title": "Abstract token shape", - "$ref": "#/definitions/string_value", - }, - "ENT_TYPE": { - "title": "Entity label of single token", - "$ref": "#/definitions/string_value", - }, - "NORM": { - "title": "Normalized form of the token text", - "$ref": "#/definitions/string_value", - }, - "LENGTH": { - "title": "Token character length", - "$ref": "#/definitions/integer_value", - }, - "IS_ALPHA": { - "title": "Token consists of alphabetic characters", - "$ref": "#/definitions/boolean_value", - }, - "IS_ASCII": { - "title": "Token consists of ASCII characters", - "$ref": "#/definitions/boolean_value", - }, - "IS_DIGIT": { - "title": "Token consists of digits", - "$ref": "#/definitions/boolean_value", - }, - "IS_LOWER": { - "title": "Token is lowercase", - "$ref": 
"#/definitions/boolean_value", - }, - "IS_UPPER": { - "title": "Token is uppercase", - "$ref": "#/definitions/boolean_value", - }, - "IS_TITLE": { - "title": "Token is titlecase", - "$ref": "#/definitions/boolean_value", - }, - "IS_PUNCT": { - "title": "Token is punctuation", - "$ref": "#/definitions/boolean_value", - }, - "IS_SPACE": { - "title": "Token is whitespace", - "$ref": "#/definitions/boolean_value", - }, - "IS_BRACKET": { - "title": "Token is a bracket", - "$ref": "#/definitions/boolean_value", - }, - "IS_QUOTE": { - "title": "Token is a quotation mark", - "$ref": "#/definitions/boolean_value", - }, - "IS_LEFT_PUNCT": { - "title": "Token is a left punctuation mark", - "$ref": "#/definitions/boolean_value", - }, - "IS_RIGHT_PUNCT": { - "title": "Token is a right punctuation mark", - "$ref": "#/definitions/boolean_value", - }, - "IS_CURRENCY": { - "title": "Token is a currency symbol", - "$ref": "#/definitions/boolean_value", - }, - "IS_STOP": { - "title": "Token is stop word", - "$ref": "#/definitions/boolean_value", - }, - "IS_SENT_START": { - "title": "Token is the first in a sentence", - "$ref": "#/definitions/boolean_value", - }, - "LIKE_NUM": { - "title": "Token resembles a number", - "$ref": "#/definitions/boolean_value", - }, - "LIKE_URL": { - "title": "Token resembles a URL", - "$ref": "#/definitions/boolean_value", - }, - "LIKE_EMAIL": { - "title": "Token resembles an email address", - "$ref": "#/definitions/boolean_value", - }, - "_": { - "title": "Custom extension token attributes (token._.)", - "type": "object", - "patternProperties": { - "^.*$": {"$ref": "#/definitions/underscore_value"} - }, - }, - "OP": { - "title": "Operators / quantifiers", - "type": "string", - "enum": ["+", "*", "?", "!"], - }, - }, - "additionalProperties": False, - }, -} diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 56d27024d..f94c66cb0 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -1,7 +1,5 @@ # cython: infer_types=True # cython: profile=True -from __future__ import unicode_literals - from cymem.cymem cimport Pool from preshed.maps cimport PreshMap @@ -41,7 +39,8 @@ cdef class DependencyMatcher: RETURNS (DependencyMatcher): The newly constructed object. """ size = 20 - self.token_matcher = Matcher(vocab) + # TODO: make matcher work with validation + self.token_matcher = Matcher(vocab, validate=False) self._keys_to_token = {} self._patterns = {} self._root = {} @@ -131,7 +130,7 @@ cdef class DependencyMatcher: # TODO: Better ways to hash edges in pattern? 
for j in range(len(_patterns[i])): k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j)) - self.token_matcher.add(k, None, _patterns[i][j]) + self.token_matcher.add(k, [_patterns[i][j]]) _keys_to_token[k] = j _keys_to_token_list.append(_keys_to_token) self._keys_to_token.setdefault(key, []) diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index dd04153bf..689734079 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -63,7 +63,7 @@ cdef class Matcher: cdef Pool mem cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab - cdef public object validator + cdef public object validate cdef public object _patterns cdef public object _callbacks cdef public object _extensions diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 30ef3dd36..4258fdb6a 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1,7 +1,5 @@ # cython: infer_types=True # cython: profile=True -from __future__ import unicode_literals - from libcpp.vector cimport vector from libc.stdint cimport int32_t from cymem.cymem cimport Pool @@ -17,8 +15,7 @@ from ..tokens.doc cimport Doc, get_token_attr from ..tokens.token cimport Token from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA -from ._schemas import TOKEN_PATTERN_SCHEMA -from ..util import get_json_validator, validate_json +from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings, deprecation_warning from ..strings import get_string_id from ..attrs import IDS @@ -34,7 +31,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=False): + def __init__(self, vocab, validate=True): """Create the Matcher. 
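Two behavioural changes are worth spelling out: `Matcher` now validates patterns by default, via the pydantic-based `validate_token_pattern` rather than a JSON schema, and `add` takes a list of patterns instead of a callback plus varargs, as the `token_matcher.add(k, [...])` call above shows. A sketch, assuming a shared `vocab`:

from spacy.matcher import Matcher

matcher = Matcher(vocab)                    # validate=True is now the default
matcher.add("NUM", [[{"LIKE_NUM": True}]])  # new signature: a list of patterns
# A malformed pattern, e.g. [[{"LIKE_NUMM": True}]], now raises
# MatchPatternError at add() time instead of failing silently at match time.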
vocab (Vocab): The vocabulary object, which must be shared with the @@ -48,10 +45,7 @@ cdef class Matcher: self._seen_attrs = set() self.vocab = vocab self.mem = Pool() - if validate: - self.validator = get_json_validator(TOKEN_PATTERN_SCHEMA) - else: - self.validator = None + self.validate = validate def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -121,8 +115,8 @@ cdef class Matcher: raise ValueError(Errors.E012.format(key=key)) if not isinstance(pattern, list): raise ValueError(Errors.E178.format(pat=pattern, key=key)) - if self.validator: - errors[i] = validate_json(pattern, self.validator) + if self.validate: + errors[i] = validate_token_pattern(pattern) if any(err for err in errors.values()): raise MatchPatternError(key, errors) key = self._normalize_key(key) @@ -670,8 +664,6 @@ def _get_attr_values(spec, string_store): continue if attr == "TEXT": attr = "ORTH" - if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: - raise ValueError(Errors.E152.format(attr=attr)) attr = IDS.get(attr) if isinstance(value, basestring): value = string_store.add(value) @@ -686,7 +678,7 @@ def _get_attr_values(spec, string_store): if attr is not None: attr_values.append((attr, value)) else: - # should be caught above using TOKEN_PATTERN_SCHEMA + # should be caught in validation raise ValueError(Errors.E152.format(attr=attr)) return attr_values diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 4de5782f9..961a318f6 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -1,7 +1,5 @@ # cython: infer_types=True # cython: profile=True -from __future__ import unicode_literals - from libc.stdint cimport uintptr_t from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter @@ -11,7 +9,7 @@ from ..structs cimport TokenC from ..tokens.token cimport Token from ..typedefs cimport attr_t -from ._schemas import TOKEN_PATTERN_SCHEMA +from ..schemas import TokenPattern from ..errors import Errors, Warnings, deprecation_warning, user_warning @@ -56,7 +54,7 @@ cdef class PhraseMatcher: attr = attr.upper() if attr == "TEXT": attr = "ORTH" - if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: + if attr.lower() not in TokenPattern().dict(): raise ValueError(Errors.E152.format(attr=attr)) self.attr = self.vocab.strings[attr] diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index 57e7ef571..e69de29bb 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1,5 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from .tok2vec import Tok2Vec # noqa: F401 -from .common import FeedForward, LayerNormalizedMaxout # noqa: F401 diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py new file mode 100644 index 000000000..b366f67c6 --- /dev/null +++ b/spacy/ml/_character_embed.py @@ -0,0 +1,54 @@ +from thinc.api import Model + + +def CharacterEmbed(nM, nC): + # nM: Number of dimensions per character. nC: Number of characters. 
+ nO = nM * nC if (nM is not None and nC is not None) else None + return Model( + "charembed", + forward, + init=init, + dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256}, + params={"E": None}, + ).initialize() + + +def init(model, X=None, Y=None): + vectors_table = model.ops.alloc3f( + model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM") + ) + model.set_param("E", vectors_table) + + +def forward(model, docs, is_train): + if not docs: + return [] + ids = [] + output = [] + E = model.get_param("E") + nC = model.get_dim("nC") + nM = model.get_dim("nM") + nO = model.get_dim("nO") + # This assists in indexing; it's like looping over this dimension. + # Still consider this weird witch craft...But thanks to Mark Neumann + # for the tip. + nCv = model.ops.xp.arange(nC) + for doc in docs: + doc_ids = doc.to_utf8_array(nr_char=nC) + doc_vectors = model.ops.alloc3f(len(doc), nC, nM) + # Let's say I have a 2d array of indices, and a 3d table of data. What numpy + # incantation do I chant to get + # output[i, j, k] == data[j, ids[i, j], k]? + doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]] + output.append(doc_vectors.reshape((len(doc), nO))) + ids.append(doc_ids) + + def backprop(d_output): + dE = model.ops.alloc(E.shape, dtype=E.dtype) + for doc_ids, d_doc_vectors in zip(ids, d_output): + d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM)) + dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv] + model.inc_grad("E", dE) + return [] + + return output, backprop diff --git a/spacy/ml/_layers.py b/spacy/ml/_layers.py new file mode 100644 index 000000000..7e9150d8b --- /dev/null +++ b/spacy/ml/_layers.py @@ -0,0 +1,162 @@ +from thinc.api import Model, normal_init + + +def PrecomputableAffine(nO, nI, nF, nP): + model = Model( + "precomputable_affine", + forward, + init=init, + dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP}, + params={"W": None, "b": None, "pad": None}, + ) + model.initialize() + return model + + +def forward(model, X, is_train): + nF = model.get_dim("nF") + nO = model.get_dim("nO") + nP = model.get_dim("nP") + nI = model.get_dim("nI") + W = model.get_param("W") + Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True) + Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) + Yf = model.ops.xp.vstack((model.get_param("pad"), Yf)) + + def backward(dY_ids): + # This backprop is particularly tricky, because we get back a different + # thing from what we put out. We put out an array of shape: + # (nB, nF, nO, nP), and get back: + # (nB, nO, nP) and ids (nB, nF) + # The ids tell us the values of nF, so we would have: + # + # dYf = zeros((nB, nF, nO, nP)) + # for b in range(nB): + # for f in range(nF): + # dYf[b, ids[b, f]] += dY[b] + # + # However, we avoid building that array for efficiency -- and just pass + # in the indices. 
+ dY, ids = dY_ids + assert dY.ndim == 3 + assert dY.shape[1] == nO, dY.shape + assert dY.shape[2] == nP, dY.shape + # nB = dY.shape[0] + model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids)) + Xf = X[ids] + Xf = Xf.reshape((Xf.shape[0], nF * nI)) + + model.inc_grad("b", dY.sum(axis=0)) + dY = dY.reshape((dY.shape[0], nO * nP)) + + Wopfi = W.transpose((1, 2, 0, 3)) + Wopfi = model.ops.xp.ascontiguousarray(Wopfi) + Wopfi = Wopfi.reshape((nO * nP, nF * nI)) + dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) + + # Reuse the buffer + dWopfi = Wopfi + dWopfi.fill(0.0) + model.ops.gemm(dY, Xf, out=dWopfi, trans1=True) + dWopfi = dWopfi.reshape((nO, nP, nF, nI)) + # (o, p, f, i) --> (f, o, p, i) + model.inc_grad("W", dWopfi.transpose((2, 0, 1, 3))) + return dXf.reshape((dXf.shape[0], nF, nI)) + + return Yf, backward + + +def _backprop_precomputable_affine_padding(model, dY, ids): + nB = dY.shape[0] + nF = model.get_dim("nF") + nP = model.get_dim("nP") + nO = model.get_dim("nO") + # Backprop the "padding", used as a filler for missing values. + # Values that are missing are set to -1, and each state vector could + # have multiple missing values. The padding has different values for + # different missing features. The gradient of the padding vector is: + # + # for b in range(nB): + # for f in range(nF): + # if ids[b, f] < 0: + # d_padding[0, f] += dY[b] + # + # Which can be rewritten as: + # + # for b in range(nB): + # d_pad[0, ids[b] < 0] += dY[b] + # + # I don't know how to avoid the loop without building a whole array :(. + # Cursed numpy. + d_pad = model.ops.alloc((1, nF, nO, nP)) + for b in range(nB): + d_pad[0, ids[b] < 0] += dY[b] + return d_pad + + +def init(model, X=None, Y=None): + """This is like the 'layer sequential unit variance', but instead + of taking the actual inputs, we randomly generate whitened data. + + Why's this all so complicated? We have a huge number of inputs, + and the maxout unit makes guessing the dynamics tricky. Instead + we set the maxout weights to values that empirically result in + whitened outputs given whitened inputs. + """ + if model.has_param("W") and model.get_param("W").any(): + return + + nF = model.get_dim("nF") + nO = model.get_dim("nO") + nP = model.get_dim("nP") + nI = model.get_dim("nI") + W = model.ops.alloc4f(nF, nO, nP, nI) + b = model.ops.alloc2f(nO, nP) + pad = model.ops.alloc4f(1, nF, nO, nP) + + ops = model.ops + W = normal_init(ops, W.shape, fan_in=nF * nI) + model.set_param("W", W) + model.set_param("b", b) + model.set_param("pad", pad) + + ids = ops.alloc((5000, nF), dtype="f") + ids += ops.xp.random.uniform(0, 1000, ids.shape) + ids = ops.asarray(ids, dtype="i") + tokvecs = ops.alloc((5000, nI), dtype="f") + tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape( + tokvecs.shape + ) + + def predict(ids, tokvecs): + # nS ids. nW tokvecs. Exclude the padding array. 
+ hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p) + vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f") + # need nS vectors + hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP)) + model.ops.scatter_add(vectors, ids.flatten(), hiddens) + vectors = vectors.reshape((vectors.shape[0], nO, nP)) + vectors += b + vectors = model.ops.asarray(vectors) + if nP >= 2: + return model.ops.maxout(vectors)[0] + else: + return vectors * (vectors >= 0) + + tol_var = 0.01 + tol_mean = 0.01 + t_max = 10 + W = model.get_param("W").copy() + b = model.get_param("b").copy() + for t_i in range(t_max): + acts1 = predict(ids, tokvecs) + var = model.ops.xp.var(acts1) + mean = model.ops.xp.mean(acts1) + if abs(var - 1.0) >= tol_var: + W /= model.ops.xp.sqrt(var) + model.set_param("W", W) + elif abs(mean) >= tol_mean: + b -= mean + model.set_param("b", b) + else: + break diff --git a/spacy/ml/_legacy_tok2vec.py b/spacy/ml/_legacy_tok2vec.py deleted file mode 100644 index b077a46b7..000000000 --- a/spacy/ml/_legacy_tok2vec.py +++ /dev/null @@ -1,131 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals -from thinc.v2v import Model, Maxout -from thinc.i2v import HashEmbed, StaticVectors -from thinc.t2t import ExtractWindow -from thinc.misc import Residual -from thinc.misc import LayerNorm as LN -from thinc.misc import FeatureExtracter -from thinc.api import layerize, chain, clone, concatenate, with_flatten -from thinc.api import uniqued, wrap, noop - -from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE - - -def Tok2Vec(width, embed_size, **kwargs): - # Circular imports :( - from .._ml import CharacterEmbed - from .._ml import PyTorchBiLSTM - - pretrained_vectors = kwargs.get("pretrained_vectors", None) - cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3) - subword_features = kwargs.get("subword_features", True) - char_embed = kwargs.get("char_embed", False) - if char_embed: - subword_features = False - conv_depth = kwargs.get("conv_depth", 4) - bilstm_depth = kwargs.get("bilstm_depth", 0) - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm") - if subword_features: - prefix = HashEmbed( - width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix" - ) - suffix = HashEmbed( - width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix" - ) - shape = HashEmbed( - width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape" - ) - else: - prefix, suffix, shape = (None, None, None) - if pretrained_vectors is not None: - glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID)) - - if subword_features: - embed = uniqued( - (glove | norm | prefix | suffix | shape) - >> LN(Maxout(width, width * 5, pieces=3)), - column=cols.index(ORTH), - ) - else: - embed = uniqued( - (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)), - column=cols.index(ORTH), - ) - elif subword_features: - embed = uniqued( - (norm | prefix | suffix | shape) - >> LN(Maxout(width, width * 4, pieces=3)), - column=cols.index(ORTH), - ) - elif char_embed: - embed = concatenate_lists( - CharacterEmbed(nM=64, nC=8), - FeatureExtracter(cols) >> with_flatten(norm), - ) - reduce_dimensions = LN( - Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces) - ) - else: - embed = norm - - convolution = Residual( - ExtractWindow(nW=1) - >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) - ) - if char_embed: - tok2vec = embed 
>> with_flatten( - reduce_dimensions >> convolution ** conv_depth, pad=conv_depth - ) - else: - tok2vec = FeatureExtracter(cols) >> with_flatten( - embed >> convolution ** conv_depth, pad=conv_depth - ) - - if bilstm_depth >= 1: - tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth) - # Work around thinc API limitations :(. TODO: Revise in Thinc 7 - tok2vec.nO = width - tok2vec.embed = embed - return tok2vec - - -@layerize -def flatten(seqs, drop=0.0): - ops = Model.ops - lengths = ops.asarray([len(seq) for seq in seqs], dtype="i") - - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths, pad=0) - - X = ops.flatten(seqs, pad=0) - return X, finish_update - - -def concatenate_lists(*layers, **kwargs): # pragma: no cover - """Compose two or more models `f`, `g`, etc, such that their outputs are - concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - """ - if not layers: - return noop() - drop_factor = kwargs.get("drop_factor", 1.0) - ops = layers[0].ops - layers = [chain(layer, flatten) for layer in layers] - concat = concatenate(*layers) - - def concatenate_lists_fwd(Xs, drop=0.0): - if drop is not None: - drop *= drop_factor - lengths = ops.asarray([len(X) for X in Xs], dtype="i") - flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) - ys = ops.unflatten(flat_y, lengths) - - def concatenate_lists_bwd(d_ys, sgd=None): - return bp_flat_y(ops.flatten(d_ys), sgd=sgd) - - return ys, concatenate_lists_bwd - - model = wrap(concatenate_lists_fwd, concat) - return model diff --git a/spacy/ml/_wire.py b/spacy/ml/_wire.py deleted file mode 100644 index fa271b37c..000000000 --- a/spacy/ml/_wire.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import unicode_literals -from thinc.api import layerize, wrap, noop, chain, concatenate -from thinc.v2v import Model - - -def concatenate_lists(*layers, **kwargs): # pragma: no cover - """Compose two or more models `f`, `g`, etc, such that their outputs are - concatenated, i.e. 
`concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - """ - if not layers: - return layerize(noop()) - drop_factor = kwargs.get("drop_factor", 1.0) - ops = layers[0].ops - layers = [chain(layer, flatten) for layer in layers] - concat = concatenate(*layers) - - def concatenate_lists_fwd(Xs, drop=0.0): - if drop is not None: - drop *= drop_factor - lengths = ops.asarray([len(X) for X in Xs], dtype="i") - flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) - ys = ops.unflatten(flat_y, lengths) - - def concatenate_lists_bwd(d_ys, sgd=None): - return bp_flat_y(ops.flatten(d_ys), sgd=sgd) - - return ys, concatenate_lists_bwd - - model = wrap(concatenate_lists_fwd, concat) - return model - - -@layerize -def flatten(seqs, drop=0.0): - ops = Model.ops - lengths = ops.asarray([len(seq) for seq in seqs], dtype="i") - - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths, pad=0) - - X = ops.flatten(seqs, pad=0) - return X, finish_update diff --git a/spacy/ml/common.py b/spacy/ml/common.py deleted file mode 100644 index f90b53a15..000000000 --- a/spacy/ml/common.py +++ /dev/null @@ -1,23 +0,0 @@ -from __future__ import unicode_literals - -from thinc.api import chain -from thinc.v2v import Maxout -from thinc.misc import LayerNorm -from ..util import registry, make_layer - - -@registry.architectures.register("thinc.FeedForward.v1") -def FeedForward(config): - layers = [make_layer(layer_cfg) for layer_cfg in config["layers"]] - model = chain(*layers) - model.cfg = config - return model - - -@registry.architectures.register("spacy.LayerNormalizedMaxout.v1") -def LayerNormalizedMaxout(config): - width = config["width"] - pieces = config["pieces"] - layer = LayerNorm(Maxout(width, pieces=pieces)) - layer.nO = width - return layer diff --git a/spacy/ml/component_models.py b/spacy/ml/component_models.py new file mode 100644 index 000000000..8c694f950 --- /dev/null +++ b/spacy/ml/component_models.py @@ -0,0 +1,227 @@ +from spacy import util +from spacy.ml.extract_ngrams import extract_ngrams + +from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE +from ..errors import Errors +from ._character_embed import CharacterEmbed + +from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged +from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors +from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain +from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued +from thinc.api import zero_init + + +def build_text_classifier(arch, config): + if arch == "cnn": + return build_simple_cnn_text_classifier(**config) + elif arch == "bow": + return build_bow_text_classifier(**config) + else: + raise ValueError("Unexpected textcat arch") + + +def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg): + """ + Build a simple CNN text classifier, given a token-to-vector model as inputs. + If exclusive_classes=True, a softmax non-linearity is applied, so that the + outputs sum to 1. If exclusive_classes=False, a logistic non-linearity + is applied instead, so that outputs are in the range [0, 1]. 
+ """ + with Model.define_operators({">>": chain}): + if exclusive_classes: + output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO")) + else: + # TODO: experiment with init_w=zero_init + output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic() + model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer + model.set_ref("tok2vec", tok2vec) + model.set_dim("nO", nr_class) + return model + + +def build_bow_text_classifier( + nr_class, exclusive_classes, ngram_size=1, no_output_layer=False, **cfg +): + with Model.define_operators({">>": chain}): + model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nr_class) + model.to_cpu() + if not no_output_layer: + output_layer = ( + Softmax(nO=nr_class) if exclusive_classes else Logistic(nO=nr_class) + ) + output_layer.to_cpu() + model = model >> output_layer + model.set_dim("nO", nr_class) + return model + + +def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): + if "entity_width" not in cfg: + raise ValueError(Errors.E144.format(param="entity_width")) + + conv_depth = cfg.get("conv_depth", 2) + cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) + pretrained_vectors = cfg.get("pretrained_vectors", None) + context_width = cfg.get("entity_width") + + with Model.define_operators({">>": chain, "**": clone}): + nel_tok2vec = Tok2Vec( + width=hidden_width, + embed_size=embed_width, + pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, + subword_features=True, + conv_depth=conv_depth, + bilstm_depth=0, + ) + + model = ( + nel_tok2vec + >> list2ragged() + >> reduce_mean() + >> residual(Maxout(nO=hidden_width, nI=hidden_width, nP=2, dropout=0.0)) + >> Linear(nO=context_width, nI=hidden_width) + ) + model.initialize() + + model.set_ref("tok2vec", nel_tok2vec) + model.set_dim("nO", context_width) + return model + + +def masked_language_model(*args, **kwargs): + raise NotImplementedError + + +def build_tagger_model(nr_class, tok2vec): + token_vector_width = tok2vec.get_dim("nO") + # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! 
+ softmax = with_array(Softmax(nO=nr_class, nI=token_vector_width, init_W=zero_init)) + model = chain(tok2vec, softmax) + model.set_ref("tok2vec", tok2vec) + model.set_ref("softmax", softmax) + return model + + +def build_morphologizer_model(class_nums, **cfg): + embed_size = util.env_opt("embed_size", 7000) + if "token_vector_width" in cfg: + token_vector_width = cfg["token_vector_width"] + else: + token_vector_width = util.env_opt("token_vector_width", 128) + pretrained_vectors = cfg.get("pretrained_vectors") + char_embed = cfg.get("char_embed", True) + with Model.define_operators({">>": chain, "+": add, "**": clone}): + if "tok2vec" in cfg: + tok2vec = cfg["tok2vec"] + else: + tok2vec = Tok2Vec( + token_vector_width, + embed_size, + char_embed=char_embed, + pretrained_vectors=pretrained_vectors, + ) + softmax = with_array(MultiSoftmax(nOs=class_nums, nI=token_vector_width)) + model = tok2vec >> softmax + model.set_ref("tok2vec", tok2vec) + model.set_ref("softmax", softmax) + return model + + +def Tok2Vec( + width, + embed_size, + pretrained_vectors=None, + window_size=1, + cnn_maxout_pieces=3, + subword_features=True, + char_embed=False, + conv_depth=4, + bilstm_depth=0, +): + if char_embed: + subword_features = False + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): + norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0) + if subword_features: + prefix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0 + ) + suffix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0 + ) + shape = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0 + ) + else: + prefix, suffix, shape = (None, None, None) + if pretrained_vectors is not None: + glove = StaticVectors( + vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0 + ) + + if subword_features: + embed = uniqued( + (glove | norm | prefix | suffix | shape) + >> Maxout( + nO=width, nI=width * 5, nP=3, dropout=0.0, normalize=True + ), + column=cols.index(ORTH), + ) + else: + embed = uniqued( + (glove | norm) + >> Maxout( + nO=width, nI=width * 2, nP=3, dropout=0.0, normalize=True + ), + column=cols.index(ORTH), + ) + elif subword_features: + embed = uniqued( + concatenate(norm, prefix, suffix, shape) + >> Maxout(nO=width, nI=width * 4, nP=3, dropout=0.0, normalize=True), + column=cols.index(ORTH), + ) + elif char_embed: + embed = CharacterEmbed(nM=64, nC=8) | FeatureExtractor(cols) >> with_array( + norm + ) + reduce_dimensions = Maxout( + nO=width, + nI=64 * 8 + width, + nP=cnn_maxout_pieces, + dropout=0.0, + normalize=True, + ) + else: + embed = norm + + convolution = residual( + expand_window(window_size=window_size) + >> Maxout( + nO=width, + nI=width * 3, + nP=cnn_maxout_pieces, + dropout=0.0, + normalize=True, + ) + ) + if char_embed: + tok2vec = embed >> with_array( + reduce_dimensions >> convolution ** conv_depth, pad=conv_depth + ) + else: + tok2vec = FeatureExtractor(cols) >> with_array( + embed >> convolution ** conv_depth, pad=conv_depth + ) + + if bilstm_depth >= 1: + tok2vec = tok2vec >> PyTorchLSTM( + nO=width, nI=width, depth=bilstm_depth, bi=True + ) + # Work around thinc API limitations :(. 
TODO: Revise in Thinc 7 + tok2vec.set_dim("nO", width) + tok2vec.set_ref("embed", embed) + return tok2vec diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py new file mode 100644 index 000000000..d4195b9a4 --- /dev/null +++ b/spacy/ml/extract_ngrams.py @@ -0,0 +1,36 @@ +import numpy +from thinc.api import Model + +from ..attrs import LOWER + + +def extract_ngrams(ngram_size, attr=LOWER) -> Model: + model = Model("extract_ngrams", forward) + model.attrs["ngram_size"] = ngram_size + model.attrs["attr"] = attr + return model + + +def forward(self, docs, is_train: bool): + batch_keys = [] + batch_vals = [] + for doc in docs: + unigrams = doc.to_array([self.attrs["attr"]]) + ngrams = [unigrams] + for n in range(2, self.attrs["ngram_size"] + 1): + ngrams.append(self.ops.ngrams(n, unigrams)) + keys = self.ops.xp.concatenate(ngrams) + keys, vals = self.ops.xp.unique(keys, return_counts=True) + batch_keys.append(keys) + batch_vals.append(vals) + # The dtype here matches what thinc is expecting -- which differs per + # platform (by int definition). This should be fixed once the problem + # is fixed on Thinc's side. + lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_) + batch_keys = self.ops.xp.concatenate(batch_keys) + batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f") + + def backprop(dY): + return dY + + return (batch_keys, batch_vals, lengths), backprop diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py index 8f86475ef..5e51bc47a 100644 --- a/spacy/ml/tok2vec.py +++ b/spacy/ml/tok2vec.py @@ -1,13 +1,9 @@ -from __future__ import unicode_literals +from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop +from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors +from thinc.api import residual, LayerNorm, FeatureExtractor -from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued -from thinc.api import noop, with_square_sequences -from thinc.v2v import Maxout, Model -from thinc.i2v import HashEmbed, StaticVectors -from thinc.t2t import ExtractWindow -from thinc.misc import Residual, LayerNorm, FeatureExtracter +from ..ml import _character_embed from ..util import make_layer, registry -from ._wire import concatenate_lists @registry.architectures.register("spacy.Tok2Vec.v1") @@ -15,19 +11,21 @@ def Tok2Vec(config): doc2feats = make_layer(config["@doc2feats"]) embed = make_layer(config["@embed"]) encode = make_layer(config["@encode"]) - field_size = getattr(encode, "receptive_field", 0) - tok2vec = chain(doc2feats, with_flatten(chain(embed, encode), pad=field_size)) - tok2vec.cfg = config - tok2vec.nO = encode.nO - tok2vec.embed = embed - tok2vec.encode = encode + field_size = 0 + if encode.has_attr("receptive_field"): + field_size = encode.attrs["receptive_field"] + tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size)) + tok2vec.attrs["cfg"] = config + tok2vec.set_dim("nO", encode.get_dim("nO")) + tok2vec.set_ref("embed", embed) + tok2vec.set_ref("encode", encode) return tok2vec @registry.architectures.register("spacy.Doc2Feats.v1") def Doc2Feats(config): columns = config["columns"] - return FeatureExtracter(columns) + return FeatureExtractor(columns) @registry.architectures.register("spacy.MultiHashEmbed.v1") @@ -42,55 +40,47 @@ def MultiHashEmbed(config): width = config["width"] rows = config["rows"] - norm = HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm") + norm = HashEmbed(width, rows, 
column=cols.index("NORM"), dropout=0.0) if config["use_subwords"]: - prefix = HashEmbed( - width, rows // 2, column=cols.index("PREFIX"), name="embed_prefix" - ) - suffix = HashEmbed( - width, rows // 2, column=cols.index("SUFFIX"), name="embed_suffix" - ) - shape = HashEmbed( - width, rows // 2, column=cols.index("SHAPE"), name="embed_shape" - ) + prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"), dropout=0.0) + suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"), dropout=0.0) + shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"), dropout=0.0) if config.get("@pretrained_vectors"): glove = make_layer(config["@pretrained_vectors"]) mix = make_layer(config["@mix"]) with Model.define_operators({">>": chain, "|": concatenate}): if config["use_subwords"] and config["@pretrained_vectors"]: - mix._layers[0].nI = width * 5 + mix._layers[0].set_dim("nI", width * 5) layer = uniqued( (glove | norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH"), ) elif config["use_subwords"]: - mix._layers[0].nI = width * 4 + mix._layers[0].set_dim("nI", width * 4) layer = uniqued( (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH") ) elif config["@pretrained_vectors"]: - mix._layers[0].nI = width * 2 + mix._layers[0].set_dim("nI", width * 2) layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),) else: layer = norm - layer.cfg = config + layer.attrs["cfg"] = config return layer @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(config): - from .. import _ml - width = config["width"] chars = config["chars"] - chr_embed = _ml.CharacterEmbedModel(nM=width, nC=chars) + chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars) other_tables = make_layer(config["@embed_features"]) mix = make_layer(config["@mix"]) - model = chain(concatenate_lists(chr_embed, other_tables), mix) - model.cfg = config + model = chain(concatenate(chr_embed, other_tables), mix) + model.attrs["cfg"] = config return model @@ -100,49 +90,61 @@ def MaxoutWindowEncoder(config): nW = config["window_size"] nP = config["pieces"] depth = config["depth"] - - cnn = chain( - ExtractWindow(nW=nW), LayerNorm(Maxout(nO, nO * ((nW * 2) + 1), pieces=nP)) + cnn = ( + expand_window(window_size=nW), + Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True), ) - model = clone(Residual(cnn), depth) - model.nO = nO - model.receptive_field = nW * depth + model = clone(residual(cnn), depth) + model.set_dim("nO", nO) + model.attrs["receptive_field"] = nW * depth return model @registry.architectures.register("spacy.MishWindowEncoder.v1") def MishWindowEncoder(config): - from thinc.v2v import Mish + from thinc.api import Mish nO = config["width"] nW = config["window_size"] depth = config["depth"] - - cnn = chain(ExtractWindow(nW=nW), LayerNorm(Mish(nO, nO * ((nW * 2) + 1)))) - model = clone(Residual(cnn), depth) - model.nO = nO + cnn = chain( + expand_window(window_size=nW), + Mish(nO=nO, nI=nO * ((nW * 2) + 1)), + LayerNorm(nO), + ) + model = clone(residual(cnn), depth) + model.set_dim("nO", nO) return model @registry.architectures.register("spacy.PretrainedVectors.v1") def PretrainedVectors(config): - return StaticVectors(config["vectors_name"], config["width"], config["column"]) + # TODO: actual vectors instead of name + return StaticVectors( + vectors=config["vectors_name"], + nO=config["width"], + column=config["column"], + dropout=0.0, + ) @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") def TorchBiLSTMEncoder(config): 
import torch.nn - from thinc.extra.wrappers import PyTorchWrapperRNN + + # TODO: FIX + from thinc.api import PyTorchRNNWrapper width = config["width"] depth = config["depth"] if depth == 0: - return layerize(noop()) - return with_square_sequences( - PyTorchWrapperRNN(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) + return noop() + return with_padded( + PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) ) +# TODO: update _EXAMPLE_CONFIG = { "@doc2feats": { "arch": "Doc2Feats", diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 1a3cedf97..1e8c255b8 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -2,6 +2,7 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap, PreshMapArray from libc.stdint cimport uint64_t from murmurhash cimport mrmr +cimport numpy as np from .structs cimport TokenC, MorphAnalysisC from .strings cimport StringStore @@ -20,12 +21,11 @@ cdef class Morphology: cdef readonly object tag_names cdef readonly object reverse_index cdef readonly object exc - cdef readonly object _feat_map cdef readonly PreshMapArray _cache cdef readonly int n_tags - cpdef update(self, hash_t morph, features) - cdef hash_t insert(self, MorphAnalysisC tag) except 0 + cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except * + cdef int insert(self, MorphAnalysisC tag) except -1 cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1 @@ -34,8 +34,7 @@ cdef class Morphology: cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 -cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil -cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil -cdef list list_features(const MorphAnalysisC* tag) - -cdef tag_to_json(const MorphAnalysisC* tag) +cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil +cdef list list_features(const MorphAnalysisC* morph) +cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field) +cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c146094a9..3003d118f 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,12 +1,9 @@ # cython: infer_types -# coding: utf8 -from __future__ import unicode_literals - from libc.string cimport memset import srsly from collections import Counter +import numpy -from .compat import basestring_ from .strings import get_string_id from . 
import symbols from .attrs cimport POS, IS_SPACE @@ -14,130 +11,38 @@ from .attrs import LEMMA, intify_attrs from .parts_of_speech cimport SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme -from .errors import Errors +from .errors import Errors, Warnings, user_warning from .util import ensure_path -cdef enum univ_field_t: - Field_POS - Field_Abbr - Field_AdpType - Field_AdvType - Field_Animacy - Field_Aspect - Field_Case - Field_ConjType - Field_Connegative - Field_Definite - Field_Degree - Field_Derivation - Field_Echo - Field_Foreign - Field_Gender - Field_Hyph - Field_InfForm - Field_Mood - Field_NameType - Field_Negative - Field_NounType - Field_Number - Field_NumForm - Field_NumType - Field_NumValue - Field_PartForm - Field_PartType - Field_Person - Field_Polarity - Field_Polite - Field_Poss - Field_Prefix - Field_PrepCase - Field_PronType - Field_PunctSide - Field_PunctType - Field_Reflex - Field_Style - Field_StyleVariant - Field_Tense - Field_Typo - Field_VerbForm - Field_VerbType - Field_Voice - - def _normalize_props(props): - """Transform deprecated string keys to correct names.""" + """Convert attrs dict so that POS is always by ID, other features are left + as is as long as they are strings or IDs. + """ out = {} props = dict(props) - for key in FIELDS: - if key in props: - value = str(props[key]).lower() - # We don't have support for disjunctive int|rel features, so - # just take the first one :( - if "|" in value: - value = value.split("|")[0] - attr = '%s_%s' % (key, value) - if attr in FEATURES: - props.pop(key) - props[attr] = True for key, value in props.items(): + # convert POS value to ID if key == POS: if hasattr(value, 'upper'): value = value.upper() if value in POS_IDS: value = POS_IDS[value] out[key] = value - elif isinstance(key, int): - out[key] = value - elif value is True: - out[key] = value - elif key.lower() == 'pos': + elif isinstance(key, str) and key.lower() == 'pos': out[POS] = POS_IDS[value.upper()] - elif key.lower() != 'morph': + # sort values + elif isinstance(value, str) and Morphology.VALUE_SEP in value: + out[key] = Morphology.VALUE_SEP.join( + sorted(value.split(Morphology.VALUE_SEP))) + # accept any string or ID fields and values + elif isinstance(key, (int, str)) and isinstance(value, (int, str)): out[key] = value + else: + user_warning(Warnings.W028.format(feature={key: value})) return out -class MorphologyClassMap(object): - def __init__(self, features): - self.features = tuple(features) - self.fields = [] - self.feat2field = {} - seen_fields = set() - for feature in features: - field = feature.split("_", 1)[0] - if field not in seen_fields: - self.fields.append(field) - seen_fields.add(field) - self.feat2field[feature] = FIELDS[field] - self.id2feat = {get_string_id(name): name for name in features} - self.field2feats = {"POS": []} - self.col2info = [] - self.attr2field = dict(LOWER_FIELDS.items()) - self.feat2offset = {} - self.field2col = {} - self.field2id = dict(FIELDS.items()) - self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()} - for feature in features: - field = self.fields[self.feat2field[feature]] - if field not in self.field2col: - self.field2col[field] = len(self.col2info) - if field != "POS" and field not in self.field2feats: - self.col2info.append((field, 0, "NIL")) - self.field2feats.setdefault(field, ["NIL"]) - offset = len(self.field2feats[field]) - self.field2feats[field].append(feature) - self.col2info.append((field, offset, feature)) - self.feat2offset[feature] = offset 
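One consequence of the value-sorting in `_normalize_props` is that equivalent analyses normalise to a single canonical string, whatever order the values arrive in. A sketch of just that step, with `VALUE_SEP` being ",":

props = {"PronType": "Rel,Dem"}
out = {k: ",".join(sorted(v.split(","))) for k, v in props.items()}
assert out == {"PronType": "Dem,Rel"}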
- - @property - def field_sizes(self): - return [len(self.field2feats[field]) for field in self.fields] - - def get_field_offset(self, field): - return self.field2col[field] - - cdef class Morphology: '''Store the possible morphological analyses for a language, and index them by hash. @@ -146,9 +51,15 @@ cdef class Morphology: analysis, so queries of morphological attributes are delegated to this class. ''' - def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): + + FEATURE_SEP = "|" + FIELD_SEP = "=" + VALUE_SEP = "," + EMPTY_MORPH = "_" + + def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None): self.mem = Pool() - self.strings = string_store + self.strings = strings self.tags = PreshMap() # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. @@ -162,7 +73,6 @@ cdef class Morphology: self.lemmatizer = lemmatizer self.n_tags = len(tag_map) self.reverse_index = {} - self._feat_map = MorphologyClassMap(FEATURES) self._load_from_tag_map(tag_map) self._cache = PreshMapArray(self.n_tags) @@ -176,8 +86,7 @@ cdef class Morphology: def _load_from_tag_map(self, tag_map): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): attrs = _normalize_props(attrs) - self.add({self._feat_map.id2feat[feat] for feat in attrs - if feat in self._feat_map.id2feat}) + self.add(attrs) self.tag_map[tag_str] = dict(attrs) self.reverse_index[self.strings.add(tag_str)] = i @@ -186,40 +95,78 @@ cdef class Morphology: self.exc), None, None) def add(self, features): - """Insert a morphological analysis in the morphology table, if not already - present. Returns the hash of the new analysis. + """Insert a morphological analysis in the morphology table, if not + already present. The morphological analysis may be provided in the UD + FEATS format as a string or in the tag map dict format. + Returns the hash of the new analysis. 
+ """ + cdef MorphAnalysisC* tag_ptr + if features == self.EMPTY_MORPH: + features = "" + if isinstance(features, str): + tag_ptr = self.tags.get(self.strings[features]) + if tag_ptr != NULL: + return tag_ptr.key + features = self.feats_to_dict(features) + if not isinstance(features, dict): + user_warning(Warnings.W028.format(feature=features)) + features = {} + features = _normalize_props(features) + string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} + # normalized UFEATS string with sorted fields and values + norm_feats_string = self.FEATURE_SEP.join(sorted([ + self.FIELD_SEP.join([field, values]) + for field, values in string_features.items() + ])) + # intified ("Field", "Field=Value") pairs + field_feature_pairs = [] + for field in sorted(string_features): + values = string_features[field] + for value in values.split(self.VALUE_SEP): + field_feature_pairs.append(( + self.strings.add(field), + self.strings.add(field + self.FIELD_SEP + value), + )) + cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs) + # the hash key for the tag is either the hash of the normalized UFEATS + # string or the hash of an empty placeholder (using the empty string + # would give a hash key of 0, which is not good for PreshMap) + if norm_feats_string: + tag.key = self.strings.add(norm_feats_string) + else: + tag.key = self.strings.add(self.EMPTY_MORPH) + self.insert(tag) + return tag.key + + cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *: + """Creates a MorphAnalysisC from a list of intified + ("Field", "Field=Value") tuples where fields with multiple values have + been split into individual tuples, e.g.: + [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"), + ("Field2", "Field2=Value3")] """ - for f in features: - if isinstance(f, basestring_): - self.strings.add(f) - string_features = features - features = intify_features(features) - cdef attr_t feature - for feature in features: - if feature != 0 and feature not in self._feat_map.id2feat: - raise ValueError(Errors.E167.format(feat=self.strings[feature], feat_id=feature)) cdef MorphAnalysisC tag - tag = create_rich_tag(features) - cdef hash_t key = self.insert(tag) - return key + tag.length = len(field_feature_pairs) + tag.fields = self.mem.alloc(tag.length, sizeof(attr_t)) + tag.features = self.mem.alloc(tag.length, sizeof(attr_t)) + for i, (field, feature) in enumerate(field_feature_pairs): + tag.fields[i] = field + tag.features[i] = feature + return tag + + cdef int insert(self, MorphAnalysisC tag) except -1: + cdef hash_t key = tag.key + if self.tags.get(key) == NULL: + tag_ptr = self.mem.alloc(1, sizeof(MorphAnalysisC)) + tag_ptr[0] = tag + self.tags.set(key, tag_ptr) def get(self, hash_t morph): tag = self.tags.get(morph) if tag == NULL: return [] else: - return tag_to_json(tag) - - cpdef update(self, hash_t morph, features): - """Update a morphological analysis with new feature values.""" - tag = (self.tags.get(morph))[0] - features = intify_features(features) - cdef attr_t feature - for feature in features: - field = FEATURE_FIELDS[FEATURE_NAMES[feature]] - set_feature(&tag, field, feature, 1) - morph = self.insert(tag) - return morph + return self.strings[tag.key] def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): if orth not in self.strings: @@ -253,19 +200,10 @@ cdef class Morphology: """ attrs = dict(attrs) attrs = _normalize_props(attrs) - self.add({self._feat_map.id2feat[feat] for feat in attrs - if feat 
in self._feat_map.id2feat}) + self.add(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self.exc[(tag_str, self.strings.add(orth_str))] = attrs - cdef hash_t insert(self, MorphAnalysisC tag) except 0: - cdef hash_t key = hash_tag(tag) - if self.tags.get(key) == NULL: - tag_ptr = self.mem.alloc(1, sizeof(MorphAnalysisC)) - tag_ptr[0] = tag - self.tags.set(key, tag_ptr) - return key - cdef int assign_untagged(self, TokenC* token) except -1: """Set morphological attributes on a token without a POS tag. Uses the lemmatizer's lookup() method, which looks up the string in the @@ -326,782 +264,60 @@ cdef class Morphology: for form_str, attrs in entries.items(): self.add_special_case(tag_str, form_str, attrs) - @classmethod - def create_class_map(cls): - return MorphologyClassMap(FEATURES) + @staticmethod + def feats_to_dict(feats): + if not feats: + return {} + return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in + [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} + + @staticmethod + def dict_to_feats(feats_dict): + if len(feats_dict) == 0: + return "" + return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()])) + + @staticmethod + def list_to_feats(feats_list): + if len(feats_list) == 0: + return "" + feats_dict = {} + for feat in feats_list: + field, value = feat.split(Morphology.FIELD_SEP) + if field not in feats_dict: + feats_dict[field] = set() + feats_dict[field].add(value) + feats_dict = {field: Morphology.VALUE_SEP.join(sorted(values)) for field, values in feats_dict.items()} + return Morphology.dict_to_feats(feats_dict) -cpdef univ_pos_t get_int_tag(pos_): - return 0 - -cpdef intify_features(features): - return {get_string_id(feature) for feature in features} - -cdef hash_t hash_tag(MorphAnalysisC tag) nogil: - return mrmr.hash64(&tag, sizeof(tag), 0) +cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil: + cdef int i + for i in range(morph.length): + if morph.features[i] == feature: + return True + return False -cdef MorphAnalysisC create_rich_tag(features) except *: - cdef MorphAnalysisC tag - cdef attr_t feature - memset(&tag, 0, sizeof(tag)) - for feature in features: - field = FEATURE_FIELDS[FEATURE_NAMES[feature]] - set_feature(&tag, field, feature, 1) - return tag +cdef list list_features(const MorphAnalysisC* morph): + cdef int i + features = [] + for i in range(morph.length): + features.append(morph.features[i]) + return features -cdef tag_to_json(const MorphAnalysisC* tag): - return [FEATURE_NAMES[f] for f in list_features(tag)] +cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field): + cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64") + n = get_n_by_field(results.data, morph, field) + return results[:n] -cdef MorphAnalysisC tag_from_json(json_tag): - raise NotImplementedError - - -cdef list list_features(const MorphAnalysisC* tag): - output = [] - if tag.abbr != 0: - output.append(tag.abbr) - if tag.adp_type != 0: - output.append(tag.adp_type) - if tag.adv_type != 0: - output.append(tag.adv_type) - if tag.animacy != 0: - output.append(tag.animacy) - if tag.aspect != 0: - output.append(tag.aspect) - if tag.case != 0: - output.append(tag.case) - if tag.conj_type != 0: - output.append(tag.conj_type) - if tag.connegative != 0: - output.append(tag.connegative) - if tag.definite != 0: - 
output.append(tag.definite) - if tag.degree != 0: - output.append(tag.degree) - if tag.derivation != 0: - output.append(tag.derivation) - if tag.echo != 0: - output.append(tag.echo) - if tag.foreign != 0: - output.append(tag.foreign) - if tag.gender != 0: - output.append(tag.gender) - if tag.hyph != 0: - output.append(tag.hyph) - if tag.inf_form != 0: - output.append(tag.inf_form) - if tag.mood != 0: - output.append(tag.mood) - if tag.negative != 0: - output.append(tag.negative) - if tag.number != 0: - output.append(tag.number) - if tag.name_type != 0: - output.append(tag.name_type) - if tag.noun_type != 0: - output.append(tag.noun_type) - if tag.part_form != 0: - output.append(tag.part_form) - if tag.part_type != 0: - output.append(tag.part_type) - if tag.person != 0: - output.append(tag.person) - if tag.polite != 0: - output.append(tag.polite) - if tag.polarity != 0: - output.append(tag.polarity) - if tag.poss != 0: - output.append(tag.poss) - if tag.prefix != 0: - output.append(tag.prefix) - if tag.prep_case != 0: - output.append(tag.prep_case) - if tag.pron_type != 0: - output.append(tag.pron_type) - if tag.punct_type != 0: - output.append(tag.punct_type) - if tag.reflex != 0: - output.append(tag.reflex) - if tag.style != 0: - output.append(tag.style) - if tag.style_variant != 0: - output.append(tag.style_variant) - if tag.typo != 0: - output.append(tag.typo) - if tag.verb_form != 0: - output.append(tag.verb_form) - if tag.voice != 0: - output.append(tag.voice) - if tag.verb_type != 0: - output.append(tag.verb_type) - return output - - -cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil: - field = field_id - if field == Field_POS: - return tag.pos - if field == Field_Abbr: - return tag.abbr - elif field == Field_AdpType: - return tag.adp_type - elif field == Field_AdvType: - return tag.adv_type - elif field == Field_Animacy: - return tag.animacy - elif field == Field_Aspect: - return tag.aspect - elif field == Field_Case: - return tag.case - elif field == Field_ConjType: - return tag.conj_type - elif field == Field_Connegative: - return tag.connegative - elif field == Field_Definite: - return tag.definite - elif field == Field_Degree: - return tag.degree - elif field == Field_Derivation: - return tag.derivation - elif field == Field_Echo: - return tag.echo - elif field == Field_Foreign: - return tag.foreign - elif field == Field_Gender: - return tag.gender - elif field == Field_Hyph: - return tag.hyph - elif field == Field_InfForm: - return tag.inf_form - elif field == Field_Mood: - return tag.mood - elif field == Field_Negative: - return tag.negative - elif field == Field_Number: - return tag.number - elif field == Field_NameType: - return tag.name_type - elif field == Field_NounType: - return tag.noun_type - elif field == Field_NumForm: - return tag.num_form - elif field == Field_NumType: - return tag.num_type - elif field == Field_NumValue: - return tag.num_value - elif field == Field_PartForm: - return tag.part_form - elif field == Field_PartType: - return tag.part_type - elif field == Field_Person: - return tag.person - elif field == Field_Polite: - return tag.polite - elif field == Field_Polarity: - return tag.polarity - elif field == Field_Poss: - return tag.poss - elif field == Field_Prefix: - return tag.prefix - elif field == Field_PrepCase: - return tag.prep_case - elif field == Field_PronType: - return tag.pron_type - elif field == Field_PunctSide: - return tag.punct_side - elif field == Field_PunctType: - return tag.punct_type - elif field == Field_Reflex: 
- return tag.reflex - elif field == Field_Style: - return tag.style - elif field == Field_StyleVariant: - return tag.style_variant - elif field == Field_Tense: - return tag.tense - elif field == Field_Typo: - return tag.typo - elif field == Field_VerbForm: - return tag.verb_form - elif field == Field_Voice: - return tag.voice - elif field == Field_VerbType: - return tag.verb_type - else: - raise ValueError(Errors.E168.format(field=field_id)) - - -cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil: - if tag.abbr == feature: - return 1 - elif tag.adp_type == feature: - return 1 - elif tag.adv_type == feature: - return 1 - elif tag.animacy == feature: - return 1 - elif tag.aspect == feature: - return 1 - elif tag.case == feature: - return 1 - elif tag.conj_type == feature: - return 1 - elif tag.connegative == feature: - return 1 - elif tag.definite == feature: - return 1 - elif tag.degree == feature: - return 1 - elif tag.derivation == feature: - return 1 - elif tag.echo == feature: - return 1 - elif tag.foreign == feature: - return 1 - elif tag.gender == feature: - return 1 - elif tag.hyph == feature: - return 1 - elif tag.inf_form == feature: - return 1 - elif tag.mood == feature: - return 1 - elif tag.negative == feature: - return 1 - elif tag.number == feature: - return 1 - elif tag.name_type == feature: - return 1 - elif tag.noun_type == feature: - return 1 - elif tag.num_form == feature: - return 1 - elif tag.num_type == feature: - return 1 - elif tag.num_value == feature: - return 1 - elif tag.part_form == feature: - return 1 - elif tag.part_type == feature: - return 1 - elif tag.person == feature: - return 1 - elif tag.polite == feature: - return 1 - elif tag.polarity == feature: - return 1 - elif tag.poss == feature: - return 1 - elif tag.prefix == feature: - return 1 - elif tag.prep_case == feature: - return 1 - elif tag.pron_type == feature: - return 1 - elif tag.punct_side == feature: - return 1 - elif tag.punct_type == feature: - return 1 - elif tag.reflex == feature: - return 1 - elif tag.style == feature: - return 1 - elif tag.style_variant == feature: - return 1 - elif tag.tense == feature: - return 1 - elif tag.typo == feature: - return 1 - elif tag.verb_form == feature: - return 1 - elif tag.voice == feature: - return 1 - elif tag.verb_type == feature: - return 1 - else: - return 0 - -cdef int set_feature(MorphAnalysisC* tag, - univ_field_t field, attr_t feature, int value) except -1: - if value == True: - value_ = feature - else: - value_ = 0 - prev_value = get_field(tag, field) - if prev_value != 0 and value_ == 0 and field != Field_POS: - tag.length -= 1 - elif prev_value == 0 and value_ != 0 and field != Field_POS: - tag.length += 1 - if feature == 0: - pass - elif field == Field_POS: - tag.pos = get_string_id(FEATURE_NAMES[value_].split('_')[1]) - elif field == Field_Abbr: - tag.abbr = value_ - elif field == Field_AdpType: - tag.adp_type = value_ - elif field == Field_AdvType: - tag.adv_type = value_ - elif field == Field_Animacy: - tag.animacy = value_ - elif field == Field_Aspect: - tag.aspect = value_ - elif field == Field_Case: - tag.case = value_ - elif field == Field_ConjType: - tag.conj_type = value_ - elif field == Field_Connegative: - tag.connegative = value_ - elif field == Field_Definite: - tag.definite = value_ - elif field == Field_Degree: - tag.degree = value_ - elif field == Field_Derivation: - tag.derivation = value_ - elif field == Field_Echo: - tag.echo = value_ - elif field == Field_Foreign: - tag.foreign = value_ - elif field == 
Field_Gender: - tag.gender = value_ - elif field == Field_Hyph: - tag.hyph = value_ - elif field == Field_InfForm: - tag.inf_form = value_ - elif field == Field_Mood: - tag.mood = value_ - elif field == Field_Negative: - tag.negative = value_ - elif field == Field_Number: - tag.number = value_ - elif field == Field_NameType: - tag.name_type = value_ - elif field == Field_NounType: - tag.noun_type = value_ - elif field == Field_NumForm: - tag.num_form = value_ - elif field == Field_NumType: - tag.num_type = value_ - elif field == Field_NumValue: - tag.num_value = value_ - elif field == Field_PartForm: - tag.part_form = value_ - elif field == Field_PartType: - tag.part_type = value_ - elif field == Field_Person: - tag.person = value_ - elif field == Field_Polite: - tag.polite = value_ - elif field == Field_Polarity: - tag.polarity = value_ - elif field == Field_Poss: - tag.poss = value_ - elif field == Field_Prefix: - tag.prefix = value_ - elif field == Field_PrepCase: - tag.prep_case = value_ - elif field == Field_PronType: - tag.pron_type = value_ - elif field == Field_PunctSide: - tag.punct_side = value_ - elif field == Field_PunctType: - tag.punct_type = value_ - elif field == Field_Reflex: - tag.reflex = value_ - elif field == Field_Style: - tag.style = value_ - elif field == Field_StyleVariant: - tag.style_variant = value_ - elif field == Field_Tense: - tag.tense = value_ - elif field == Field_Typo: - tag.typo = value_ - elif field == Field_VerbForm: - tag.verb_form = value_ - elif field == Field_Voice: - tag.voice = value_ - elif field == Field_VerbType: - tag.verb_type = value_ - else: - raise ValueError(Errors.E167.format(field=FEATURE_NAMES.get(feature), field_id=feature)) - - -FIELDS = { - 'POS': Field_POS, - 'Abbr': Field_Abbr, - 'AdpType': Field_AdpType, - 'AdvType': Field_AdvType, - 'Animacy': Field_Animacy, - 'Aspect': Field_Aspect, - 'Case': Field_Case, - 'ConjType': Field_ConjType, - 'Connegative': Field_Connegative, - 'Definite': Field_Definite, - 'Degree': Field_Degree, - 'Derivation': Field_Derivation, - 'Echo': Field_Echo, - 'Foreign': Field_Foreign, - 'Gender': Field_Gender, - 'Hyph': Field_Hyph, - 'InfForm': Field_InfForm, - 'Mood': Field_Mood, - 'NameType': Field_NameType, - 'Negative': Field_Negative, - 'NounType': Field_NounType, - 'Number': Field_Number, - 'NumForm': Field_NumForm, - 'NumType': Field_NumType, - 'NumValue': Field_NumValue, - 'PartForm': Field_PartForm, - 'PartType': Field_PartType, - 'Person': Field_Person, - 'Polite': Field_Polite, - 'Polarity': Field_Polarity, - 'Poss': Field_Poss, - 'Prefix': Field_Prefix, - 'PrepCase': Field_PrepCase, - 'PronType': Field_PronType, - 'PunctSide': Field_PunctSide, - 'PunctType': Field_PunctType, - 'Reflex': Field_Reflex, - 'Style': Field_Style, - 'StyleVariant': Field_StyleVariant, - 'Tense': Field_Tense, - 'Typo': Field_Typo, - 'VerbForm': Field_VerbForm, - 'VerbType': Field_VerbType, - 'Voice': Field_Voice, -} - -LOWER_FIELDS = { - 'pos': Field_POS, - 'abbr': Field_Abbr, - 'adp_type': Field_AdpType, - 'adv_type': Field_AdvType, - 'animacy': Field_Animacy, - 'aspect': Field_Aspect, - 'case': Field_Case, - 'conj_type': Field_ConjType, - 'connegative': Field_Connegative, - 'definite': Field_Definite, - 'degree': Field_Degree, - 'derivation': Field_Derivation, - 'echo': Field_Echo, - 'foreign': Field_Foreign, - 'gender': Field_Gender, - 'hyph': Field_Hyph, - 'inf_form': Field_InfForm, - 'mood': Field_Mood, - 'name_type': Field_NameType, - 'negative': Field_Negative, - 'noun_type': Field_NounType, - 'number': 
Field_Number, - 'num_form': Field_NumForm, - 'num_type': Field_NumType, - 'num_value': Field_NumValue, - 'part_form': Field_PartForm, - 'part_type': Field_PartType, - 'person': Field_Person, - 'polarity': Field_Polarity, - 'polite': Field_Polite, - 'poss': Field_Poss, - 'prefix': Field_Prefix, - 'prep_case': Field_PrepCase, - 'pron_type': Field_PronType, - 'punct_side': Field_PunctSide, - 'punct_type': Field_PunctType, - 'reflex': Field_Reflex, - 'style': Field_Style, - 'style_variant': Field_StyleVariant, - 'tense': Field_Tense, - 'typo': Field_Typo, - 'verb_form': Field_VerbForm, - 'verb_type': Field_VerbType, - 'voice': Field_Voice, -} - - -FEATURES = [ - "POS_ADJ", - "POS_ADP", - "POS_ADV", - "POS_AUX", - "POS_CONJ", - "POS_CCONJ", - "POS_DET", - "POS_INTJ", - "POS_NOUN", - "POS_NUM", - "POS_PART", - "POS_PRON", - "POS_PROPN", - "POS_PUNCT", - "POS_SCONJ", - "POS_SYM", - "POS_VERB", - "POS_X", - "POS_EOL", - "POS_SPACE", - "Abbr_yes", - "AdpType_circ", - "AdpType_comprep", - "AdpType_prep", - "AdpType_post", - "AdpType_voc", - "AdvType_adadj", - "AdvType_cau", - "AdvType_deg", - "AdvType_ex", - "AdvType_loc", - "AdvType_man", - "AdvType_mod", - "AdvType_sta", - "AdvType_tim", - "Animacy_anim", - "Animacy_hum", - "Animacy_inan", - "Animacy_nhum", - "Aspect_hab", - "Aspect_imp", - "Aspect_iter", - "Aspect_perf", - "Aspect_prog", - "Aspect_prosp", - "Aspect_none", - "Case_abe", - "Case_abl", - "Case_abs", - "Case_acc", - "Case_ade", - "Case_all", - "Case_cau", - "Case_com", - "Case_dat", - "Case_del", - "Case_dis", - "Case_ela", - "Case_ess", - "Case_gen", - "Case_ill", - "Case_ine", - "Case_ins", - "Case_loc", - "Case_lat", - "Case_nom", - "Case_par", - "Case_sub", - "Case_sup", - "Case_tem", - "Case_ter", - "Case_tra", - "Case_voc", - "ConjType_comp", - "ConjType_oper", - "Connegative_yes", - "Definite_cons", - "Definite_def", - "Definite_ind", - "Definite_red", - "Definite_two", - "Degree_abs", - "Degree_cmp", - "Degree_comp", - "Degree_none", - "Degree_pos", - "Degree_sup", - "Degree_com", - "Degree_dim", - "Derivation_minen", - "Derivation_sti", - "Derivation_inen", - "Derivation_lainen", - "Derivation_ja", - "Derivation_ton", - "Derivation_vs", - "Derivation_ttain", - "Derivation_ttaa", - "Echo_rdp", - "Echo_ech", - "Foreign_foreign", - "Foreign_fscript", - "Foreign_tscript", - "Foreign_yes", - "Gender_com", - "Gender_fem", - "Gender_masc", - "Gender_neut", - "Gender_dat_masc", - "Gender_dat_fem", - "Gender_erg_masc", - "Gender_erg_fem", - "Gender_psor_masc", - "Gender_psor_fem", - "Gender_psor_neut", - "Hyph_yes", - "InfForm_one", - "InfForm_two", - "InfForm_three", - "Mood_cnd", - "Mood_imp", - "Mood_ind", - "Mood_n", - "Mood_pot", - "Mood_sub", - "Mood_opt", - "NameType_geo", - "NameType_prs", - "NameType_giv", - "NameType_sur", - "NameType_nat", - "NameType_com", - "NameType_pro", - "NameType_oth", - "Negative_neg", - "Negative_pos", - "Negative_yes", - "NounType_com", - "NounType_prop", - "NounType_class", - "Number_com", - "Number_dual", - "Number_none", - "Number_plur", - "Number_sing", - "Number_ptan", - "Number_count", - "Number_abs_sing", - "Number_abs_plur", - "Number_dat_sing", - "Number_dat_plur", - "Number_erg_sing", - "Number_erg_plur", - "Number_psee_sing", - "Number_psee_plur", - "Number_psor_sing", - "Number_psor_plur", - "NumForm_digit", - "NumForm_roman", - "NumForm_word", - "NumForm_combi", - "NumType_card", - "NumType_dist", - "NumType_frac", - "NumType_gen", - "NumType_mult", - "NumType_none", - "NumType_ord", - "NumType_sets", - "NumType_dual", - 
"NumValue_one", - "NumValue_two", - "NumValue_three", - "PartForm_pres", - "PartForm_past", - "PartForm_agt", - "PartForm_neg", - "PartType_mod", - "PartType_emp", - "PartType_res", - "PartType_inf", - "PartType_vbp", - "Person_one", - "Person_two", - "Person_three", - "Person_none", - "Person_abs_one", - "Person_abs_two", - "Person_abs_three", - "Person_dat_one", - "Person_dat_two", - "Person_dat_three", - "Person_erg_one", - "Person_erg_two", - "Person_erg_three", - "Person_psor_one", - "Person_psor_two", - "Person_psor_three", - "Polarity_neg", - "Polarity_pos", - "Polite_inf", - "Polite_pol", - "Polite_abs_inf", - "Polite_abs_pol", - "Polite_erg_inf", - "Polite_erg_pol", - "Polite_dat_inf", - "Polite_dat_pol", - "Poss_yes", - "Prefix_yes", - "PrepCase_npr", - "PrepCase_pre", - "PronType_advPart", - "PronType_art", - "PronType_default", - "PronType_dem", - "PronType_ind", - "PronType_int", - "PronType_neg", - "PronType_prs", - "PronType_rcp", - "PronType_rel", - "PronType_tot", - "PronType_clit", - "PronType_exc", - "PunctSide_ini", - "PunctSide_fin", - "PunctType_peri", - "PunctType_qest", - "PunctType_excl", - "PunctType_quot", - "PunctType_brck", - "PunctType_comm", - "PunctType_colo", - "PunctType_semi", - "PunctType_dash", - "Reflex_yes", - "Style_arch", - "Style_rare", - "Style_poet", - "Style_norm", - "Style_coll", - "Style_vrnc", - "Style_sing", - "Style_expr", - "Style_derg", - "Style_vulg", - "Style_yes", - "StyleVariant_styleShort", - "StyleVariant_styleBound", - "Tense_fut", - "Tense_imp", - "Tense_past", - "Tense_pres", - "Typo_yes", - "VerbForm_fin", - "VerbForm_ger", - "VerbForm_inf", - "VerbForm_none", - "VerbForm_part", - "VerbForm_partFut", - "VerbForm_partPast", - "VerbForm_partPres", - "VerbForm_sup", - "VerbForm_trans", - "VerbForm_conv", - "VerbForm_gdv", - "VerbType_aux", - "VerbType_cop", - "VerbType_mod", - "VerbType_light", - "Voice_act", - "Voice_cau", - "Voice_pass", - "Voice_mid", - "Voice_int", -] - -FEATURE_NAMES = {get_string_id(f): f for f in FEATURES} -FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES} +cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil: + cdef int n_results = 0 + cdef int i + for i in range(morph.length): + if morph.fields[i] == field: + results[n_results] = morph.features[i] + n_results += 1 + return n_results diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 3925a6738..e71fb917f 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - IDS = { "": NO_TAG, diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 2f30fbbee..6a90de81c 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,10 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals - from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer +from .pipes import SentenceRecognizer from .morphologizer import Morphologizer from .entityruler import EntityRuler +from .tok2vec import Tok2Vec from .hooks import SentenceSegmenter, SimilarityHook from .functions import merge_entities, merge_noun_chunks, merge_subtokens @@ -15,11 +14,13 @@ __all__ = [ "EntityLinker", "TextCategorizer", "Tensorizer", + "Tok2Vec", "Pipe", "Morphologizer", "EntityRuler", "Sentencizer", "SentenceSegmenter", + "SentenceRecognizer", "SimilarityHook", "merge_entities", "merge_noun_chunks", diff --git 
a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 1c8429049..e211acb44 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -1,14 +1,10 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from collections import defaultdict, OrderedDict
+from collections import defaultdict

 import srsly

 from ..language import component
 from ..errors import Errors
-from ..compat import basestring_
 from ..util import ensure_path, to_disk, from_disk
-from ..tokens import Span
+from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher

 DEFAULT_ENT_ID_SEP = "||"
@@ -162,6 +158,7 @@ class EntityRuler(object):
     @property
     def patterns(self):
         """Get all patterns that were added to the entity ruler.
+        RETURNS (list): The original patterns, one dictionary per pattern.

         DOCS: https://spacy.io/api/entityruler#patterns
@@ -194,6 +191,7 @@ class EntityRuler(object):

         DOCS: https://spacy.io/api/entityruler#add_patterns
         """
+        # disable the nlp components after this one in case they hadn't been initialized / deserialized yet
         try:
             current_index = self.nlp.pipe_names.index(self.name)
@@ -203,7 +201,31 @@ class EntityRuler(object):
         except ValueError:
             subsequent_pipes = []
         with self.nlp.disable_pipes(subsequent_pipes):
+            token_patterns = []
+            phrase_pattern_labels = []
+            phrase_pattern_texts = []
+            phrase_pattern_ids = []
+            for entry in patterns:
+                if isinstance(entry["pattern"], str):
+                    phrase_pattern_labels.append(entry["label"])
+                    phrase_pattern_texts.append(entry["pattern"])
+                    phrase_pattern_ids.append(entry.get("id"))
+                elif isinstance(entry["pattern"], list):
+                    token_patterns.append(entry)
+
+            phrase_patterns = []
+            for label, pattern, ent_id in zip(
+                phrase_pattern_labels,
+                self.nlp.pipe(phrase_pattern_texts),
+                phrase_pattern_ids,
+            ):
+                phrase_pattern = {"label": label, "pattern": pattern}
+                if ent_id:
+                    phrase_pattern["id"] = ent_id
+                phrase_patterns.append(phrase_pattern)
+
+            for entry in token_patterns + phrase_patterns:
                 label = entry["label"]
                 if "id" in entry:
                     ent_label = label
@@ -212,8 +234,8 @@ class EntityRuler(object):
                     self._ent_ids[key] = (ent_label, entry["id"])

                 pattern = entry["pattern"]
-                if isinstance(pattern, basestring_):
-                    self.phrase_patterns[label].append(self.nlp(pattern))
+                if isinstance(pattern, Doc):
+                    self.phrase_patterns[label].append(pattern)
                 elif isinstance(pattern, list):
                     self.token_patterns[label].append(pattern)
                 else:
@@ -226,6 +248,8 @@ class EntityRuler(object):
     def _split_label(self, label):
         """Split Entity label into ent_label and ent_id if it contains
         self.ent_id_sep
+        label (str): The value of label in a pattern entry
+        RETURNS (tuple): ent_label, ent_id
         """
         if self.ent_id_sep in label:
@@ -239,10 +263,13 @@ class EntityRuler(object):
     def _create_label(self, label, ent_id):
         """Join Entity label with ent_id if the pattern has an `id` attribute
+        label (str): The label to set for ent.label_
+        ent_id (str): The ent_id to append to the label
+
         RETURNS (str): The ent_label joined with configured `ent_id_sep`
         """
-        if isinstance(ent_id, basestring_):
-            label = "{}{}{}".format(label, self.ent_id_sep, ent_id)
+        if isinstance(ent_id, str):
+            label = f"{label}{self.ent_id_sep}{ent_id}"
         return label

     def from_bytes(self, patterns_bytes, **kwargs):
@@ -250,6 +277,7 @@
         patterns_bytes (bytes): The bytestring to load.
         **kwargs: Other config parameters, mostly for consistency.
+        RETURNS (EntityRuler): The loaded entity ruler.
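+
+        A round-trip sketch (hypothetical usage; assumes an existing `nlp`
+        object and a `ruler` whose patterns have already been added):
+
+            >>> ruler_bytes = ruler.to_bytes()
+            >>> restored = EntityRuler(nlp).from_bytes(ruler_bytes)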
        DOCS: https://spacy.io/api/entityruler#from_bytes
@@ -275,15 +303,12 @@ class EntityRuler(object):

         DOCS: https://spacy.io/api/entityruler#to_bytes
         """
-
-        serial = OrderedDict(
-            (
-                ("overwrite", self.overwrite),
-                ("ent_id_sep", self.ent_id_sep),
-                ("phrase_matcher_attr", self.phrase_matcher_attr),
-                ("patterns", self.patterns),
-            )
-        )
+        serial = {
+            "overwrite": self.overwrite,
+            "ent_id_sep": self.ent_id_sep,
+            "phrase_matcher_attr": self.phrase_matcher_attr,
+            "patterns": self.patterns,
+        }
         return srsly.msgpack_dumps(serial)

     def from_disk(self, path, **kwargs):
@@ -292,6 +317,7 @@ class EntityRuler(object):

         path (unicode / Path): The JSONL file to load.
         **kwargs: Other config parameters, mostly for consistency.
+        RETURNS (EntityRuler): The loaded entity ruler.

         DOCS: https://spacy.io/api/entityruler#from_disk
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 69e638da2..6e9d4197c 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..language import component
 from ..matcher import Matcher
 from ..util import filter_spans
diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py
index b61a34c0e..d48b04bd1 100644
--- a/spacy/pipeline/hooks.py
+++ b/spacy/pipeline/hooks.py
@@ -1,12 +1,8 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from thinc.t2v import Pooling, max_pool, mean_pool
-from thinc.neural._classes.difference import Siamese, CauchySimilarity
+from thinc.api import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity

 from .pipes import Pipe
 from ..language import component
-from .._ml import link_vectors_to_models
+from ..util import link_vectors_to_models


 @component("sentencizer_hook", assigns=["doc.user_hooks"])
@@ -66,7 +62,9 @@ class SimilarityHook(Pipe):

     @classmethod
     def Model(cls, length):
-        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
+        return siamese(
+            concatenate(reduce_max(), reduce_mean()), CauchySimilarity(length * 2)
+        )

     def __call__(self, doc):
         """Install similarity hook"""
@@ -83,7 +81,7 @@

     def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
         self.require_model()
-        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
+        sims, bp_sims = self.model.begin_update(doc1_doc2)

     def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
         """Allocate model, using width from tensorizer in pipeline.
@@ -92,7 +90,7 @@
         pipeline (list): The pipeline the model is part of.
         """
         if self.model is True:
-            self.model = self.Model(pipeline[0].model.nO)
+            self.model = self.Model(pipeline[0].model.get_dim("nO"))
             link_vectors_to_models(self.vocab)
         if sgd is None:
             sgd = self.create_optimizer()
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 72e31f120..999132b35 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,23 +1,22 @@
-from __future__ import unicode_literals
-from collections import OrderedDict, defaultdict
+from collections import defaultdict

 import numpy
 cimport numpy as np

-from thinc.api import chain
-from thinc.neural.util import to_categorical, copy_array, get_array_module
+from thinc.api import chain, list2array, to_categorical, get_array_module
+from thinc.util import copy_array
+
 from ..
import util from .pipes import Pipe from ..language import component -from .._ml import Tok2Vec, build_morphologizer_model -from .._ml import link_vectors_to_models, zero_init, flatten -from .._ml import create_default_optimizer +from ..util import link_vectors_to_models, create_default_optimizer from ..errors import Errors, TempErrors -from ..compat import basestring_ from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology +from ..ml.component_models import build_morphologizer_model + @component("morphologizer", assigns=["token.morph", "token.pos"]) class Morphologizer(Pipe): @@ -32,7 +31,7 @@ class Morphologizer(Pipe): def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model - self.cfg = OrderedDict(sorted(cfg.items())) + self.cfg = dict(sorted(cfg.items())) self.cfg.setdefault('cnn_maxout_pieces', 2) self._class_map = self.vocab.morphology.create_class_map() @@ -45,7 +44,7 @@ class Morphologizer(Pipe): if self.model in (None, True, False): return None else: - return chain(self.model.tok2vec, flatten) + return chain(self.model.get_ref("tok2vec"), list2array()) def __call__(self, doc): features, tokvecs = self.predict([doc]) @@ -62,9 +61,9 @@ class Morphologizer(Pipe): def predict(self, docs): if not any(len(doc) for doc in docs): # Handle case where there are no tokens in any docs. - n_labels = self.model.nO - guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs] - tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO)) + n_labels = self.model.get_dim("nO") + guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] + tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO"))) return guesses, tokvecs tokvecs = self.model.tok2vec(docs) scores = self.model.softmax(tokvecs) @@ -79,7 +78,7 @@ class Morphologizer(Pipe): for field in self._class_map.fields] for i, doc in enumerate(docs): doc_scores = batch_scores[i] - doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes) + doc_guesses = scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"]) # Convert the neuron indices into feature IDs. doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i') for j in range(len(doc)): @@ -97,21 +96,22 @@ class Morphologizer(Pipe): if doc[j].morph.pos != 0: doc.c[j].pos = doc[j].morph.pos - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., sgd=None, losses=None): if losses is not None and self.name not in losses: losses[self.name] = 0. + docs = [self._get_doc(ex) for ex in examples] tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) - loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) + loss, d_tag_scores = self.get_loss(examples, tag_scores) bp_tag_scores(d_tag_scores, sgd=sgd) if losses is not None: losses[self.name] += loss - def get_loss(self, docs, golds, scores): + def get_loss(self, examples, scores): guesses = [] for doc_scores in scores: - guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes)) + guesses.append(scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"])) guesses = self.model.ops.xp.vstack(guesses) scores = self.model.ops.xp.vstack(scores) if not isinstance(scores, numpy.ndarray): @@ -121,8 +121,10 @@ class Morphologizer(Pipe): cdef int idx = 0 # Do this on CPU, as we can't vectorize easily. 
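+        # Build a dense target with the same shape as the scores. For each
+        # token, the gold features pick out one class per field below; where
+        # the gold morphology is missing (None), the predicted scores are
+        # copied over instead, so those tokens contribute no gradient.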
         target = numpy.zeros(scores.shape, dtype='f')
-        field_sizes = self.model.softmax.out_sizes
-        for doc, gold in zip(docs, golds):
+        field_sizes = self.model.get_ref("softmax").attrs["nOs"]
+        for example in examples:
+            doc = example.doc
+            gold = example.gold
             for t, features in enumerate(gold.morphology):
                 if features is None:
                     target[idx] = scores[idx]
@@ -146,6 +148,7 @@ class Morphologizer(Pipe):
         scores = self.model.ops.asarray(scores, dtype='f')
         d_scores = scores - target
         loss = (d_scores**2).sum()
+        docs = [self._get_doc(ex) for ex in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index b4fecf5cb..ad75d2e78 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1,19 +1,12 @@
 # cython: infer_types=True
 # cython: profile=True
-# coding: utf8
-from __future__ import unicode_literals
-
 import numpy
 import srsly
 import random
-from collections import OrderedDict
-from thinc.api import chain
-from thinc.v2v import Affine, Maxout, Softmax
-from thinc.misc import LayerNorm
-from thinc.neural.util import to_categorical
-from thinc.neural.util import get_array_module
+from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array
+from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module
+from thinc.api import set_dropout_rate

-from ..compat import basestring_
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
 from ..syntax.ner cimport BiluoPushDown
@@ -24,14 +17,16 @@ from ..vocab cimport Vocab
 from .functions import merge_subtokens
 from ..language import Language, component
 from ..syntax import nonproj
+from ..gold import Example
 from ..attrs import POS, ID
+from ..util import link_vectors_to_models, create_default_optimizer
 from ..parts_of_speech import X
 from ..kb import KnowledgeBase
-from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
-from .._ml import build_text_classifier, build_simple_cnn_text_classifier
-from .._ml import build_bow_text_classifier, build_nel_encoder
-from .._ml import link_vectors_to_models, zero_init, flatten
-from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
+from ..ml.component_models import Tok2Vec, build_tagger_model
+from ..ml.component_models import build_text_classifier
+from ..ml.component_models import build_simple_cnn_text_classifier
+from ..ml.component_models import build_bow_text_classifier, build_nel_encoder
+from ..ml.component_models import masked_language_model
 from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util
@@ -60,11 +55,17 @@ class Pipe(object):
     def from_nlp(cls, nlp, **cfg):
         return cls(nlp.vocab, **cfg)

+    def _get_doc(self, example):
+        """Use this method if the `example` can be either a Doc or an Example."""
+        if isinstance(example, Doc):
+            return example
+        return example.doc
+
     def __init__(self, vocab, model=True, **cfg):
         """Create a new pipe instance."""
         raise NotImplementedError

-    def __call__(self, doc):
+    def __call__(self, example):
         """Apply the pipe to one document. The document is
         modified in-place, and returned.
@@ -72,12 +73,16 @@
         and `set_annotations()` methods.
         """
         self.require_model()
+        doc = self._get_doc(example)
         predictions = self.predict([doc])
         if isinstance(predictions, tuple) and len(predictions) == 2:
             scores, tensors = predictions
             self.set_annotations([doc], scores, tensors=tensors)
         else:
             self.set_annotations([doc], predictions)
+        if isinstance(example, Example):
+            example.doc = doc
+            return example
         return doc

     def require_model(self):
@@ -85,21 +90,27 @@
         if getattr(self, "model", None) in (None, True, False):
             raise ValueError(Errors.E109.format(name=self.name))

-    def pipe(self, stream, batch_size=128, n_threads=-1):
+    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
         """Apply the pipe to a stream of documents.

         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
-        for docs in util.minibatch(stream, size=batch_size):
-            docs = list(docs)
+        for examples in util.minibatch(stream, size=batch_size):
+            docs = [self._get_doc(ex) for ex in examples]
             predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(predictions) == 2:
                 scores, tensors = predictions
                 self.set_annotations(docs, scores, tensors=tensors)
             else:
                 self.set_annotations(docs, predictions)
-            yield from docs
+
+            if as_example:
+                for ex, doc in zip(examples, docs):
+                    ex.doc = doc
+                    yield ex
+            else:
+                yield from docs

     def predict(self, docs):
         """Apply the pipeline's model to a batch of docs, without
@@ -112,20 +123,22 @@
         """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError

-    def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
+    def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
         """Learn from a batch of documents and gold-standard information,
         updating the pipe's model.

         Delegates to predict() and get_loss().
         """
+        if set_annotations:
+            docs = (self._get_doc(ex) for ex in examples)
+            docs = list(self.pipe(docs))
+
+    def rehearse(self, examples, sgd=None, losses=None, **config):
         pass

-    def rehearse(self, docs, sgd=None, losses=None, **config):
-        pass
-
-    def get_loss(self, docs, golds, scores):
+    def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of
-        documents and their predicted scores."""
+        examples (with embedded docs) and their predicted scores."""
         raise NotImplementedError

     def add_label(self, label):
@@ -138,10 +151,10 @@
         raise NotImplementedError

     def create_optimizer(self):
-        return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))
+        return create_default_optimizer()

     def begin_training(
-        self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs
+        self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
     ):
         """Initialize the pipe for training, using data examples if available.
         If no model has been initialized yet, the model is added."""
         if self.model is True:
             self.model = self.Model(**self.cfg)
         if hasattr(self, "vocab"):
             link_vectors_to_models(self.vocab)
+        self.model.initialize()
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd

+    def get_gradients(self):
+        """Get non-zero gradients of the model's parameters, as a dictionary
+        keyed by the parameter ID. The values are (weights, gradient) lists.
+ """ + gradients = {} + if self.model in (None, True, False): + return gradients + queue = [self.model] + seen = set() + for node in queue: + if node.id in seen: + continue + seen.add(node.id) + if hasattr(node, "_mem") and node._mem.gradient.any(): + gradients[node.id] = [node._mem.weights, node._mem.gradient] + if hasattr(node, "_layers"): + queue.extend(node._layers) + return gradients + def use_params(self, params): """Modify the pipe's model, to use the given parameter values.""" with self.model.use_params(params): @@ -164,7 +197,7 @@ class Pipe(object): exclude (list): String names of serialization fields to exclude. RETURNS (bytes): The serialized object. """ - serialize = OrderedDict() + serialize = {} serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) if self.model not in (True, False, None): serialize["model"] = self.model.to_bytes @@ -179,7 +212,7 @@ class Pipe(object): def load_model(b): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors.name + self.cfg["pretrained_vectors"] = self.vocab.vectors if self.model is True: self.model = self.Model(**self.cfg) try: @@ -187,7 +220,7 @@ class Pipe(object): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict() + deserialize = {} deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) if hasattr(self, "vocab"): deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) @@ -198,7 +231,7 @@ class Pipe(object): def to_disk(self, path, exclude=tuple(), **kwargs): """Serialize the pipe to disk.""" - serialize = OrderedDict() + serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): @@ -212,7 +245,7 @@ class Pipe(object): def load_model(p): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors.name + self.cfg["pretrained_vectors"] = self.vocab.vectors if self.model is True: self.model = self.Model(**self.cfg) try: @@ -220,7 +253,7 @@ class Pipe(object): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict() + deserialize = {} deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["model"] = load_model @@ -240,10 +273,10 @@ class Tensorizer(Pipe): width (int): Output size of the model. embed_size (int): Number of vectors in the embedding table. **cfg: Config parameters. - RETURNS (Model): A `thinc.neural.Model` or similar instance. + RETURNS (Model): A `thinc.model.Model` or similar instance. """ input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96)) - return zero_init(Affine(output_size, input_size, drop_factor=0.0)) + return Linear(output_size, input_size, init_W=zero_init) def __init__(self, vocab, model=True, **cfg): """Construct a new statistical model. Weights are not allocated on @@ -263,31 +296,40 @@ class Tensorizer(Pipe): self.model = model self.input_models = [] self.cfg = dict(cfg) - self.cfg.setdefault("cnn_maxout_pieces", 3) - def __call__(self, doc): + def __call__(self, example): """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM model. Vectors are set to the `Doc.tensor` attribute. docs (Doc or iterable): One or more documents to add vectors to. 
RETURNS (dict or None): Intermediate computations. """ + doc = self._get_doc(example) tokvecses = self.predict([doc]) self.set_annotations([doc], tokvecses) + if isinstance(example, Example): + example.doc = doc + return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1): + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): """Process `Doc` objects as a stream. - stream (iterator): A sequence of `Doc` objects to process. - batch_size (int): Number of `Doc` objects to group. - YIELDS (iterator): A sequence of `Doc` objects, in order of input. + stream (iterator): A sequence of `Doc` or `Example` objects to process. + batch_size (int): Number of `Doc` or `Example` objects to group. + YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input. """ - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) + for examples in util.minibatch(stream, size=batch_size): + docs = [self._get_doc(ex) for ex in examples] tensors = self.predict(docs) self.set_annotations(docs, tensors) - yield from docs + + if as_example: + for ex, doc in zip(examples, docs): + ex.doc = doc + yield ex + else: + yield from docs def predict(self, docs): """Return a single tensor for a batch of documents. @@ -311,7 +353,7 @@ class Tensorizer(Pipe): raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) doc.tensor = tensor - def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): + def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None): """Update the model. docs (iterable): A batch of `Doc` objects. @@ -321,38 +363,44 @@ class Tensorizer(Pipe): RETURNS (dict): Results from the update. """ self.require_model() - if isinstance(docs, Doc): - docs = [docs] + examples = Example.to_example_objects(examples) inputs = [] bp_inputs = [] + set_dropout_rate(self.model, drop) for tok2vec in self.input_models: - tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop) + set_dropout_rate(tok2vec, drop) + tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples]) inputs.append(tensor) bp_inputs.append(bp_tensor) inputs = self.model.ops.xp.hstack(inputs) - scores, bp_scores = self.model.begin_update(inputs, drop=drop) - loss, d_scores = self.get_loss(docs, golds, scores) + scores, bp_scores = self.model.begin_update(inputs) + loss, d_scores = self.get_loss(examples, scores) d_inputs = bp_scores(d_scores, sgd=sgd) d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1) for d_input, bp_input in zip(d_inputs, bp_inputs): - bp_input(d_input, sgd=sgd) + bp_input(d_input) + if sgd is not None: + for tok2vec in self.input_models: + tok2vec.finish_update(sgd) + self.model.finish_update(sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += loss return loss - def get_loss(self, docs, golds, prediction): - ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + def get_loss(self, examples, prediction): + examples = Example.to_example_objects(examples) + ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) target = self.vocab.vectors.data[ids] d_scores = (prediction - target) / prediction.shape[0] loss = (d_scores ** 2).sum() return loss, d_scores - def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): """Allocate models, pre-process training data and acquire an optimizer. 
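+
+        A setup sketch (hypothetical usage; assumes an `nlp` pipeline whose
+        earlier components expose the tok2vec layers this component listens
+        to):
+
+            >>> optimizer = tensorizer.begin_training(pipeline=nlp.pipeline)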
- gold_tuples (iterable): Gold-standard training data. + get_examples (iterable): Gold-standard training data. pipeline (list): The pipeline the model is part of. """ if pipeline is not None: @@ -361,6 +409,7 @@ class Tensorizer(Pipe): self.input_models.append(model.tok2vec) if self.model is True: self.model = self.Model(**self.cfg) + self.model.initialize() link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() @@ -378,8 +427,7 @@ class Tagger(Pipe): self.vocab = vocab self.model = model self._rehearsal_model = None - self.cfg = OrderedDict(sorted(cfg.items())) - self.cfg.setdefault("cnn_maxout_pieces", 2) + self.cfg = dict(sorted(cfg.items())) @property def labels(self): @@ -390,39 +438,56 @@ class Tagger(Pipe): if self.model in (None, True, False): return None else: - return chain(self.model.tok2vec, flatten) + return chain(self.model.get_ref("tok2vec"), list2array()) - def __call__(self, doc): - tags, tokvecs = self.predict([doc]) - self.set_annotations([doc], tags, tensors=tokvecs) + def __call__(self, example): + doc = self._get_doc(example) + tags = self.predict([doc]) + self.set_annotations([doc], tags) + if isinstance(example, Example): + example.doc = doc + return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) - tag_ids, tokvecs = self.predict(docs) - self.set_annotations(docs, tag_ids, tensors=tokvecs) - yield from docs + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + for examples in util.minibatch(stream, size=batch_size): + docs = [self._get_doc(ex) for ex in examples] + tag_ids = self.predict(docs) + assert len(docs) == len(examples) + assert len(tag_ids) == len(examples) + self.set_annotations(docs, tag_ids) + + if as_example: + for ex, doc in zip(examples, docs): + ex.doc = doc + yield ex + else: + yield from docs def predict(self, docs): self.require_model() if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
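+            # With no tokens anywhere in the batch, skip the model and
+            # return one empty (0, n_labels) score matrix per doc so that
+            # downstream shapes still line up.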
n_labels = len(self.labels) - guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs] - tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO)) - return guesses, tokvecs - tokvecs = self.model.tok2vec(docs) - scores = self.model.softmax(tokvecs) + guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] + assert len(guesses) == len(docs) + return guesses + scores = self.model.predict(docs) + assert len(scores) == len(docs), (len(scores), len(docs)) + guesses = self._scores2guesses(scores) + assert len(guesses) == len(docs) + return guesses + + def _scores2guesses(self, scores): guesses = [] for doc_scores in scores: doc_guesses = doc_scores.argmax(axis=1) if not isinstance(doc_guesses, numpy.ndarray): doc_guesses = doc_guesses.get() guesses.append(doc_guesses) - return guesses, tokvecs + return guesses - def set_annotations(self, docs, batch_tag_ids, tensors=None): + def set_annotations(self, docs, batch_tag_ids): if isinstance(docs, Doc): docs = [docs] cdef Doc doc @@ -445,55 +510,60 @@ class Tagger(Pipe): else: doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] idx += 1 - if tensors is not None and len(tensors): - if isinstance(doc.tensor, numpy.ndarray) \ - and not isinstance(tensors[i], numpy.ndarray): - doc.extend_tensor(tensors[i].get()) - else: - doc.extend_tensor(tensors[i]) doc.is_tagged = True - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): self.require_model() + examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. - if not any(len(doc) for doc in docs): + if not any(len(ex.doc) if ex.doc else 0 for ex in examples): # Handle cases where there are no tokens in any docs. return - - tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) - loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) - bp_tag_scores(d_tag_scores, sgd=sgd) + set_dropout_rate(self.model, drop) + tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) + loss, d_tag_scores = self.get_loss(examples, tag_scores) + bp_tag_scores(d_tag_scores) + if sgd not in (None, False): + self.model.finish_update(sgd) if losses is not None: losses[self.name] += loss + if set_annotations: + docs = [ex.doc for ex in examples] + self.set_annotations(docs, self._scores2guesses(tag_scores)) - def rehearse(self, docs, drop=0., sgd=None, losses=None): + def rehearse(self, examples, drop=0., sgd=None, losses=None): """Perform a 'rehearsal' update, where we try to match the output of an initial model. """ if self._rehearsal_model is None: return + examples = Example.to_example_objects(examples) + docs = [ex.doc for ex in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
return - guesses, backprop = self.model.begin_update(docs, drop=drop) - target = self._rehearsal_model(docs) + set_dropout_rate(self.model, drop) + guesses, backprop = self.model.begin_update(docs) + target = self._rehearsal_model(examples) gradient = guesses - target - backprop(gradient, sgd=sgd) + backprop(gradient) + self.model.finish_update(sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += (gradient**2).sum() - def get_loss(self, docs, golds, scores): + def get_loss(self, examples, scores): scores = self.model.ops.flatten(scores) tag_index = {tag: i for i, tag in enumerate(self.labels)} cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) known_labels = numpy.ones((scores.shape[0], 1), dtype="f") - for gold in golds: + for ex in examples: + gold = ex.gold for tag in gold.tags: if tag is None: correct[idx] = guesses[idx] @@ -504,27 +574,27 @@ class Tagger(Pipe): known_labels[idx] = 0. idx += 1 correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() + docs = [ex.doc for ex in examples] d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): user_warning(Warnings.W022) orig_tag_map = dict(self.vocab.morphology.tag_map) - new_tag_map = OrderedDict() - for raw_text, annots_brackets in get_gold_tuples(): - for annots, brackets in annots_brackets: - ids, words, tags, heads, deps, ents = annots - for tag in tags: - if tag in orig_tag_map: - new_tag_map[tag] = orig_tag_map[tag] - else: - new_tag_map[tag] = {POS: X} + new_tag_map = {} + for example in get_examples(): + for tag in example.token_annotation.tags: + if tag in orig_tag_map: + new_tag_map[tag] = orig_tag_map[tag] + else: + new_tag_map[tag] = {POS: X} + cdef Vocab vocab = self.vocab if new_tag_map: vocab.morphology = Morphology(vocab.strings, new_tag_map, @@ -536,19 +606,42 @@ class Tagger(Pipe): if hp in kwargs: self.cfg[hp] = kwargs[hp] self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + # Get batch of example docs, example outputs to call begin_training(). + # This lets the model infer shapes. + n_tags = self.vocab.morphology.n_tags + for node in self.model.walk(): + # TODO: softmax hack ? 
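+            # The softmax output dimension comes from the tag map rather
+            # than from the data, so it can't be inferred automatically;
+            # set nO here so that model.initialize() below can work out
+            # the remaining shapes.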
+ if node.name == "softmax" and node.has_dim("nO") is None: + node.set_dim("nO", n_tags) link_vectors_to_models(self.vocab) + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd @classmethod - def Model(cls, n_tags, **cfg): + def Model(cls, n_tags=None, **cfg): if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"): raise ValueError(TempErrors.T008) - return build_tagger_model(n_tags, **cfg) + if "tok2vec" in cfg: + tok2vec = cfg["tok2vec"] + else: + config = { + "width": cfg.get("token_vector_width", 96), + "embed_size": cfg.get("embed_size", 2000), + "pretrained_vectors": cfg.get("pretrained_vectors", None), + "window_size": cfg.get("window_size", 1), + "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3), + "subword_features": cfg.get("subword_features", True), + "char_embed": cfg.get("char_embed", False), + "conv_depth": cfg.get("conv_depth", 4), + "bilstm_depth": cfg.get("bilstm_depth", 0), + } + tok2vec = Tok2Vec(**config) + return build_tagger_model(n_tags, tok2vec) def add_label(self, label, values=None): - if not isinstance(label, basestring_): + if not isinstance(label, str): raise ValueError(Errors.E187) if label in self.labels: return 0 @@ -578,12 +671,12 @@ class Tagger(Pipe): yield def to_bytes(self, exclude=tuple(), **kwargs): - serialize = OrderedDict() + serialize = {} if self.model not in (None, True, False): serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) + tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) return util.to_bytes(serialize, exclude) @@ -592,12 +685,12 @@ class Tagger(Pipe): def load_model(b): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors.name + self.cfg["pretrained_vectors"] = self.vocab.vectors if self.model is True: token_vector_width = util.env_opt( "token_vector_width", self.cfg.get("token_vector_width", 96)) - self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + self.model = self.Model(**self.cfg) try: self.model.from_bytes(b) except AttributeError: @@ -610,24 +703,24 @@ class Tagger(Pipe): lemmatizer=self.vocab.morphology.lemmatizer, exc=self.vocab.morphology.exc) - deserialize = OrderedDict(( - ("vocab", lambda b: self.vocab.from_bytes(b)), - ("tag_map", load_tag_map), - ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), - ("model", lambda b: load_model(b)), - )) + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "tag_map": load_tag_map, + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: load_model(b), + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self def to_disk(self, path, exclude=tuple(), **kwargs): - tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) - serialize = OrderedDict(( - ("vocab", lambda p: self.vocab.to_disk(p)), - ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)), - ("model", lambda p: p.open("wb").write(self.model.to_bytes())), - ("cfg", lambda p: srsly.write_json(p, self.cfg)) - )) + tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) + serialize = { + "vocab": lambda p: 
self.vocab.to_disk(p), + "tag_map": lambda p: srsly.write_msgpack(p, tag_map), + "model": lambda p: p.open("wb").write(self.model.to_bytes()), + "cfg": lambda p: srsly.write_json(p, self.cfg) + } exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) @@ -635,9 +728,9 @@ class Tagger(Pipe): def load_model(p): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors.name + self.cfg["pretrained_vectors"] = self.vocab.vectors if self.model is True: - self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + self.model = self.Model(**self.cfg) with p.open("rb") as file_: try: self.model.from_bytes(file_.read()) @@ -651,12 +744,178 @@ class Tagger(Pipe): lemmatizer=self.vocab.morphology.lemmatizer, exc=self.vocab.morphology.exc) - deserialize = OrderedDict(( - ("cfg", lambda p: self.cfg.update(_load_cfg(p))), - ("vocab", lambda p: self.vocab.from_disk(p)), - ("tag_map", load_tag_map), - ("model", load_model), - )) + deserialize = { + "cfg": lambda p: self.cfg.update(_load_cfg(p)), + "vocab": lambda p: self.vocab.from_disk(p), + "tag_map": load_tag_map, + "model": load_model, + } + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_disk(path, deserialize, exclude) + return self + + +@component("sentrec", assigns=["token.is_sent_start"]) +class SentenceRecognizer(Tagger): + """Pipeline component for sentence segmentation. + + DOCS: https://spacy.io/api/sentencerecognizer + """ + + def __init__(self, vocab, model=True, **cfg): + self.vocab = vocab + self.model = model + self._rehearsal_model = None + self.cfg = dict(sorted(cfg.items())) + self.cfg.setdefault("cnn_maxout_pieces", 2) + self.cfg.setdefault("subword_features", True) + self.cfg.setdefault("token_vector_width", 12) + self.cfg.setdefault("conv_depth", 1) + self.cfg.setdefault("pretrained_vectors", None) + + @property + def labels(self): + # labels are numbered by index internally, so this matches GoldParse + # and Example where the sentence-initial tag is 1 and other positions + # are 0 + return tuple(["I", "S"]) + + def set_annotations(self, docs, batch_tag_ids, **_): + if isinstance(docs, Doc): + docs = [docs] + cdef Doc doc + for i, doc in enumerate(docs): + doc_tag_ids = batch_tag_ids[i] + if hasattr(doc_tag_ids, "get"): + doc_tag_ids = doc_tag_ids.get() + for j, tag_id in enumerate(doc_tag_ids): + # Don't clobber existing sentence boundaries + if doc.c[j].sent_start == 0: + if tag_id == 1: + doc.c[j].sent_start = 1 + else: + doc.c[j].sent_start = -1 + + def update(self, examples, drop=0., sgd=None, losses=None): + self.require_model() + examples = Example.to_example_objects(examples) + if losses is not None and self.name not in losses: + losses[self.name] = 0. + + if not any(len(ex.doc) if ex.doc else 0 for ex in examples): + # Handle cases where there are no tokens in any docs. 
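+ # The early return leaves losses[self.name] at the value initialized above.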
+ return + set_dropout_rate(self.model, drop) + tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) + loss, d_tag_scores = self.get_loss(examples, tag_scores) + bp_tag_scores(d_tag_scores) + if sgd is not None: + self.model.finish_update(sgd) + + if losses is not None: + losses[self.name] += loss + + def get_loss(self, examples, scores): + scores = self.model.ops.flatten(scores) + tag_index = range(len(self.labels)) + cdef int idx = 0 + correct = numpy.zeros((scores.shape[0],), dtype="i") + guesses = scores.argmax(axis=1) + known_labels = numpy.ones((scores.shape[0], 1), dtype="f") + for ex in examples: + gold = ex.gold + for sent_start in gold.sent_starts: + if sent_start is None: + correct[idx] = guesses[idx] + elif sent_start in tag_index: + correct[idx] = sent_start + else: + correct[idx] = 0 + known_labels[idx] = 0. + idx += 1 + correct = self.model.ops.xp.array(correct, dtype="i") + d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) + d_scores *= self.model.ops.asarray(known_labels) + loss = (d_scores**2).sum() + docs = [ex.doc for ex in examples] + d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + return float(loss), d_scores + + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, + **kwargs): + cdef Vocab vocab = self.vocab + if self.model is True: + for hp in ["token_vector_width", "conv_depth"]: + if hp in kwargs: + self.cfg[hp] = kwargs[hp] + self.model = self.Model(len(self.labels), **self.cfg) + if sgd is None: + sgd = self.create_optimizer() + self.model.initialize() + return sgd + + @classmethod + def Model(cls, n_tags, **cfg): + return build_tagger_model(n_tags, **cfg) + + def add_label(self, label, values=None): + raise NotImplementedError + + def use_params(self, params): + with self.model.use_params(params): + yield + + def to_bytes(self, exclude=tuple(), **kwargs): + serialize = {} + if self.model not in (None, True, False): + serialize["model"] = self.model.to_bytes + serialize["vocab"] = self.vocab.to_bytes + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def load_model(b): + if self.model is True: + self.model = self.Model(len(self.labels), **self.cfg) + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: load_model(b), + } + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + serialize = { + "vocab": lambda p: self.vocab.to_disk(p), + "model": lambda p: p.open("wb").write(self.model.to_bytes()), + "cfg": lambda p: srsly.write_json(p, self.cfg) + } + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple(), **kwargs): + def load_model(p): + if self.model is True: + self.model = self.Model(len(self.labels), **self.cfg) + with p.open("rb") as file_: + try: + self.model.from_bytes(file_.read()) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = { + "cfg": lambda p: self.cfg.update(_load_cfg(p)), + "vocab": lambda p: self.vocab.from_disk(p), + "model": 
load_model, + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -701,20 +960,20 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids, tensors=None): pass - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None, + def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, sgd=None, **kwargs): - gold_tuples = nonproj.preprocess_training_data(get_gold_tuples()) - for raw_text, annots_brackets in gold_tuples: - for annots, brackets in annots_brackets: - ids, words, tags, heads, deps, ents = annots - for i in range(len(ids)): - label = self.make_label(i, words, tags, heads, deps, ents) - if label is not None and label not in self.labels: - self.labels[label] = len(self.labels) + gold_examples = nonproj.preprocess_training_data(get_examples()) + # for raw_text, doc_annot in gold_tuples: + for example in gold_examples: + for i in range(len(example.token_annotation.ids)): + label = self.make_label(i, example.token_annotation) + if label is not None and label not in self.labels: + self.labels[label] = len(self.labels) if self.model is True: token_vector_width = util.env_opt("token_vector_width") self.model = self.Model(len(self.labels), tok2vec=tok2vec) link_vectors_to_models(self.vocab) + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd @@ -722,14 +981,12 @@ class MultitaskObjective(Tagger): @classmethod def Model(cls, n_tags, tok2vec=None, **cfg): token_vector_width = util.env_opt("token_vector_width", 96) - softmax = Softmax(n_tags, token_vector_width*2) model = chain( tok2vec, - LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)), - softmax + Maxout(nO=token_vector_width*2, nI=token_vector_width, nP=3, dropout=0.0), + LayerNorm(token_vector_width*2), + Softmax(nO=n_tags, nI=token_vector_width*2) ) - model.tok2vec = tok2vec - model.softmax = softmax return model def predict(self, docs): @@ -738,62 +995,61 @@ class MultitaskObjective(Tagger): scores = self.model.softmax(tokvecs) return tokvecs, scores - def get_loss(self, docs, golds, scores): - if len(docs) != len(golds): - raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs), - n_golds=len(golds))) + def get_loss(self, examples, scores): cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) + golds = [ex.gold for ex in examples] + docs = [ex.doc for ex in examples] for i, gold in enumerate(golds): for j in range(len(docs[i])): - # Handes alignment for tokenization differences - label = self.make_label(j, gold.words, gold.tags, - gold.heads, gold.labels, gold.ents) + # Handles alignment for tokenization differences + token_annotation = gold.get_token_annotation() + label = self.make_label(j, token_annotation) if label is None or label not in self.labels: correct[idx] = guesses[idx] else: correct[idx] = self.labels[label] idx += 1 correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) loss = (d_scores**2).sum() return float(loss), d_scores @staticmethod - def make_dep(i, words, tags, heads, deps, ents): - if deps[i] is None or heads[i] is None: + def make_dep(i, token_annotation): + if token_annotation.deps[i] is None or token_annotation.heads[i] is None: return None - return deps[i] + return token_annotation.deps[i] @staticmethod - def
make_tag(i, words, tags, heads, deps, ents): - return tags[i] + def make_tag(i, token_annotation): + return token_annotation.tags[i] @staticmethod - def make_ent(i, words, tags, heads, deps, ents): - if ents is None: + def make_ent(i, token_annotation): + if token_annotation.entities is None: return None - return ents[i] + return token_annotation.entities[i] @staticmethod - def make_dep_tag_offset(i, words, tags, heads, deps, ents): - if deps[i] is None or heads[i] is None: + def make_dep_tag_offset(i, token_annotation): + if token_annotation.deps[i] is None or token_annotation.heads[i] is None: return None - offset = heads[i] - i + offset = token_annotation.heads[i] - i offset = min(offset, 2) offset = max(offset, -2) - return "%s-%s:%d" % (deps[i], tags[i], offset) + return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}" @staticmethod - def make_ent_tag(i, words, tags, heads, deps, ents): - if ents is None or ents[i] is None: + def make_ent_tag(i, token_annotation): + if token_annotation.entities is None or token_annotation.entities[i] is None: return None else: - return "%s-%s" % (tags[i], ents[i]) + return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}" @staticmethod - def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}): + def make_sent_start(target, token_annotation, cache=True, _cache={}): """A multi-task objective for representing sentence boundaries, using BILU scheme. (O is impossible) @@ -802,6 +1058,8 @@ class MultitaskObjective(Tagger): of gold data. You can pass cache=False if you know the cache will do the wrong thing. """ + words = token_annotation.words + heads = token_annotation.heads assert len(words) == len(heads) assert target < len(words), (target, len(words)) if cache: @@ -843,29 +1101,29 @@ class ClozeMultitask(Pipe): def Model(cls, vocab, tok2vec, **cfg): output_size = vocab.vectors.data.shape[1] output_layer = chain( - LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)), - zero_init(Affine(output_size, output_size, drop_factor=0.0)) + Maxout(nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0), + Linear(nO=output_size, nI=output_size, init_W=zero_init) ) model = chain(tok2vec, output_layer) model = masked_language_model(vocab, model) - model.tok2vec = tok2vec - model.output_layer = output_layer return model def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model self.cfg = cfg + self.distance = CosineDistance(ignore_zeros=True, normalize=False) def set_annotations(self, docs, dep_ids, tensors=None): pass - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, + def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, sgd=None, **kwargs): link_vectors_to_models(self.vocab) if self.model is True: self.model = self.Model(self.vocab, tok2vec) - X = self.model.ops.allocate((5, self.model.tok2vec.nO)) + X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) + self.model.initialize() self.model.output_layer.begin_training(X) if sgd is None: sgd = self.create_optimizer() @@ -877,26 +1135,31 @@ class ClozeMultitask(Pipe): vectors = self.model.output_layer(tokvecs) return tokvecs, vectors - def get_loss(self, docs, vectors, prediction): + def get_loss(self, examples, vectors, prediction): # The simplest way to implement this would be to vstack the # token.vector values, but that's a bit inefficient, especially on GPU. 
# Instead we fetch the index into the vectors table for each of our tokens, # and look them up all at once. This prevents data copying. - ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) target = vectors[ids] - loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True) - return float(loss), gradient + gradient = self.distance.get_grad(prediction, target) + loss = self.distance.get_loss(prediction, target) + return loss, gradient - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): pass - def rehearse(self, docs, drop=0., sgd=None, losses=None): + def rehearse(self, examples, drop=0., sgd=None, losses=None): self.require_model() + examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. - predictions, bp_predictions = self.model.begin_update(docs, drop=drop) - loss, d_predictions = self.get_loss(docs, self.vocab.vectors.data, predictions) - bp_predictions(d_predictions, sgd=sgd) + set_dropout_rate(self.model, drop) + predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples]) + loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) + bp_predictions(d_predictions) + if sgd is not None: + self.model.finish_update(sgd) if losses is not None: losses[self.name] += loss @@ -910,19 +1173,45 @@ class TextCategorizer(Pipe): """ @classmethod - def Model(cls, nr_class=1, **cfg): - embed_size = util.env_opt("embed_size", 2000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] + def Model(cls, nr_class=1, exclusive_classes=None, **cfg): + if nr_class == 1: + exclusive_classes = False + if exclusive_classes is None: + raise ValueError( + "TextCategorizer Model must specify 'exclusive_classes'. " + "This setting determines whether the model will output " + "scores that sum to 1 for each example. If only one class " + "is true for each example, you should set exclusive_classes=True. " + "For 'multi_label' classification, set exclusive_classes=False." 
+ ) + if "embed_size" not in cfg: + cfg["embed_size"] = util.env_opt("embed_size", 2000) + if "token_vector_width" not in cfg: + cfg["token_vector_width"] = util.env_opt("token_vector_width", 96) + if cfg.get("architecture") == "bow": + return build_bow_text_classifier(nr_class, exclusive_classes, **cfg) else: - token_vector_width = util.env_opt("token_vector_width", 96) - if cfg.get("architecture") == "simple_cnn": - tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg) - return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg) - elif cfg.get("architecture") == "bow": - return build_bow_text_classifier(nr_class, **cfg) - else: - return build_text_classifier(nr_class, **cfg) + if "tok2vec" in cfg: + tok2vec = cfg["tok2vec"] + else: + config = { + "width": cfg.get("token_vector_width", 96), + "embed_size": cfg.get("embed_size", 2000), + "pretrained_vectors": cfg.get("pretrained_vectors", None), + "window_size": cfg.get("window_size", 1), + "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3), + "subword_features": cfg.get("subword_features", True), + "char_embed": cfg.get("char_embed", False), + "conv_depth": cfg.get("conv_depth", 4), + "bilstm_depth": cfg.get("bilstm_depth", 0), + } + tok2vec = Tok2Vec(**config) + return build_simple_cnn_text_classifier( + tok2vec, + nr_class, + exclusive_classes, + **cfg + ) @property def tok2vec(self): @@ -936,6 +1225,8 @@ class TextCategorizer(Pipe): self.model = model self._rehearsal_model = None self.cfg = dict(cfg) + if "exclusive_classes" not in cfg: + self.cfg["exclusive_classes"] = True @property def labels(self): @@ -950,12 +1241,18 @@ class TextCategorizer(Pipe): def labels(self, value): self.cfg["labels"] = tuple(value) - def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + for examples in util.minibatch(stream, size=batch_size): + docs = [self._get_doc(ex) for ex in examples] scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) - yield from docs + + if as_example: + for ex, doc in zip(examples, docs): + ex.doc = doc + yield ex + else: + yield from docs def predict(self, docs): self.require_model() @@ -967,7 +1264,7 @@ class TextCategorizer(Pipe): scores = xp.zeros((len(docs), len(self.labels))) return scores, tensors - scores = self.model(docs) + scores = self.model.predict(docs) scores = self.model.ops.asarray(scores) return scores, tensors @@ -976,33 +1273,46 @@ class TextCategorizer(Pipe): for j, label in enumerate(self.labels): doc.cats[label] = float(scores[i, j]) - def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): + def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): self.require_model() - if not any(len(doc) for doc in docs): + examples = Example.to_example_objects(examples) + if not any(len(ex.doc) if ex.doc else 0 for ex in examples): # Handle cases where there are no tokens in any docs. 
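+ # The early return performs no weight update and leaves losses unchanged.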
return - scores, bp_scores = self.model.begin_update(docs, drop=drop) - loss, d_scores = self.get_loss(docs, golds, scores) - bp_scores(d_scores, sgd=sgd) + set_dropout_rate(self.model, drop) + scores, bp_scores = self.model.begin_update([ex.doc for ex in examples]) + loss, d_scores = self.get_loss(examples, scores) + bp_scores(d_scores) + if sgd is not None: + self.model.finish_update(sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += loss + if set_annotations: + docs = [ex.doc for ex in examples] + self.set_annotations(docs, scores=scores) - def rehearse(self, docs, drop=0., sgd=None, losses=None): + def rehearse(self, examples, drop=0., sgd=None, losses=None): if self._rehearsal_model is None: return + examples = Example.to_example_objects(examples) + docs=[ex.doc for ex in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return - scores, bp_scores = self.model.begin_update(docs, drop=drop) - target = self._rehearsal_model(docs) + set_dropout_rate(self.model, drop) + scores, bp_scores = self.model.begin_update(docs) + target = self._rehearsal_model(examples) gradient = scores - target - bp_scores(gradient, sgd=sgd) + bp_scores(gradient) + if sgd is not None: + self.model.finish_update(sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += (gradient**2).sum() - def get_loss(self, docs, golds, scores): + def get_loss(self, examples, scores): + golds = [ex.gold for ex in examples] truths = numpy.zeros((len(golds), len(self.labels)), dtype="f") not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f") for i, gold in enumerate(golds): @@ -1019,7 +1329,7 @@ class TextCategorizer(Pipe): return float(mean_square_error), d_scores def add_label(self, label): - if not isinstance(label, basestring_): + if not isinstance(label, str): raise ValueError(Errors.E187) if label in self.labels: return 0 @@ -1030,25 +1340,27 @@ class TextCategorizer(Pipe): # - a huge problem. 
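# Until resizing is supported again, adding a label to an allocated model raises: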
raise ValueError(Errors.E116) # smaller = self.model._layers[-1] - # larger = Affine(len(self.labels)+1, smaller.nI) + # larger = Linear(len(self.labels)+1, smaller.nI) # copy_array(larger.W[:smaller.nO], smaller.W) # copy_array(larger.b[:smaller.nO], smaller.b) # self.model._layers[-1] = larger self.labels = tuple(list(self.labels) + [label]) return 1 - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): - for raw_text, annot_brackets in get_gold_tuples(): - for _, (cats, _2) in annot_brackets: - for cat in cats: - self.add_label(cat) + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): + for example in get_examples(): + for cat in example.doc_annotation.cats: + self.add_label(cat) if self.model is True: - self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") + self.cfg.update(kwargs) self.require_labels() self.model = self.Model(len(self.labels), **self.cfg) link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() + # TODO: use get_examples instead + docs = [Doc(Vocab(), words=["hello"])] + self.model.initialize(X=docs) return sgd @@ -1079,10 +1391,10 @@ cdef class DependencyParser(Parser): labeller = MultitaskObjective(self.vocab, target=target) self._multitasks.append(labeller) - def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg): + def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: tok2vec = self.model.tok2vec - labeller.begin_training(get_gold_tuples, pipeline=pipeline, + labeller.begin_training(get_examples, pipeline=pipeline, tok2vec=tok2vec, sgd=sgd) def __reduce__(self): @@ -1121,10 +1433,10 @@ cdef class EntityRecognizer(Parser): labeller = MultitaskObjective(self.vocab, target=target) self._multitasks.append(labeller) - def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg): + def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: tok2vec = self.model.tok2vec - labeller.begin_training(get_gold_tuples, pipeline=pipeline, + labeller.begin_training(get_examples, pipeline=pipeline, tok2vec=tok2vec) def __reduce__(self): @@ -1166,6 +1478,7 @@ class EntityLinker(Pipe): self.model = True self.kb = None self.cfg = dict(cfg) + self.distance = CosineDistance(normalize=False) def set_kb(self, kb): self.kb = kb @@ -1180,37 +1493,31 @@ class EntityLinker(Pipe): if getattr(self, "kb", None) in (None, True, False): raise ValueError(Errors.E139.format(name=self.name)) - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): self.require_kb() self.cfg["entity_width"] = self.kb.entity_vector_length - if self.model is True: self.model = self.Model(**self.cfg) - + self.model.initialize() if sgd is None: sgd = self.create_optimizer() - return sgd - def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): + def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None): self.require_model() self.require_kb() - if losses is not None: losses.setdefault(self.name, 0.0) - - if not docs or not golds: + if not examples: return 0 - - if len(docs) != len(golds): - raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs), - n_golds=len(golds))) - - if isinstance(docs, Doc): - docs = [docs] - golds = [golds] - + examples = 
Example.to_example_objects(examples) sentence_docs = [] + docs = [ex.doc for ex in examples] + if set_annotations: + # This seems simpler than other ways to get that exact output -- but + # it does run the model twice :( + predictions = self.model.predict(docs) + golds = [ex.gold for ex in examples] for doc, gold in zip(docs, golds): ents_by_offset = dict() @@ -1227,23 +1534,27 @@ class EntityLinker(Pipe): ent = ents_by_offset[(start, end)] for kb_id, value in kb_dict.items(): - # Currently only training on the positive instances + # Currently only training on the positive instances - we assume there is at least 1 per doc/gold if value: try: sentence_docs.append(ent.sent.as_doc()) except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly warning raise RuntimeError(Errors.E030) - - sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) - loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) - bp_context(d_scores, sgd=sgd) + set_dropout_rate(self.model, drop) + sentence_encodings, bp_context = self.model.begin_update(sentence_docs) + loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds) + bp_context(d_scores) + if sgd is not None: + self.model.finish_update(sgd) if losses is not None: losses[self.name] += loss + if set_annotations: + self.set_annotations(docs, predictions) return loss - def get_similarity_loss(self, docs, golds, scores): + def get_similarity_loss(self, golds, scores): entity_encodings = [] for gold in golds: for entity, kb_dict in gold.links.items(): @@ -1256,16 +1567,17 @@ class EntityLinker(Pipe): entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") if scores.shape != entity_encodings.shape: - raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up")) + raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up")) - loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings) + gradients = self.distance.get_grad(scores, entity_encodings) + loss = self.distance.get_loss(scores, entity_encodings) loss = loss / len(entity_encodings) return loss, gradients - def get_loss(self, docs, golds, scores): + def get_loss(self, examples, scores): cats = [] - for gold in golds: - for entity, kb_dict in gold.links.items(): + for ex in examples: + for entity, kb_dict in ex.gold.links.items(): for kb_id, value in kb_dict.items(): cats.append([value]) @@ -1278,17 +1590,27 @@ class EntityLinker(Pipe): loss = loss / len(cats) return loss, d_scores - def __call__(self, doc): + def __call__(self, example): + doc = self._get_doc(example) kb_ids, tensors = self.predict([doc]) self.set_annotations([doc], kb_ids, tensors=tensors) + if isinstance(example, Example): + example.doc = doc + return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + for examples in util.minibatch(stream, size=batch_size): + docs = [self._get_doc(ex) for ex in examples] kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) - yield from docs + + if as_example: + for ex, doc in zip(examples, docs): + ex.doc = doc + yield ex + else: + yield from docs def predict(self, docs): """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ @@ -1312,7 
+1634,7 @@ class EntityLinker(Pipe): for sent in doc.sents: sent_doc = sent.as_doc() # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] + sentence_encoding = self.model.predict([sent_doc])[0] xp = get_array_module(sentence_encoding) sentence_encoding_t = sentence_encoding.T sentence_norm = xp.linalg.norm(sentence_encoding_t) @@ -1388,7 +1710,7 @@ class EntityLinker(Pipe): token.ent_kb_id_ = kb_id def to_disk(self, path, exclude=tuple(), **kwargs): - serialize = OrderedDict() + serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) @@ -1411,7 +1733,7 @@ class EntityLinker(Pipe): kb.load_bulk(p) self.set_kb(kb) - deserialize = OrderedDict() + deserialize = {} deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["kb"] = load_kb @@ -1420,7 +1742,7 @@ class EntityLinker(Pipe): util.from_disk(path, deserialize, exclude) return self - def rehearse(self, docs, sgd=None, losses=None, **config): + def rehearse(self, examples, sgd=None, losses=None, **config): raise NotImplementedError def add_label(self, label): @@ -1428,7 +1750,7 @@ class EntityLinker(Pipe): @component("sentencizer", assigns=["token.is_sent_start", "doc.sents"]) -class Sentencizer(object): +class Sentencizer(Pipe): """Segment the Doc into sentences using a rule-based strategy. DOCS: https://spacy.io/api/sentencizer @@ -1463,24 +1785,48 @@ class Sentencizer(object): def from_nlp(cls, nlp, **cfg): return cls(**cfg) - def __call__(self, doc): + def __call__(self, example): """Apply the sentencizer to a Doc and set Token.is_sent_start. - doc (Doc): The document to process. - RETURNS (Doc): The processed Doc. + example (Doc or Example): The document to process. + RETURNS (Doc or Example): The processed Doc or Example. 
DOCS: https://spacy.io/api/sentencizer#call """ - tags = self.predict([doc]) - self.set_annotations([doc], tags) + doc = self._get_doc(example) + start = 0 + seen_period = False + for i, token in enumerate(doc): + is_in_punct_chars = token.text in self.punct_chars + token.is_sent_start = i == 0 + if seen_period and not token.is_punct and not is_in_punct_chars: + doc[start].is_sent_start = True + start = token.i + seen_period = False + elif is_in_punct_chars: + seen_period = True + if start < len(doc): + doc[start].is_sent_start = True + if isinstance(example, Example): + example.doc = doc + return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) - tag_ids = self.predict(docs) - self.set_annotations(docs, tag_ids) - yield from docs + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + for examples in util.minibatch(stream, size=batch_size): + docs = [self._get_doc(ex) for ex in examples] + predictions = self.predict(docs) + if isinstance(predictions, tuple) and len(predictions) == 2: + scores, tensors = predictions + self.set_annotations(docs, scores, tensors=tensors) + else: + self.set_annotations(docs, predictions) + if as_example: + for ex, doc in zip(examples, docs): + ex.doc = doc + yield ex + else: + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without @@ -1573,4 +1919,4 @@ Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg) -__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"] +__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py new file mode 100644 index 000000000..8290468cf --- /dev/null +++ b/spacy/pipeline/tok2vec.py @@ -0,0 +1,193 @@ +from thinc.api import Model, set_dropout_rate + +from .pipes import Pipe +from ..gold import Example +from ..tokens import Doc +from ..vocab import Vocab +from ..language import component +from ..util import link_vectors_to_models, minibatch, registry, eg2doc + + +@component("tok2vec", assigns=["doc.tensor"]) +class Tok2Vec(Pipe): + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(nlp.vocab, **cfg) + + @classmethod + def Model(cls, architecture, **cfg): + """Create a new statistical model for the class. + + architecture (str): The registered model architecture to use. + **cfg: Config parameters. + RETURNS (Model): A `thinc.model.Model` or similar instance. + """ + model = registry.architectures.get(architecture) + return model(**cfg) + + def __init__(self, vocab, model=True, **cfg): + """Construct a new statistical model. Weights are not allocated on + initialisation. + vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` + instance with the `Doc` objects it will process. + model (Model): A `Model` instance or `True` to allocate one later. + **cfg: Config parameters.
+ """ + self.vocab = vocab + self.model = model + self.cfg = dict(cfg) + self.listeners = [] + + def create_listener(self): + listener = Tok2VecListener( + upstream_name="tok2vec", width=self.model.get_dim("nO") + ) + self.listeners.append(listener) + + def add_listener(self, listener): + self.listeners.append(listener) + + def find_listeners(self, model): + for node in model.walk(): + if isinstance(node, Tok2VecListener) and node.upstream_name == self.name: + self.add_listener(node) + + def __call__(self, doc): + """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM + model. Vectors are set to the `Doc.tensor` attribute. + docs (Doc or iterable): One or more documents to add vectors to. + RETURNS (dict or None): Intermediate computations. + """ + tokvecses = self.predict([doc]) + self.set_annotations([doc], tokvecses) + return doc + + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + """Process `Doc` objects as a stream. + stream (iterator): A sequence of `Doc` objects to process. + batch_size (int): Number of `Doc` objects to group. + n_threads (int): Number of threads. + YIELDS (iterator): A sequence of `Doc` objects, in order of input. + """ + for batch in minibatch(stream, batch_size): + batch = list(batch) + if as_example: + docs = [eg2doc(doc) for doc in batch] + else: + docs = batch + tokvecses = self.predict(docs) + self.set_annotations(docs, tokvecses) + yield from batch + + def predict(self, docs): + """Return a single tensor for a batch of documents. + docs (iterable): A sequence of `Doc` objects. + RETURNS (object): Vector representations for each token in the documents. + """ + tokvecs = self.model.predict(docs) + batch_id = Tok2VecListener.get_batch_id(docs) + for listener in self.listeners: + listener.receive(batch_id, tokvecs, None) + return tokvecs + + def set_annotations(self, docs, tokvecses): + """Set the tensor attribute for a batch of documents. + docs (iterable): A sequence of `Doc` objects. + tokvecs (object): Vector representation for each token in the documents. + """ + for doc, tokvecs in zip(docs, tokvecses): + assert tokvecs.shape[0] == len(doc) + doc.tensor = tokvecs + + def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False): + """Update the model. + examples (iterable): A batch of examples + drop (float): The droput rate. + sgd (callable): An optimizer. + RETURNS (dict): Results from the update. + """ + if losses is None: + losses = {} + examples = Example.to_example_objects(examples) + docs = [eg.doc for eg in examples] + if isinstance(docs, Doc): + docs = [docs] + set_dropout_rate(self.model, drop) + tokvecs, bp_tokvecs = self.model.begin_update(docs) + + def capture_losses(d_tokvecs): + """Accumulate tok2vec loss before doing backprop.""" + l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs) + if self.name in losses: + losses[self.name] += l2_loss / len(d_tokvecs) + else: + losses[self.name] = l2_loss / len(d_tokvecs) + return bp_tokvecs(d_tokvecs) + + batch_id = Tok2VecListener.get_batch_id(docs) + for listener in self.listeners: + listener.receive(batch_id, tokvecs, capture_losses) + if sgd is not None: + self.model.finish_update(sgd) + if set_annotations: + self.set_annotations(docs, tokvecs) + + def get_loss(self, docs, golds, scores): + pass + + def begin_training( + self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs + ): + """Allocate models and pre-process training data + + get_examples (function): Function returning example training data. 
+ pipeline (list): The pipeline the model is part of. + """ + if self.model is True: + self.model = self.Model(**self.cfg) + # TODO: use examples instead ? + docs = [Doc(Vocab(), words=["hello"])] + self.model.initialize(X=docs) + link_vectors_to_models(self.vocab) + + +class Tok2VecListener(Model): + """A layer that gets fed its answers from an upstream connection, + for instance from a component earlier in the pipeline. + """ + + name = "tok2vec-listener" + + def __init__(self, upstream_name, width): + Model.__init__(self, name=self.name, forward=forward, dims={"nO": width}) + self.upstream_name = upstream_name + self._batch_id = None + self._outputs = None + self._backprop = None + + @classmethod + def get_batch_id(cls, inputs): + return sum(sum(token.orth for token in doc) for doc in inputs) + + def receive(self, batch_id, outputs, backprop): + self._batch_id = batch_id + self._outputs = outputs + self._backprop = backprop + + def verify_inputs(self, inputs): + if self._batch_id is None and self._outputs is None: + raise ValueError + else: + batch_id = self.get_batch_id(inputs) + if batch_id != self._batch_id: + raise ValueError(f"Mismatched IDs! {batch_id} vs {self._batch_id}") + else: + return True + + +def forward(model: Tok2VecListener, inputs, is_train): + if is_train: + model.verify_inputs(inputs) + return model._outputs, model._backprop + else: + return [doc.tensor for doc in inputs], lambda dX: [] diff --git a/spacy/schemas.py b/spacy/schemas.py new file mode 100644 index 000000000..2268bf100 --- /dev/null +++ b/spacy/schemas.py @@ -0,0 +1,189 @@ +from typing import Dict, List, Union, Optional +from enum import Enum +from pydantic import BaseModel, Field, ValidationError, validator +from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool +from collections import defaultdict + +from .attrs import NAMES + + +def validate(schema, obj): + """Validate data against a given pydantic schema. + + obj (dict): JSON-serializable data to validate. + schema (pydantic.BaseModel): The schema to validate against. + RETURNS (list): A list of error messages, if available. + """ + try: + schema(**obj) + return [] + except ValidationError as e: + errors = e.errors() + data = defaultdict(list) + for error in errors: + err_loc = " -> ".join([str(p) for p in error.get("loc", [])]) + data[err_loc].append(error.get("msg")) + return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] + + +# Matcher token patterns + + +def validate_token_pattern(obj): + # Try to convert non-string keys (e.g. 
{ORTH: "foo"} -> {"ORTH": "foo"}) + get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k + if isinstance(obj, list): + converted = [] + for pattern in obj: + if isinstance(pattern, dict): + pattern = {get_key(k): v for k, v in pattern.items()} + converted.append(pattern) + obj = converted + return validate(TokenPatternSchema, {"pattern": obj}) + + +class TokenPatternString(BaseModel): + REGEX: Optional[StrictStr] + IN: Optional[List[StrictStr]] + NOT_IN: Optional[List[StrictStr]] + + class Config: + extra = "forbid" + + @validator("*", pre=True, whole=True) + def raise_for_none(cls, v): + if v is None: + raise ValueError("None / null is not allowed") + return v + + +class TokenPatternNumber(BaseModel): + REGEX: Optional[StrictStr] = None + IN: Optional[List[StrictInt]] = None + NOT_IN: Optional[List[StrictInt]] = None + EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==") + GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=") + LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=") + GT: Union[StrictInt, StrictFloat] = Field(None, alias=">") + LT: Union[StrictInt, StrictFloat] = Field(None, alias="<") + + class Config: + extra = "forbid" + + @validator("*", pre=True, whole=True) + def raise_for_none(cls, v): + if v is None: + raise ValueError("None / null is not allowed") + return v + + +class TokenPatternOperator(str, Enum): + plus: StrictStr = "+" + start: StrictStr = "*" + question: StrictStr = "?" + exclamation: StrictStr = "!" + + +StringValue = Union[TokenPatternString, StrictStr] +NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] +UnderscoreValue = Union[ + TokenPatternString, TokenPatternNumber, str, int, float, list, bool, +] + + +class TokenPattern(BaseModel): + orth: Optional[StringValue] = None + text: Optional[StringValue] = None + lower: Optional[StringValue] = None + pos: Optional[StringValue] = None + tag: Optional[StringValue] = None + dep: Optional[StringValue] = None + lemma: Optional[StringValue] = None + shape: Optional[StringValue] = None + ent_type: Optional[StringValue] = None + norm: Optional[StringValue] = None + length: Optional[NumberValue] = None + spacy: Optional[StrictBool] = None + is_alpha: Optional[StrictBool] = None + is_ascii: Optional[StrictBool] = None + is_digit: Optional[StrictBool] = None + is_lower: Optional[StrictBool] = None + is_upper: Optional[StrictBool] = None + is_title: Optional[StrictBool] = None + is_punct: Optional[StrictBool] = None + is_space: Optional[StrictBool] = None + is_bracket: Optional[StrictBool] = None + is_quote: Optional[StrictBool] = None + is_left_punct: Optional[StrictBool] = None + is_right_punct: Optional[StrictBool] = None + is_currency: Optional[StrictBool] = None + is_stop: Optional[StrictBool] = None + is_sent_start: Optional[StrictBool] = None + like_num: Optional[StrictBool] = None + like_url: Optional[StrictBool] = None + like_email: Optional[StrictBool] = None + op: Optional[TokenPatternOperator] = None + underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_") + + class Config: + extra = "forbid" + allow_population_by_field_name = True + alias_generator = lambda value: value.upper() + + @validator("*", pre=True) + def raise_for_none(cls, v): + if v is None: + raise ValueError("None / null is not allowed") + return v + + +class TokenPatternSchema(BaseModel): + pattern: List[TokenPattern] = Field(..., minItems=1) + + class Config: + extra = "forbid" + + +# Model meta + + +class ModelMetaSchema(BaseModel): + # fmt: off + lang: 
StrictStr = Field(..., title="Two-letter language code, e.g. 'en'") + name: StrictStr = Field(..., title="Model name") + version: StrictStr = Field(..., title="Model version") + spacy_version: Optional[StrictStr] = Field(None, title="Compatible spaCy version identifier") + parent_package: Optional[StrictStr] = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly") + pipeline: Optional[List[StrictStr]] = Field([], title="Names of pipeline components") + description: Optional[StrictStr] = Field(None, title="Model description") + license: Optional[StrictStr] = Field(None, title="Model license") + author: Optional[StrictStr] = Field(None, title="Model author name") + email: Optional[StrictStr] = Field(None, title="Model author email") + url: Optional[StrictStr] = Field(None, title="Model author URL") + sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources") + vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors") + accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers") + speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers") + # fmt: on + + +# Training data object in "simple training style" + + +class SimpleTrainingSchema(BaseModel): + # TODO: write + + class Config: + title = "Schema for training data dict in passed to nlp.update" + extra = "forbid" + + +# JSON training format + + +class TrainingSchema(BaseModel): + # TODO: write + + class Config: + title = "Schema for training data in spaCy's JSON format" + extra = "forbid" diff --git a/spacy/scorer.py b/spacy/scorer.py index 7b05b11fd..82b10a77d 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,9 +1,6 @@ -# coding: utf8 -from __future__ import division, print_function, unicode_literals - import numpy as np -from .gold import tags_to_entities, GoldParse +from .gold import tags_to_entities, GoldParse, DocAnnotation from .errors import Errors @@ -84,6 +81,7 @@ class Scorer(object): self.labelled = PRFScore() self.labelled_per_dep = dict() self.tags = PRFScore() + self.sent_starts = PRFScore() self.ner = PRFScore() self.ner_per_ents = dict() self.eval_punct = eval_punct @@ -113,6 +111,27 @@ class Scorer(object): """ return self.tags.fscore * 100 + @property + def sent_p(self): + """RETURNS (float): F-score for identification of sentence starts. + i.e. `Token.is_sent_start`). + """ + return self.sent_starts.precision * 100 + + @property + def sent_r(self): + """RETURNS (float): F-score for identification of sentence starts. + i.e. `Token.is_sent_start`). + """ + return self.sent_starts.recall * 100 + + @property + def sent_f(self): + """RETURNS (float): F-score for identification of sentence starts. + i.e. `Token.is_sent_start`). + """ + return self.sent_starts.fscore * 100 + @property def token_acc(self): """RETURNS (float): Tokenization accuracy.""" @@ -212,16 +231,18 @@ class Scorer(object): "ents_f": self.ents_f, "ents_per_type": self.ents_per_type, "tags_acc": self.tags_acc, + "sent_p": self.sent_p, + "sent_r": self.sent_r, + "sent_f": self.sent_f, "token_acc": self.token_acc, "textcat_score": self.textcat_score, "textcats_per_cat": self.textcats_per_cat, } - def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")): + def score(self, example, verbose=False, punct_labels=("p", "punct")): """Update the evaluation scores from a single Doc / GoldParse pair. - doc (Doc): The predicted annotations. - gold (GoldParse): The correct annotations. 
+ example (Example): The predicted annotations + correct annotations. verbose (bool): Print debugging information. punct_labels (tuple): Dependency labels for punctuation. Used to evaluate dependency attachments to punctuation if `eval_punct` is @@ -229,16 +250,28 @@ class Scorer(object): DOCS: https://spacy.io/api/scorer#score """ + if isinstance(example, tuple) and len(example) == 2: + doc, gold = example + else: + gold = example.gold + doc = example.doc + if len(doc) != len(gold): - gold = GoldParse.from_annot_tuples( - doc, tuple(zip(*gold.orig_annot)) + (gold.cats,) - ) + doc_annotation = DocAnnotation(cats=gold.cats) + token_annotation = gold.orig + gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation) + orig = gold.orig gold_deps = set() gold_deps_per_dep = {} gold_tags = set() - gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot])) - for id_, word, tag, head, dep, ner in gold.orig_annot: + gold_sent_starts = set() + gold_ents = set(tags_to_entities(orig.entities)) + for id_, tag, head, dep, sent_start in zip( + orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts + ): gold_tags.add((id_, tag)) + if sent_start: + gold_sent_starts.add(id_) if dep not in (None, "") and dep.lower() not in punct_labels: gold_deps.add((id_, head, dep.lower())) if dep.lower() not in self.labelled_per_dep: @@ -249,6 +282,7 @@ class Scorer(object): cand_deps = set() cand_deps_per_dep = {} cand_tags = set() + cand_sent_starts = set() for token in doc: if token.orth_.isspace(): continue @@ -258,6 +292,8 @@ class Scorer(object): else: self.tokens.tp += 1 cand_tags.add((gold_i, token.tag_)) + if token.is_sent_start: + cand_sent_starts.add(gold_i) if token.dep_.lower() not in punct_labels and token.orth_.strip(): gold_head = gold.cand_to_gold[token.head.i] # None is indistinct, so we can't just add it to the set @@ -274,7 +310,7 @@ class Scorer(object): cand_deps_per_dep[token.dep_.lower()].add( (gold_i, gold_head, token.dep_.lower()) ) - if "-" not in [token[-1] for token in gold.orig_annot]: + if "-" not in [token[-1] for token in orig.entities]: # Find all NER labels in gold and doc ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents]) # Set up all labels for per type scoring and prepare gold per type @@ -304,6 +340,7 @@ class Scorer(object): # Score for all ents self.ner.score_set(cand_ents, gold_ents) self.tags.score_set(cand_tags, gold_tags) + self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) self.labelled.score_set(cand_deps, gold_deps) for dep in self.labelled_per_dep: self.labelled_per_dep[dep].score_set( @@ -340,7 +377,7 @@ class Scorer(object): Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) ) if verbose: - gold_words = [item[1] for item in gold.orig_annot] + gold_words = orig.words for w_id, h_id, dep in cand_deps - gold_deps: print("F", gold_words[w_id], dep, gold_words[h_id]) for w_id, h_id, dep in gold_deps - cand_deps: diff --git a/spacy/strings.pyx b/spacy/strings.pyx index f3457e1a5..0605de96c 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,7 +1,4 @@ # cython: infer_types=True -# coding: utf8 -from __future__ import unicode_literals, absolute_import - cimport cython from libc.string cimport memcpy from libcpp.set cimport set @@ -9,7 +6,6 @@ from libc.stdint cimport uint32_t from murmurhash.mrmr cimport hash64, hash32 import srsly -from .compat import basestring_ from .symbols import IDS as SYMBOLS_BY_STR from .symbols import NAMES as SYMBOLS_BY_INT from .typedefs cimport hash_t 
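# --- Editorial note (not part of the patch): the next hunk drops the last
# `basestring_` compat check from get_string_id(), which hashes str keys to
# 64-bit IDs and passes non-string keys through unchanged. A minimal sketch
# of that behaviour, assuming only the public StringStore API:
from spacy.strings import StringStore, get_string_id

store = StringStore(["coffee"])
coffee_id = get_string_id("coffee")            # str -> 64-bit hash ID
assert store[coffee_id] == "coffee"            # the ID resolves back to the string
assert get_string_id(coffee_id) == coffee_id   # non-str keys pass through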
@@ -24,7 +20,7 @@ def get_string_id(key): This function optimises for convenience over performance, so shouldn't be used in tight loops. """ - if not isinstance(key, basestring_): + if not isinstance(key, str): return key elif key in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[key] @@ -150,7 +146,7 @@ cdef class StringStore: return key else: return self[key] - + def add(self, string): """Add a string to the StringStore. diff --git a/spacy/structs.pxd b/spacy/structs.pxd index b3878db3f..259fd657d 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -82,52 +82,11 @@ cdef struct TokenC: cdef struct MorphAnalysisC: - univ_pos_t pos + hash_t key int length - - attr_t abbr - attr_t adp_type - attr_t adv_type - attr_t animacy - attr_t aspect - attr_t case - attr_t conj_type - attr_t connegative - attr_t definite - attr_t degree - attr_t derivation - attr_t echo - attr_t foreign - attr_t gender - attr_t hyph - attr_t inf_form - attr_t mood - attr_t negative - attr_t number - attr_t name_type - attr_t noun_type - attr_t num_form - attr_t num_type - attr_t num_value - attr_t part_form - attr_t part_type - attr_t person - attr_t polite - attr_t polarity - attr_t poss - attr_t prefix - attr_t prep_case - attr_t pron_type - attr_t punct_side - attr_t punct_type - attr_t reflex - attr_t style - attr_t style_variant - attr_t tense - attr_t typo - attr_t verb_form - attr_t voice - attr_t verb_type + attr_t* fields + attr_t* features + # Internal struct, for storage and disambiguation of entities. cdef struct KBEntryC: diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index b6391af11..ad645afcf 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -108,282 +108,282 @@ cdef enum symbol_t: EOL SPACE - Animacy_anim - Animacy_inan - Animacy_hum # U20 - Animacy_nhum - Aspect_freq - Aspect_imp - Aspect_mod - Aspect_none - Aspect_perf - Aspect_iter # U20 - Aspect_hab # U20 - Case_abe - Case_abl - Case_abs - Case_acc - Case_ade - Case_all - Case_cau - Case_com - Case_cmp # U20 - Case_dat - Case_del - Case_dis - Case_ela - Case_equ # U20 - Case_ess - Case_gen - Case_ill - Case_ine - Case_ins - Case_loc - Case_lat - Case_nom - Case_par - Case_sub - Case_sup - Case_tem - Case_ter - Case_tra - Case_voc - Definite_two - Definite_def - Definite_red - Definite_cons # U20 - Definite_ind - Definite_spec # U20 - Degree_cmp - Degree_comp - Degree_none - Degree_pos - Degree_sup - Degree_abs - Degree_com - Degree_dim # du - Degree_equ # U20 - Evident_nfh # U20 - Gender_com - Gender_fem - Gender_masc - Gender_neut - Mood_cnd - Mood_imp - Mood_ind - Mood_n - Mood_pot - Mood_sub - Mood_opt - Mood_prp # U20 - Mood_adm # U20 - Negative_neg - Negative_pos - Negative_yes - Polarity_neg # U20 - Polarity_pos # U20 - Number_com - Number_dual - Number_none - Number_plur - Number_sing - Number_ptan # bg - Number_count # bg, U20 - Number_tri # U20 - NumType_card - NumType_dist - NumType_frac - NumType_gen - NumType_mult - NumType_none - NumType_ord - NumType_sets - Person_one - Person_two - Person_three - Person_none - Poss_yes - PronType_advPart - PronType_art - PronType_default - PronType_dem - PronType_ind - PronType_int - PronType_neg - PronType_prs - PronType_rcp - PronType_rel - PronType_tot - PronType_clit - PronType_exc # es, ca, it, fa, U20 - PronType_emp # U20 - Reflex_yes - Tense_fut - Tense_imp - Tense_past - Tense_pres - VerbForm_fin - VerbForm_ger - VerbForm_inf - VerbForm_none - VerbForm_part - VerbForm_partFut - VerbForm_partPast - VerbForm_partPres - VerbForm_sup - VerbForm_trans - VerbForm_conv # U20 - VerbForm_gdv # 
la - VerbForm_vnoun # U20 - Voice_act - Voice_cau - Voice_pass - Voice_mid # gkc, U20 - Voice_int # hb - Voice_antip # U20 - Voice_dir # U20 - Voice_inv # U20 - Abbr_yes # cz, fi, sl, U - AdpType_prep # cz, U - AdpType_post # U - AdpType_voc # cz - AdpType_comprep # cz - AdpType_circ # U - AdvType_man - AdvType_loc - AdvType_tim - AdvType_deg - AdvType_cau - AdvType_mod - AdvType_sta - AdvType_ex - AdvType_adadj - ConjType_oper # cz, U - ConjType_comp # cz, U - Connegative_yes # fi - Derivation_minen # fi - Derivation_sti # fi - Derivation_inen # fi - Derivation_lainen # fi - Derivation_ja # fi - Derivation_ton # fi - Derivation_vs # fi - Derivation_ttain # fi - Derivation_ttaa # fi - Echo_rdp # U - Echo_ech # U - Foreign_foreign # cz, fi, U - Foreign_fscript # cz, fi, U - Foreign_tscript # cz, U - Foreign_yes # sl - Gender_dat_masc # bq, U - Gender_dat_fem # bq, U - Gender_erg_masc # bq - Gender_erg_fem # bq - Gender_psor_masc # cz, sl, U - Gender_psor_fem # cz, sl, U - Gender_psor_neut # sl - Hyph_yes # cz, U - InfForm_one # fi - InfForm_two # fi - InfForm_three # fi - NameType_geo # U, cz - NameType_prs # U, cz - NameType_giv # U, cz - NameType_sur # U, cz - NameType_nat # U, cz - NameType_com # U, cz - NameType_pro # U, cz - NameType_oth # U, cz - NounType_com # U - NounType_prop # U - NounType_class # U - Number_abs_sing # bq, U - Number_abs_plur # bq, U - Number_dat_sing # bq, U - Number_dat_plur # bq, U - Number_erg_sing # bq, U - Number_erg_plur # bq, U - Number_psee_sing # U - Number_psee_plur # U - Number_psor_sing # cz, fi, sl, U - Number_psor_plur # cz, fi, sl, U - Number_pauc # U20 - Number_grpa # U20 - Number_grpl # U20 - Number_inv # U20 - NumForm_digit # cz, sl, U - NumForm_roman # cz, sl, U - NumForm_word # cz, sl, U - NumValue_one # cz, U - NumValue_two # cz, U - NumValue_three # cz, U - PartForm_pres # fi - PartForm_past # fi - PartForm_agt # fi - PartForm_neg # fi - PartType_mod # U - PartType_emp # U - PartType_res # U - PartType_inf # U - PartType_vbp # U - Person_abs_one # bq, U - Person_abs_two # bq, U - Person_abs_three # bq, U - Person_dat_one # bq, U - Person_dat_two # bq, U - Person_dat_three # bq, U - Person_erg_one # bq, U - Person_erg_two # bq, U - Person_erg_three # bq, U - Person_psor_one # fi, U - Person_psor_two # fi, U - Person_psor_three # fi, U - Person_zero # U20 - Person_four # U20 - Polite_inf # bq, U - Polite_pol # bq, U - Polite_abs_inf # bq, U - Polite_abs_pol # bq, U - Polite_erg_inf # bq, U - Polite_erg_pol # bq, U - Polite_dat_inf # bq, U - Polite_dat_pol # bq, U - Polite_infm # U20 - Polite_form # U20 - Polite_form_elev # U20 - Polite_form_humb # U20 - Prefix_yes # U - PrepCase_npr # cz - PrepCase_pre # U - PunctSide_ini # U - PunctSide_fin # U - PunctType_peri # U - PunctType_qest # U - PunctType_excl # U - PunctType_quot # U - PunctType_brck # U - PunctType_comm # U - PunctType_colo # U - PunctType_semi # U - PunctType_dash # U - Style_arch # cz, fi, U - Style_rare # cz, fi, U - Style_poet # cz, U - Style_norm # cz, U - Style_coll # cz, U - Style_vrnc # cz, U - Style_sing # cz, U - Style_expr # cz, U - Style_derg # cz, U - Style_vulg # cz, U - Style_yes # fi, U - StyleVariant_styleShort # cz - StyleVariant_styleBound # cz, sl - VerbType_aux # U - VerbType_cop # U - VerbType_mod # U - VerbType_light # U + DEPRECATED001 + DEPRECATED002 + DEPRECATED003 + DEPRECATED004 + DEPRECATED005 + DEPRECATED006 + DEPRECATED007 + DEPRECATED008 + DEPRECATED009 + DEPRECATED010 + DEPRECATED011 + DEPRECATED012 + DEPRECATED013 + DEPRECATED014 + DEPRECATED015 + 
DEPRECATED016 + DEPRECATED017 + DEPRECATED018 + DEPRECATED019 + DEPRECATED020 + DEPRECATED021 + DEPRECATED022 + DEPRECATED023 + DEPRECATED024 + DEPRECATED025 + DEPRECATED026 + DEPRECATED027 + DEPRECATED028 + DEPRECATED029 + DEPRECATED030 + DEPRECATED031 + DEPRECATED032 + DEPRECATED033 + DEPRECATED034 + DEPRECATED035 + DEPRECATED036 + DEPRECATED037 + DEPRECATED038 + DEPRECATED039 + DEPRECATED040 + DEPRECATED041 + DEPRECATED042 + DEPRECATED043 + DEPRECATED044 + DEPRECATED045 + DEPRECATED046 + DEPRECATED047 + DEPRECATED048 + DEPRECATED049 + DEPRECATED050 + DEPRECATED051 + DEPRECATED052 + DEPRECATED053 + DEPRECATED054 + DEPRECATED055 + DEPRECATED056 + DEPRECATED057 + DEPRECATED058 + DEPRECATED059 + DEPRECATED060 + DEPRECATED061 + DEPRECATED062 + DEPRECATED063 + DEPRECATED064 + DEPRECATED065 + DEPRECATED066 + DEPRECATED067 + DEPRECATED068 + DEPRECATED069 + DEPRECATED070 + DEPRECATED071 + DEPRECATED072 + DEPRECATED073 + DEPRECATED074 + DEPRECATED075 + DEPRECATED076 + DEPRECATED077 + DEPRECATED078 + DEPRECATED079 + DEPRECATED080 + DEPRECATED081 + DEPRECATED082 + DEPRECATED083 + DEPRECATED084 + DEPRECATED085 + DEPRECATED086 + DEPRECATED087 + DEPRECATED088 + DEPRECATED089 + DEPRECATED090 + DEPRECATED091 + DEPRECATED092 + DEPRECATED093 + DEPRECATED094 + DEPRECATED095 + DEPRECATED096 + DEPRECATED097 + DEPRECATED098 + DEPRECATED099 + DEPRECATED100 + DEPRECATED101 + DEPRECATED102 + DEPRECATED103 + DEPRECATED104 + DEPRECATED105 + DEPRECATED106 + DEPRECATED107 + DEPRECATED108 + DEPRECATED109 + DEPRECATED110 + DEPRECATED111 + DEPRECATED112 + DEPRECATED113 + DEPRECATED114 + DEPRECATED115 + DEPRECATED116 + DEPRECATED117 + DEPRECATED118 + DEPRECATED119 + DEPRECATED120 + DEPRECATED121 + DEPRECATED122 + DEPRECATED123 + DEPRECATED124 + DEPRECATED125 + DEPRECATED126 + DEPRECATED127 + DEPRECATED128 + DEPRECATED129 + DEPRECATED130 + DEPRECATED131 + DEPRECATED132 + DEPRECATED133 + DEPRECATED134 + DEPRECATED135 + DEPRECATED136 + DEPRECATED137 + DEPRECATED138 + DEPRECATED139 + DEPRECATED140 + DEPRECATED141 + DEPRECATED142 + DEPRECATED143 + DEPRECATED144 + DEPRECATED145 + DEPRECATED146 + DEPRECATED147 + DEPRECATED148 + DEPRECATED149 + DEPRECATED150 + DEPRECATED151 + DEPRECATED152 + DEPRECATED153 + DEPRECATED154 + DEPRECATED155 + DEPRECATED156 + DEPRECATED157 + DEPRECATED158 + DEPRECATED159 + DEPRECATED160 + DEPRECATED161 + DEPRECATED162 + DEPRECATED163 + DEPRECATED164 + DEPRECATED165 + DEPRECATED166 + DEPRECATED167 + DEPRECATED168 + DEPRECATED169 + DEPRECATED170 + DEPRECATED171 + DEPRECATED172 + DEPRECATED173 + DEPRECATED174 + DEPRECATED175 + DEPRECATED176 + DEPRECATED177 + DEPRECATED178 + DEPRECATED179 + DEPRECATED180 + DEPRECATED181 + DEPRECATED182 + DEPRECATED183 + DEPRECATED184 + DEPRECATED185 + DEPRECATED186 + DEPRECATED187 + DEPRECATED188 + DEPRECATED189 + DEPRECATED190 + DEPRECATED191 + DEPRECATED192 + DEPRECATED193 + DEPRECATED194 + DEPRECATED195 + DEPRECATED196 + DEPRECATED197 + DEPRECATED198 + DEPRECATED199 + DEPRECATED200 + DEPRECATED201 + DEPRECATED202 + DEPRECATED203 + DEPRECATED204 + DEPRECATED205 + DEPRECATED206 + DEPRECATED207 + DEPRECATED208 + DEPRECATED209 + DEPRECATED210 + DEPRECATED211 + DEPRECATED212 + DEPRECATED213 + DEPRECATED214 + DEPRECATED215 + DEPRECATED216 + DEPRECATED217 + DEPRECATED218 + DEPRECATED219 + DEPRECATED220 + DEPRECATED221 + DEPRECATED222 + DEPRECATED223 + DEPRECATED224 + DEPRECATED225 + DEPRECATED226 + DEPRECATED227 + DEPRECATED228 + DEPRECATED229 + DEPRECATED230 + DEPRECATED231 + DEPRECATED232 + DEPRECATED233 + DEPRECATED234 + DEPRECATED235 + DEPRECATED236 + DEPRECATED237 + 
DEPRECATED238 + DEPRECATED239 + DEPRECATED240 + DEPRECATED241 + DEPRECATED242 + DEPRECATED243 + DEPRECATED244 + DEPRECATED245 + DEPRECATED246 + DEPRECATED247 + DEPRECATED248 + DEPRECATED249 + DEPRECATED250 + DEPRECATED251 + DEPRECATED252 + DEPRECATED253 + DEPRECATED254 + DEPRECATED255 + DEPRECATED256 + DEPRECATED257 + DEPRECATED258 + DEPRECATED259 + DEPRECATED260 + DEPRECATED261 + DEPRECATED262 + DEPRECATED263 + DEPRECATED264 + DEPRECATED265 + DEPRECATED266 + DEPRECATED267 + DEPRECATED268 + DEPRECATED269 + DEPRECATED270 + DEPRECATED271 + DEPRECATED272 + DEPRECATED273 + DEPRECATED274 + DEPRECATED275 + DEPRECATED276 PERSON NORP @@ -462,4 +462,5 @@ cdef enum symbol_t: acl ENT_KB_ID + MORPH ENT_ID diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index d82cf036d..8ed669dcd 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,8 +1,4 @@ -# coding: utf8 -#cython: optimize.unpack_method_calls=False -from __future__ import unicode_literals - - +# cython: optimize.unpack_method_calls=False IDS = { "": NIL, "IS_ALPHA": IS_ALPHA, @@ -115,282 +111,282 @@ IDS = { "EOL": EOL, "SPACE": SPACE, - "Animacy_anim": Animacy_anim, - "Animacy_inam": Animacy_inan, - "Animacy_hum": Animacy_hum, # U20 - "Animacy_nhum": Animacy_nhum, - "Aspect_freq": Aspect_freq, - "Aspect_imp": Aspect_imp, - "Aspect_mod": Aspect_mod, - "Aspect_none": Aspect_none, - "Aspect_perf": Aspect_perf, - "Aspect_iter": Aspect_iter, # U20 - "Aspect_hab": Aspect_hab, # U20 - "Case_abe": Case_abe, - "Case_abl": Case_abl, - "Case_abs": Case_abs, - "Case_acc": Case_acc, - "Case_ade": Case_ade, - "Case_all": Case_all, - "Case_cau": Case_cau, - "Case_com": Case_com, - "Case_cmp": Case_cmp, # U20 - "Case_dat": Case_dat, - "Case_del": Case_del, - "Case_dis": Case_dis, - "Case_ela": Case_ela, - "Case_equ": Case_equ, # U20 - "Case_ess": Case_ess, - "Case_gen": Case_gen, - "Case_ill": Case_ill, - "Case_ine": Case_ine, - "Case_ins": Case_ins, - "Case_loc": Case_loc, - "Case_lat": Case_lat, - "Case_nom": Case_nom, - "Case_par": Case_par, - "Case_sub": Case_sub, - "Case_sup": Case_sup, - "Case_tem": Case_tem, - "Case_ter": Case_ter, - "Case_tra": Case_tra, - "Case_voc": Case_voc, - "Definite_two": Definite_two, - "Definite_def": Definite_def, - "Definite_red": Definite_red, - "Definite_cons": Definite_cons, # U20 - "Definite_ind": Definite_ind, - "Definite_spec": Definite_spec, # U20 - "Degree_cmp": Degree_cmp, - "Degree_comp": Degree_comp, - "Degree_none": Degree_none, - "Degree_pos": Degree_pos, - "Degree_sup": Degree_sup, - "Degree_abs": Degree_abs, - "Degree_com": Degree_com, - "Degree_dim": Degree_dim, # du - "Degree_equ": Degree_equ, # U20 - "Evident_nfh": Evident_nfh, # U20 - "Gender_com": Gender_com, - "Gender_fem": Gender_fem, - "Gender_masc": Gender_masc, - "Gender_neut": Gender_neut, - "Mood_cnd": Mood_cnd, - "Mood_imp": Mood_imp, - "Mood_ind": Mood_ind, - "Mood_n": Mood_n, - "Mood_pot": Mood_pot, - "Mood_sub": Mood_sub, - "Mood_opt": Mood_opt, - "Mood_prp": Mood_prp, # U20 - "Mood_adm": Mood_adm, # U20 - "Negative_neg": Negative_neg, - "Negative_pos": Negative_pos, - "Negative_yes": Negative_yes, - "Polarity_neg": Polarity_neg, # U20 - "Polarity_pos": Polarity_pos, # U20 - "Number_com": Number_com, - "Number_dual": Number_dual, - "Number_none": Number_none, - "Number_plur": Number_plur, - "Number_sing": Number_sing, - "Number_ptan": Number_ptan, # bg - "Number_count": Number_count, # bg, U20 - "Number_tri": Number_tri, # U20 - "NumType_card": NumType_card, - "NumType_dist": NumType_dist, - "NumType_frac": NumType_frac, - "NumType_gen": 
NumType_gen, - "NumType_mult": NumType_mult, - "NumType_none": NumType_none, - "NumType_ord": NumType_ord, - "NumType_sets": NumType_sets, - "Person_one": Person_one, - "Person_two": Person_two, - "Person_three": Person_three, - "Person_none": Person_none, - "Poss_yes": Poss_yes, - "PronType_advPart": PronType_advPart, - "PronType_art": PronType_art, - "PronType_default": PronType_default, - "PronType_dem": PronType_dem, - "PronType_ind": PronType_ind, - "PronType_int": PronType_int, - "PronType_neg": PronType_neg, - "PronType_prs": PronType_prs, - "PronType_rcp": PronType_rcp, - "PronType_rel": PronType_rel, - "PronType_tot": PronType_tot, - "PronType_clit": PronType_clit, - "PronType_exc": PronType_exc, # es, ca, it, fa, U20 - "PronType_emp": PronType_emp, # U20 - "Reflex_yes": Reflex_yes, - "Tense_fut": Tense_fut, - "Tense_imp": Tense_imp, - "Tense_past": Tense_past, - "Tense_pres": Tense_pres, - "VerbForm_fin": VerbForm_fin, - "VerbForm_ger": VerbForm_ger, - "VerbForm_inf": VerbForm_inf, - "VerbForm_none": VerbForm_none, - "VerbForm_part": VerbForm_part, - "VerbForm_partFut": VerbForm_partFut, - "VerbForm_partPast": VerbForm_partPast, - "VerbForm_partPres": VerbForm_partPres, - "VerbForm_sup": VerbForm_sup, - "VerbForm_trans": VerbForm_trans, - "VerbForm_conv": VerbForm_conv, # U20 - "VerbForm_gdv": VerbForm_gdv, # la, - "VerbForm_vnoun": VerbForm_vnoun, # U20 - "Voice_act": Voice_act, - "Voice_cau": Voice_cau, - "Voice_pass": Voice_pass, - "Voice_mid": Voice_mid, # gkc, U20 - "Voice_int": Voice_int, # hb, - "Voice_antip": Voice_antip, # U20 - "Voice_dir": Voice_dir, # U20 - "Voice_inv": Voice_inv, # U20 - "Abbr_yes": Abbr_yes, # cz, fi, sl, U, - "AdpType_prep": AdpType_prep, # cz, U, - "AdpType_post": AdpType_post, # U, - "AdpType_voc": AdpType_voc, # cz, - "AdpType_comprep": AdpType_comprep, # cz, - "AdpType_circ": AdpType_circ, # U, - "AdvType_man": AdvType_man, - "AdvType_loc": AdvType_loc, - "AdvType_tim": AdvType_tim, - "AdvType_deg": AdvType_deg, - "AdvType_cau": AdvType_cau, - "AdvType_mod": AdvType_mod, - "AdvType_sta": AdvType_sta, - "AdvType_ex": AdvType_ex, - "AdvType_adadj": AdvType_adadj, - "ConjType_oper": ConjType_oper, # cz, U, - "ConjType_comp": ConjType_comp, # cz, U, - "Connegative_yes": Connegative_yes, # fi, - "Derivation_minen": Derivation_minen, # fi, - "Derivation_sti": Derivation_sti, # fi, - "Derivation_inen": Derivation_inen, # fi, - "Derivation_lainen": Derivation_lainen, # fi, - "Derivation_ja": Derivation_ja, # fi, - "Derivation_ton": Derivation_ton, # fi, - "Derivation_vs": Derivation_vs, # fi, - "Derivation_ttain": Derivation_ttain, # fi, - "Derivation_ttaa": Derivation_ttaa, # fi, - "Echo_rdp": Echo_rdp, # U, - "Echo_ech": Echo_ech, # U, - "Foreign_foreign": Foreign_foreign, # cz, fi, U, - "Foreign_fscript": Foreign_fscript, # cz, fi, U, - "Foreign_tscript": Foreign_tscript, # cz, U, - "Foreign_yes": Foreign_yes, # sl, - "Gender_dat_masc": Gender_dat_masc, # bq, U, - "Gender_dat_fem": Gender_dat_fem, # bq, U, - "Gender_erg_masc": Gender_erg_masc, # bq, - "Gender_erg_fem": Gender_erg_fem, # bq, - "Gender_psor_masc": Gender_psor_masc, # cz, sl, U, - "Gender_psor_fem": Gender_psor_fem, # cz, sl, U, - "Gender_psor_neut": Gender_psor_neut, # sl, - "Hyph_yes": Hyph_yes, # cz, U, - "InfForm_one": InfForm_one, # fi, - "InfForm_two": InfForm_two, # fi, - "InfForm_three": InfForm_three, # fi, - "NameType_geo": NameType_geo, # U, cz, - "NameType_prs": NameType_prs, # U, cz, - "NameType_giv": NameType_giv, # U, cz, - "NameType_sur": NameType_sur, # U, cz, - 
"NameType_nat": NameType_nat, # U, cz, - "NameType_com": NameType_com, # U, cz, - "NameType_pro": NameType_pro, # U, cz, - "NameType_oth": NameType_oth, # U, cz, - "NounType_com": NounType_com, # U, - "NounType_prop": NounType_prop, # U, - "NounType_class": NounType_class, # U, - "Number_abs_sing": Number_abs_sing, # bq, U, - "Number_abs_plur": Number_abs_plur, # bq, U, - "Number_dat_sing": Number_dat_sing, # bq, U, - "Number_dat_plur": Number_dat_plur, # bq, U, - "Number_erg_sing": Number_erg_sing, # bq, U, - "Number_erg_plur": Number_erg_plur, # bq, U, - "Number_psee_sing": Number_psee_sing, # U, - "Number_psee_plur": Number_psee_plur, # U, - "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U, - "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U, - "Number_pauc": Number_pauc, # U20 - "Number_grpa": Number_grpa, # U20 - "Number_grpl": Number_grpl, # U20 - "Number_inv": Number_inv, # U20 - "NumForm_digit": NumForm_digit, # cz, sl, U, - "NumForm_roman": NumForm_roman, # cz, sl, U, - "NumForm_word": NumForm_word, # cz, sl, U, - "NumValue_one": NumValue_one, # cz, U, - "NumValue_two": NumValue_two, # cz, U, - "NumValue_three": NumValue_three, # cz, U, - "PartForm_pres": PartForm_pres, # fi, - "PartForm_past": PartForm_past, # fi, - "PartForm_agt": PartForm_agt, # fi, - "PartForm_neg": PartForm_neg, # fi, - "PartType_mod": PartType_mod, # U, - "PartType_emp": PartType_emp, # U, - "PartType_res": PartType_res, # U, - "PartType_inf": PartType_inf, # U, - "PartType_vbp": PartType_vbp, # U, - "Person_abs_one": Person_abs_one, # bq, U, - "Person_abs_two": Person_abs_two, # bq, U, - "Person_abs_three": Person_abs_three, # bq, U, - "Person_dat_one": Person_dat_one, # bq, U, - "Person_dat_two": Person_dat_two, # bq, U, - "Person_dat_three": Person_dat_three, # bq, U, - "Person_erg_one": Person_erg_one, # bq, U, - "Person_erg_two": Person_erg_two, # bq, U, - "Person_erg_three": Person_erg_three, # bq, U, - "Person_psor_one": Person_psor_one, # fi, U, - "Person_psor_two": Person_psor_two, # fi, U, - "Person_psor_three": Person_psor_three, # fi, U, - "Person_zero": Person_zero, # U20 - "Person_four": Person_four, # U20 - "Polite_inf": Polite_inf, # bq, U, - "Polite_pol": Polite_pol, # bq, U, - "Polite_abs_inf": Polite_abs_inf, # bq, U, - "Polite_abs_pol": Polite_abs_pol, # bq, U, - "Polite_erg_inf": Polite_erg_inf, # bq, U, - "Polite_erg_pol": Polite_erg_pol, # bq, U, - "Polite_dat_inf": Polite_dat_inf, # bq, U, - "Polite_dat_pol": Polite_dat_pol, # bq, U, - "Polite_infm": Polite_infm, # U20 - "Polite_form": Polite_form, # U20 - "Polite_form_elev": Polite_form_elev, # U20 - "Polite_form_humb": Polite_form_humb, # U20 - "Prefix_yes": Prefix_yes, # U, - "PrepCase_npr": PrepCase_npr, # cz, - "PrepCase_pre": PrepCase_pre, # U, - "PunctSide_ini": PunctSide_ini, # U, - "PunctSide_fin": PunctSide_fin, # U, - "PunctType_peri": PunctType_peri, # U, - "PunctType_qest": PunctType_qest, # U, - "PunctType_excl": PunctType_excl, # U, - "PunctType_quot": PunctType_quot, # U, - "PunctType_brck": PunctType_brck, # U, - "PunctType_comm": PunctType_comm, # U, - "PunctType_colo": PunctType_colo, # U, - "PunctType_semi": PunctType_semi, # U, - "PunctType_dash": PunctType_dash, # U, - "Style_arch": Style_arch, # cz, fi, U, - "Style_rare": Style_rare, # cz, fi, U, - "Style_poet": Style_poet, # cz, U, - "Style_norm": Style_norm, # cz, U, - "Style_coll": Style_coll, # cz, U, - "Style_vrnc": Style_vrnc, # cz, U, - "Style_sing": Style_sing, # cz, U, - "Style_expr": Style_expr, # cz, U, - "Style_derg": Style_derg, # cz, U, - 
"Style_vulg": Style_vulg, # cz, U, - "Style_yes": Style_yes, # fi, U, - "StyleVariant_styleShort": StyleVariant_styleShort, # cz, - "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl, - "VerbType_aux": VerbType_aux, # U, - "VerbType_cop": VerbType_cop, # U, - "VerbType_mod": VerbType_mod, # U, - "VerbType_light": VerbType_light, # U, + "DEPRECATED001": DEPRECATED001, + "DEPRECATED002": DEPRECATED002, + "DEPRECATED003": DEPRECATED003, + "DEPRECATED004": DEPRECATED004, + "DEPRECATED005": DEPRECATED005, + "DEPRECATED006": DEPRECATED006, + "DEPRECATED007": DEPRECATED007, + "DEPRECATED008": DEPRECATED008, + "DEPRECATED009": DEPRECATED009, + "DEPRECATED010": DEPRECATED010, + "DEPRECATED011": DEPRECATED011, + "DEPRECATED012": DEPRECATED012, + "DEPRECATED013": DEPRECATED013, + "DEPRECATED014": DEPRECATED014, + "DEPRECATED015": DEPRECATED015, + "DEPRECATED016": DEPRECATED016, + "DEPRECATED017": DEPRECATED017, + "DEPRECATED018": DEPRECATED018, + "DEPRECATED019": DEPRECATED019, + "DEPRECATED020": DEPRECATED020, + "DEPRECATED021": DEPRECATED021, + "DEPRECATED022": DEPRECATED022, + "DEPRECATED023": DEPRECATED023, + "DEPRECATED024": DEPRECATED024, + "DEPRECATED025": DEPRECATED025, + "DEPRECATED026": DEPRECATED026, + "DEPRECATED027": DEPRECATED027, + "DEPRECATED028": DEPRECATED028, + "DEPRECATED029": DEPRECATED029, + "DEPRECATED030": DEPRECATED030, + "DEPRECATED031": DEPRECATED031, + "DEPRECATED032": DEPRECATED032, + "DEPRECATED033": DEPRECATED033, + "DEPRECATED034": DEPRECATED034, + "DEPRECATED035": DEPRECATED035, + "DEPRECATED036": DEPRECATED036, + "DEPRECATED037": DEPRECATED037, + "DEPRECATED038": DEPRECATED038, + "DEPRECATED039": DEPRECATED039, + "DEPRECATED040": DEPRECATED040, + "DEPRECATED041": DEPRECATED041, + "DEPRECATED042": DEPRECATED042, + "DEPRECATED043": DEPRECATED043, + "DEPRECATED044": DEPRECATED044, + "DEPRECATED045": DEPRECATED045, + "DEPRECATED046": DEPRECATED046, + "DEPRECATED047": DEPRECATED047, + "DEPRECATED048": DEPRECATED048, + "DEPRECATED049": DEPRECATED049, + "DEPRECATED050": DEPRECATED050, + "DEPRECATED051": DEPRECATED051, + "DEPRECATED052": DEPRECATED052, + "DEPRECATED053": DEPRECATED053, + "DEPRECATED054": DEPRECATED054, + "DEPRECATED055": DEPRECATED055, + "DEPRECATED056": DEPRECATED056, + "DEPRECATED057": DEPRECATED057, + "DEPRECATED058": DEPRECATED058, + "DEPRECATED059": DEPRECATED059, + "DEPRECATED060": DEPRECATED060, + "DEPRECATED061": DEPRECATED061, + "DEPRECATED062": DEPRECATED062, + "DEPRECATED063": DEPRECATED063, + "DEPRECATED064": DEPRECATED064, + "DEPRECATED065": DEPRECATED065, + "DEPRECATED066": DEPRECATED066, + "DEPRECATED067": DEPRECATED067, + "DEPRECATED068": DEPRECATED068, + "DEPRECATED069": DEPRECATED069, + "DEPRECATED070": DEPRECATED070, + "DEPRECATED071": DEPRECATED071, + "DEPRECATED072": DEPRECATED072, + "DEPRECATED073": DEPRECATED073, + "DEPRECATED074": DEPRECATED074, + "DEPRECATED075": DEPRECATED075, + "DEPRECATED076": DEPRECATED076, + "DEPRECATED077": DEPRECATED077, + "DEPRECATED078": DEPRECATED078, + "DEPRECATED079": DEPRECATED079, + "DEPRECATED080": DEPRECATED080, + "DEPRECATED081": DEPRECATED081, + "DEPRECATED082": DEPRECATED082, + "DEPRECATED083": DEPRECATED083, + "DEPRECATED084": DEPRECATED084, + "DEPRECATED085": DEPRECATED085, + "DEPRECATED086": DEPRECATED086, + "DEPRECATED087": DEPRECATED087, + "DEPRECATED088": DEPRECATED088, + "DEPRECATED089": DEPRECATED089, + "DEPRECATED090": DEPRECATED090, + "DEPRECATED091": DEPRECATED091, + "DEPRECATED092": DEPRECATED092, + "DEPRECATED093": DEPRECATED093, + "DEPRECATED094": DEPRECATED094, + 
"DEPRECATED095": DEPRECATED095, + "DEPRECATED096": DEPRECATED096, + "DEPRECATED097": DEPRECATED097, + "DEPRECATED098": DEPRECATED098, + "DEPRECATED099": DEPRECATED099, + "DEPRECATED100": DEPRECATED100, + "DEPRECATED101": DEPRECATED101, + "DEPRECATED102": DEPRECATED102, + "DEPRECATED103": DEPRECATED103, + "DEPRECATED104": DEPRECATED104, + "DEPRECATED105": DEPRECATED105, + "DEPRECATED106": DEPRECATED106, + "DEPRECATED107": DEPRECATED107, + "DEPRECATED108": DEPRECATED108, + "DEPRECATED109": DEPRECATED109, + "DEPRECATED110": DEPRECATED110, + "DEPRECATED111": DEPRECATED111, + "DEPRECATED112": DEPRECATED112, + "DEPRECATED113": DEPRECATED113, + "DEPRECATED114": DEPRECATED114, + "DEPRECATED115": DEPRECATED115, + "DEPRECATED116": DEPRECATED116, + "DEPRECATED117": DEPRECATED117, + "DEPRECATED118": DEPRECATED118, + "DEPRECATED119": DEPRECATED119, + "DEPRECATED120": DEPRECATED120, + "DEPRECATED121": DEPRECATED121, + "DEPRECATED122": DEPRECATED122, + "DEPRECATED123": DEPRECATED123, + "DEPRECATED124": DEPRECATED124, + "DEPRECATED125": DEPRECATED125, + "DEPRECATED126": DEPRECATED126, + "DEPRECATED127": DEPRECATED127, + "DEPRECATED128": DEPRECATED128, + "DEPRECATED129": DEPRECATED129, + "DEPRECATED130": DEPRECATED130, + "DEPRECATED131": DEPRECATED131, + "DEPRECATED132": DEPRECATED132, + "DEPRECATED133": DEPRECATED133, + "DEPRECATED134": DEPRECATED134, + "DEPRECATED135": DEPRECATED135, + "DEPRECATED136": DEPRECATED136, + "DEPRECATED137": DEPRECATED137, + "DEPRECATED138": DEPRECATED138, + "DEPRECATED139": DEPRECATED139, + "DEPRECATED140": DEPRECATED140, + "DEPRECATED141": DEPRECATED141, + "DEPRECATED142": DEPRECATED142, + "DEPRECATED143": DEPRECATED143, + "DEPRECATED144": DEPRECATED144, + "DEPRECATED145": DEPRECATED145, + "DEPRECATED146": DEPRECATED146, + "DEPRECATED147": DEPRECATED147, + "DEPRECATED148": DEPRECATED148, + "DEPRECATED149": DEPRECATED149, + "DEPRECATED150": DEPRECATED150, + "DEPRECATED151": DEPRECATED151, + "DEPRECATED152": DEPRECATED152, + "DEPRECATED153": DEPRECATED153, + "DEPRECATED154": DEPRECATED154, + "DEPRECATED155": DEPRECATED155, + "DEPRECATED156": DEPRECATED156, + "DEPRECATED157": DEPRECATED157, + "DEPRECATED158": DEPRECATED158, + "DEPRECATED159": DEPRECATED159, + "DEPRECATED160": DEPRECATED160, + "DEPRECATED161": DEPRECATED161, + "DEPRECATED162": DEPRECATED162, + "DEPRECATED163": DEPRECATED163, + "DEPRECATED164": DEPRECATED164, + "DEPRECATED165": DEPRECATED165, + "DEPRECATED166": DEPRECATED166, + "DEPRECATED167": DEPRECATED167, + "DEPRECATED168": DEPRECATED168, + "DEPRECATED169": DEPRECATED169, + "DEPRECATED170": DEPRECATED170, + "DEPRECATED171": DEPRECATED171, + "DEPRECATED172": DEPRECATED172, + "DEPRECATED173": DEPRECATED173, + "DEPRECATED174": DEPRECATED174, + "DEPRECATED175": DEPRECATED175, + "DEPRECATED176": DEPRECATED176, + "DEPRECATED177": DEPRECATED177, + "DEPRECATED178": DEPRECATED178, + "DEPRECATED179": DEPRECATED179, + "DEPRECATED180": DEPRECATED180, + "DEPRECATED181": DEPRECATED181, + "DEPRECATED182": DEPRECATED182, + "DEPRECATED183": DEPRECATED183, + "DEPRECATED184": DEPRECATED184, + "DEPRECATED185": DEPRECATED185, + "DEPRECATED186": DEPRECATED186, + "DEPRECATED187": DEPRECATED187, + "DEPRECATED188": DEPRECATED188, + "DEPRECATED189": DEPRECATED189, + "DEPRECATED190": DEPRECATED190, + "DEPRECATED191": DEPRECATED191, + "DEPRECATED192": DEPRECATED192, + "DEPRECATED193": DEPRECATED193, + "DEPRECATED194": DEPRECATED194, + "DEPRECATED195": DEPRECATED195, + "DEPRECATED196": DEPRECATED196, + "DEPRECATED197": DEPRECATED197, + "DEPRECATED198": DEPRECATED198, + "DEPRECATED199": 
DEPRECATED199, + "DEPRECATED200": DEPRECATED200, + "DEPRECATED201": DEPRECATED201, + "DEPRECATED202": DEPRECATED202, + "DEPRECATED203": DEPRECATED203, + "DEPRECATED204": DEPRECATED204, + "DEPRECATED205": DEPRECATED205, + "DEPRECATED206": DEPRECATED206, + "DEPRECATED207": DEPRECATED207, + "DEPRECATED208": DEPRECATED208, + "DEPRECATED209": DEPRECATED209, + "DEPRECATED210": DEPRECATED210, + "DEPRECATED211": DEPRECATED211, + "DEPRECATED212": DEPRECATED212, + "DEPRECATED213": DEPRECATED213, + "DEPRECATED214": DEPRECATED214, + "DEPRECATED215": DEPRECATED215, + "DEPRECATED216": DEPRECATED216, + "DEPRECATED217": DEPRECATED217, + "DEPRECATED218": DEPRECATED218, + "DEPRECATED219": DEPRECATED219, + "DEPRECATED220": DEPRECATED220, + "DEPRECATED221": DEPRECATED221, + "DEPRECATED222": DEPRECATED222, + "DEPRECATED223": DEPRECATED223, + "DEPRECATED224": DEPRECATED224, + "DEPRECATED225": DEPRECATED225, + "DEPRECATED226": DEPRECATED226, + "DEPRECATED227": DEPRECATED227, + "DEPRECATED228": DEPRECATED228, + "DEPRECATED229": DEPRECATED229, + "DEPRECATED230": DEPRECATED230, + "DEPRECATED231": DEPRECATED231, + "DEPRECATED232": DEPRECATED232, + "DEPRECATED233": DEPRECATED233, + "DEPRECATED234": DEPRECATED234, + "DEPRECATED235": DEPRECATED235, + "DEPRECATED236": DEPRECATED236, + "DEPRECATED237": DEPRECATED237, + "DEPRECATED238": DEPRECATED238, + "DEPRECATED239": DEPRECATED239, + "DEPRECATED240": DEPRECATED240, + "DEPRECATED241": DEPRECATED241, + "DEPRECATED242": DEPRECATED242, + "DEPRECATED243": DEPRECATED243, + "DEPRECATED244": DEPRECATED244, + "DEPRECATED245": DEPRECATED245, + "DEPRECATED246": DEPRECATED246, + "DEPRECATED247": DEPRECATED247, + "DEPRECATED248": DEPRECATED248, + "DEPRECATED249": DEPRECATED249, + "DEPRECATED250": DEPRECATED250, + "DEPRECATED251": DEPRECATED251, + "DEPRECATED252": DEPRECATED252, + "DEPRECATED253": DEPRECATED253, + "DEPRECATED254": DEPRECATED254, + "DEPRECATED255": DEPRECATED255, + "DEPRECATED256": DEPRECATED256, + "DEPRECATED257": DEPRECATED257, + "DEPRECATED258": DEPRECATED258, + "DEPRECATED259": DEPRECATED259, + "DEPRECATED260": DEPRECATED260, + "DEPRECATED261": DEPRECATED261, + "DEPRECATED262": DEPRECATED262, + "DEPRECATED263": DEPRECATED263, + "DEPRECATED264": DEPRECATED264, + "DEPRECATED265": DEPRECATED265, + "DEPRECATED266": DEPRECATED266, + "DEPRECATED267": DEPRECATED267, + "DEPRECATED268": DEPRECATED268, + "DEPRECATED269": DEPRECATED269, + "DEPRECATED270": DEPRECATED270, + "DEPRECATED271": DEPRECATED271, + "DEPRECATED272": DEPRECATED272, + "DEPRECATED273": DEPRECATED273, + "DEPRECATED274": DEPRECATED274, + "DEPRECATED275": DEPRECATED275, + "DEPRECATED276": DEPRECATED276, "PERSON": PERSON, "NORP": NORP, @@ -467,6 +463,7 @@ IDS = { "acl": acl, "LAW": LAW, + "MORPH": MORPH, } diff --git a/spacy/syntax/_beam_utils.pxd b/spacy/syntax/_beam_utils.pxd index 36b0c05da..cf99ac3d1 100644 --- a/spacy/syntax/_beam_utils.pxd +++ b/spacy/syntax/_beam_utils.pxd @@ -1,4 +1,4 @@ -from thinc.typedefs cimport class_t, hash_t +from ..typedefs cimport hash_t, class_t # These are passed as callbacks to thinc.search.Beam cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1 diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index b1085c762..32cf9193a 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -5,9 +5,9 @@ import numpy from cpython.ref cimport PyObject, Py_XDECREF from thinc.extra.search cimport Beam from thinc.extra.search import MaxViolation -from thinc.typedefs cimport hash_t, class_t from 
thinc.extra.search cimport MaxViolation +from ..typedefs cimport hash_t, class_t from .transition_system cimport TransitionSystem, Transition from ..gold cimport GoldParse from ..errors import Errors diff --git a/spacy/syntax/_parser_model.pxd b/spacy/syntax/_parser_model.pxd index 9c72f3415..15befb372 100644 --- a/spacy/syntax/_parser_model.pxd +++ b/spacy/syntax/_parser_model.pxd @@ -1,6 +1,6 @@ from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc -from thinc.typedefs cimport weight_t, class_t, hash_t +from ..typedefs cimport weight_t, class_t, hash_t from ._state cimport StateC diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 8b6448a46..442233f19 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -1,10 +1,6 @@ # cython: infer_types=True # cython: cdivision=True # cython: boundscheck=False -# coding: utf-8 -from __future__ import unicode_literals, print_function - -from collections import OrderedDict import numpy cimport cython.parallel import numpy.random @@ -14,18 +10,12 @@ from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t, class_t, hash_t from thinc.extra.search cimport Beam -from thinc.api import chain, clone -from thinc.v2v import Model, Maxout, Affine -from thinc.misc import LayerNorm -from thinc.neural.ops import CupyOps, NumpyOps -from thinc.neural.util import get_array_module -from thinc.linalg cimport Vec, VecVec +from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops +from thinc.backends.linalg cimport Vec, VecVec cimport blis.cy -from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten -from .._ml import link_vectors_to_models, create_default_optimizer +from ..typedefs cimport weight_t, class_t, hash_t from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse @@ -35,6 +25,7 @@ from .stateclass cimport StateClass from .transition_system cimport Transition from . import _beam_utils from . 
import nonproj +from ..util import link_vectors_to_models, create_default_optimizer cdef WeightsC get_c_weights(model) except *: @@ -48,8 +39,8 @@ cdef WeightsC get_c_weights(model) except *: output.hidden_weights = NULL output.hidden_bias = NULL else: - vec2scores_W = model.vec2scores.W - vec2scores_b = model.vec2scores.b + vec2scores_W = model.vec2scores.get_param("W") + vec2scores_b = model.vec2scores.get_param("b") output.hidden_weights = vec2scores_W.data output.hidden_bias = vec2scores_b.data cdef np.ndarray class_mask = model._class_mask @@ -61,12 +52,12 @@ cdef SizesC get_c_sizes(model, int batch_size) except *: cdef SizesC output output.states = batch_size if model.vec2scores is None: - output.classes = model.state2vec.nO + output.classes = model.state2vec.get_dim("nO") else: - output.classes = model.vec2scores.nO - output.hiddens = model.state2vec.nO - output.pieces = model.state2vec.nP - output.feats = model.state2vec.nF + output.classes = model.vec2scores.get_dim("nO") + output.hiddens = model.state2vec.get_dim("nO") + output.pieces = model.state2vec.get_dim("nP") + output.feats = model.state2vec.get_dim("nF") output.embed_width = model.tokvecs.shape[1] return output @@ -230,7 +221,7 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no class ParserModel(Model): def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None): - Model.__init__(self) + Model.__init__(self, name="parser_model", forward=forward) self._layers = [tok2vec, lower_model] if upper_model is not None: self._layers.append(upper_model) @@ -239,41 +230,47 @@ class ParserModel(Model): for class_ in unseen_classes: self.unseen_classes.add(class_) - def begin_update(self, docs, drop=0.): - step_model = ParserStepModel(docs, self._layers, drop=drop, - unseen_classes=self.unseen_classes) - def finish_parser_update(golds, sgd=None): - step_model.make_updates(sgd) - return None - return step_model, finish_parser_update + def predict(self, docs): + step_model = ParserStepModel(docs, self._layers, + unseen_classes=self.unseen_classes, train=False) + return step_model - def resize_output(self, new_output): + def resize_output(self, new_nO): if len(self._layers) == 2: - return - if new_output == self.upper.nO: + return + if new_nO == self.upper.get_dim("nO"): return smaller = self.upper - - with Model.use_device('cpu'): - larger = Affine(new_output, smaller.nI) - larger.W.fill(0.0) - larger.b.fill(0.0) - # It seems very unhappy if I pass these as smaller.W? - # Seems to segfault. Maybe it's a descriptor protocol thing? - smaller_W = smaller.W - larger_W = larger.W - smaller_b = smaller.b - larger_b = larger.b + nI = smaller.get_dim("nI") + with use_ops('numpy'): + larger = Linear(new_nO, nI) + larger_W = larger.ops.alloc2f(new_nO, nI) + larger_b = larger.ops.alloc1f(new_nO) + smaller_W = smaller.get_param("W") + smaller_b = smaller.get_param("b") # Weights are stored in (nr_out, nr_in) format, so we're basically # just adding rows here. 
- larger_W[:smaller.nO] = smaller_W - larger_b[:smaller.nO] = smaller_b + larger_W[:smaller.get_dim("nO")] = smaller_W + larger_b[:smaller.get_dim("nO")] = smaller_b + larger.set_param("W", larger_W) + larger.set_param("b", larger_b) self._layers[-1] = larger - for i in range(smaller.nO, new_output): + for i in range(smaller.get_dim("nO"), new_nO): self.unseen_classes.add(i) - def begin_training(self, X, y=None): - self.lower.begin_training(X, y=y) + def initialize(self, X=None, Y=None): + self.tok2vec.initialize() + self.lower.initialize(X=X, Y=Y) + if self.upper is not None: + # In case we need to trigger the callbacks + statevecs = self.ops.alloc((2, self.lower.get_dim("nO"))) + self.upper.initialize(X=statevecs) + + def finish_update(self, optimizer): + self.tok2vec.finish_update(optimizer) + self.lower.finish_update(optimizer) + if self.upper is not None: + self.upper.finish_update(optimizer) @property def tok2vec(self): @@ -288,17 +285,25 @@ class ParserModel(Model): return self._layers[2] +def forward(model:ParserModel, X, is_train): + step_model = ParserStepModel(X, model._layers, unseen_classes=model.unseen_classes, + train=is_train) + + return step_model, step_model.finish_steps + + class ParserStepModel(Model): - def __init__(self, docs, layers, unseen_classes=None, drop=0.): - self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop) - if layers[1].nP >= 2: + def __init__(self, docs, layers, unseen_classes=None, train=True): + Model.__init__(self, name="parser_step_model", forward=step_forward) + self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train) + if layers[1].get_dim("nP") >= 2: activation = "maxout" elif len(layers) == 2: activation = None else: activation = "relu" self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1], - activation=activation, drop=drop) + activation=activation, train=train) if len(layers) == 3: self.vec2scores = layers[-1] else: @@ -308,7 +313,7 @@ class ParserStepModel(Model): if self.vec2scores is None: self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f') else: - self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f') + self._class_mask = numpy.zeros((self.vec2scores.get_dim("nO"),), dtype='f') self._class_mask.fill(1) if unseen_classes is not None: for class_ in unseen_classes: @@ -327,40 +332,6 @@ class ParserStepModel(Model): def mark_class_seen(self, class_): self._class_mask[class_] = 1 - def begin_update(self, states, drop=0.): - token_ids = self.get_token_ids(states) - vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0) - if self.vec2scores is not None: - mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop) - if mask is not None: - vector *= mask - scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop) - else: - scores = NumpyOps().asarray(vector) - get_d_vector = lambda d_scores, sgd=None: d_scores - mask = None - # If the class is unseen, make sure its score is minimum - scores[:, self._class_mask == 0] = numpy.nanmin(scores) - - def backprop_parser_step(d_scores, sgd=None): - # Zero vectors for unseen classes - d_scores *= self._class_mask - d_vector = get_d_vector(d_scores, sgd=sgd) - if mask is not None: - d_vector *= mask - if isinstance(self.state2vec.ops, CupyOps) \ - and not isinstance(token_ids, self.state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to GPU, asynchronously - self.backprops.append(( - util.get_async(self.cuda_stream, token_ids), - util.get_async(self.cuda_stream, d_vector), - get_d_tokvecs - )) - else: - 
self.backprops.append((token_ids, d_vector, get_d_tokvecs)) - return None - return scores, backprop_parser_step - def get_token_ids(self, batch): states = _beam_utils.collect_states(batch) cdef StateClass state @@ -374,25 +345,56 @@ class ParserStepModel(Model): c_ids += ids.shape[1] return ids - def make_updates(self, sgd): + def finish_steps(self, golds): # Add a padding vector to the d_tokvecs gradient, so that missing # values don't affect the real gradient. - d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1])) + d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1])) # Tells CUDA to block, so our async copies complete. if self.cuda_stream is not None: self.cuda_stream.synchronize() for ids, d_vector, bp_vector in self.backprops: - d_state_features = bp_vector((d_vector, ids), sgd=sgd) + d_state_features = bp_vector((d_vector, ids)) ids = ids.flatten() d_state_features = d_state_features.reshape( (ids.size, d_state_features.shape[2])) self.ops.scatter_add(d_tokvecs, ids, d_state_features) # Padded -- see update() - self.bp_tokvecs(d_tokvecs[:-1], sgd=sgd) + if isinstance(self.ops, CupyOps): + d_tokvecs = self.ops.to_numpy(d_tokvecs) + self.bp_tokvecs(d_tokvecs[:-1]) return d_tokvecs +def step_forward(model: ParserStepModel, states, is_train): + token_ids = model.get_token_ids(states) + vector, get_d_tokvecs = model.state2vec(token_ids, is_train) + if model.vec2scores is not None: + scores, get_d_vector = model.vec2scores(vector, is_train) + else: + scores = NumpyOps().asarray(vector) + get_d_vector = lambda d_scores: d_scores + # If the class is unseen, make sure its score is minimum + scores[:, model._class_mask == 0] = numpy.nanmin(scores) + + def backprop_parser_step(d_scores): + # Zero vectors for unseen classes + d_scores *= model._class_mask + d_vector = get_d_vector(d_scores) + if isinstance(model.state2vec.ops, CupyOps) \ + and not isinstance(token_ids, model.state2vec.ops.xp.ndarray): + # Move token_ids and d_vector to GPU, asynchronously + model.backprops.append(( + util.get_async(model.cuda_stream, token_ids), + util.get_async(model.cuda_stream, d_vector), + get_d_tokvecs + )) + else: + model.backprops.append((token_ids, d_vector, get_d_tokvecs)) + return None + return scores, backprop_parser_step + + cdef class precompute_hiddens: """Allow a model to be "primed" by pre-computing input features in bulk. @@ -410,7 +412,7 @@ cdef class precompute_hiddens: we can do all our hard maths up front, packed into large multiplications, and do the hard-to-program parsing on the CPU. 
""" - cdef readonly int nF, nO, nP + cdef readonly int nF, nO, nP # TODO: make these more like the dimensions in thinc cdef bint _is_synchronized cdef public object ops cdef np.ndarray _features @@ -421,8 +423,8 @@ cdef class precompute_hiddens: cdef object activation def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, - activation="maxout", drop=0.): - gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop) + activation="maxout", train=False): + gpu_cached, bp_features = lower_model(tokvecs, train) cdef np.ndarray cached if not isinstance(gpu_cached, numpy.ndarray): # Note the passing of cuda_stream here: it lets @@ -431,12 +433,16 @@ cdef class precompute_hiddens: cached = gpu_cached.get(stream=cuda_stream) else: cached = gpu_cached - if not isinstance(lower_model.b, numpy.ndarray): - self.bias = lower_model.b.get() + if not isinstance(lower_model.get_param("b"), numpy.ndarray): + # self.bias = lower_model.get_param("b").get(stream=cuda_stream) ??? + self.bias = lower_model.get_param("b") else: - self.bias = lower_model.b + self.bias = lower_model.get_param("b") self.nF = cached.shape[1] - self.nP = getattr(lower_model, 'nP', 1) + if lower_model.has_dim("nP"): + self.nP = lower_model.get_dim("nP") + else: + self.nP = 1 self.nO = cached.shape[2] self.ops = lower_model.ops assert activation in (None, "relu", "maxout") @@ -452,10 +458,26 @@ cdef class precompute_hiddens: self._is_synchronized = True return self._cached.data - def __call__(self, X): - return self.begin_update(X, drop=None)[0] + def get_dim(self, name): + if name == "nF": + return self.nF + elif name == "nP": + return self.nP + elif name == "nO": + return self.nO + else: + raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") - def begin_update(self, token_ids, drop=0.): + def __call__(self, X, bint is_train): + if is_train: + return self.begin_update(X) + else: + return self.predict(X), lambda X: X + + def predict(self, X): + return self.begin_update(X)[0] + + def begin_update(self, token_ids): cdef np.ndarray state_vector = numpy.zeros( (token_ids.shape[0], self.nO, self.nP), dtype='f') # This is tricky, but (assuming GPU available); @@ -470,13 +492,13 @@ cdef class precompute_hiddens: sum_state_features(state_vector.data, feat_weights, &ids[0,0], token_ids.shape[0], self.nF, self.nO*self.nP) - state_vector += self.bias + state_vector = state_vector + self.bias state_vector, bp_nonlinearity = self._nonlinearity(state_vector) - def backward(d_state_vector_ids, sgd=None): + def backward(d_state_vector_ids): d_state_vector, token_ids = d_state_vector_ids - d_state_vector = bp_nonlinearity(d_state_vector, sgd) - d_tokens = bp_hiddens((d_state_vector, token_ids), sgd) + d_state_vector = bp_nonlinearity(d_state_vector) + d_tokens = bp_hiddens((d_state_vector, token_ids)) return d_tokens return state_vector, backward @@ -485,7 +507,7 @@ cdef class precompute_hiddens: ops = NumpyOps() else: ops = CupyOps() - + if self.activation == "maxout": state_vector, mask = ops.maxout(state_vector) else: @@ -496,7 +518,7 @@ cdef class precompute_hiddens: else: mask = None - def backprop_nonlinearity(d_best, sgd=None): + def backprop_nonlinearity(d_best): if isinstance(d_best, numpy.ndarray): ops = NumpyOps() else: diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 972ad682a..9e9593eee 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -1,6 +1,6 @@ from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t +from ..typedefs cimport weight_t 
from .stateclass cimport StateClass from ..typedefs cimport attr_t diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index efe8573c1..5ec169428 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -1,12 +1,9 @@ # cython: profile=True # cython: cdivision=True # cython: infer_types=True -# coding: utf-8 -from __future__ import unicode_literals - from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool -from collections import OrderedDict, defaultdict, Counter +from collections import defaultdict, Counter from thinc.extra.search cimport Beam import json @@ -25,7 +22,7 @@ from ..tokens.doc cimport Doc, set_children_from_heads # Calculate cost as gold/not gold. We don't use scalar value anyway. cdef int BINARY_COSTS = 1 cdef weight_t MIN_SCORE = -90000 -cdef attr_t SUBTOK_LABEL = hash_string('subtok') +cdef attr_t SUBTOK_LABEL = hash_string(u'subtok') DEF NON_MONOTONIC = True DEF USE_BREAK = True @@ -347,20 +344,20 @@ cdef class ArcEager(TransitionSystem): for label in kwargs.get('right_labels', []): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 - for raw_text, sents in kwargs.get('gold_parses', []): - for (ids, words, tags, heads, labels, iob), ctnts in sents: - heads, labels = nonproj.projectivize(heads, labels) - for child, head, label in zip(ids, heads, labels): - if label.upper() == 'ROOT' : - label = 'ROOT' - if head == child: - actions[BREAK][label] += 1 - elif head < child: - actions[RIGHT][label] += 1 - actions[REDUCE][''] += 1 - elif head > child: - actions[LEFT][label] += 1 - actions[SHIFT][''] += 1 + for example in kwargs.get('gold_parses', []): + heads, labels = nonproj.projectivize(example.token_annotation.heads, + example.token_annotation.deps) + for child, head, label in zip(example.token_annotation.ids, heads, labels): + if label.upper() == 'ROOT' : + label = 'ROOT' + if head == child: + actions[BREAK][label] += 1 + elif head < child: + actions[RIGHT][label] += 1 + actions[REDUCE][''] += 1 + elif head > child: + actions[LEFT][label] += 1 + actions[SHIFT][''] += 1 if min_freq is not None: for action, label_freqs in actions.items(): for label, freq in list(label_freqs.items()): @@ -403,7 +400,9 @@ cdef class ArcEager(TransitionSystem): self.strings[state.safe_get(i).dep])) else: predicted.add((i, state.H(i), 'ROOT')) - id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] + id_ = gold.orig.ids[gold.cand_to_gold[i]] + head = gold.orig.heads[gold.cand_to_gold[i]] + dep = gold.orig.deps[gold.cand_to_gold[i]] truth.add((id_, head, dep)) return truth == predicted diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 9f8ad418c..50b916fe2 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -1,10 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam -from collections import OrderedDict, Counter +from collections import Counter +from ..typedefs cimport weight_t from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition @@ -72,13 +69,12 @@ cdef class BiluoPushDown(TransitionSystem): for action in (BEGIN, IN, LAST, UNIT): actions[action][entity_type] = 1 moves = ('M', 'B', 'I', 'L', 'U') - for raw_text, sents in kwargs.get('gold_parses', []): - for (ids, words, tags, heads, labels, biluo), _ in sents: - for i, ner_tag in enumerate(biluo): - if ner_tag != 'O' and ner_tag != '-': - _, label = ner_tag.split('-', 1) - for action in (BEGIN, IN, LAST, UNIT): - 
actions[action][label] += 1 + for example in kwargs.get('gold_parses', []): + for i, ner_tag in enumerate(example.token_annotation.entities): + if ner_tag != 'O' and ner_tag != '-': + _, label = ner_tag.split('-', 1) + for action in (BEGIN, IN, LAST, UNIT): + actions[action][label] += 1 return actions @property diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 707c9654c..d77a04420 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/syntax/nn_parser.pxd @@ -1,5 +1,3 @@ -from thinc.typedefs cimport atom_t - from .stateclass cimport StateClass from .arc_eager cimport TransitionSystem from ..vocab cimport Vocab diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 153ca67cd..cf57e1cf6 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -1,15 +1,8 @@ # cython: infer_types=True # cython: cdivision=True # cython: boundscheck=False -# coding: utf-8 -from __future__ import unicode_literals, print_function - -from collections import OrderedDict -import numpy cimport cython.parallel -import numpy.random cimport numpy as np -from itertools import islice from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from libc.math cimport exp @@ -17,23 +10,24 @@ from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t, class_t, hash_t from thinc.extra.search cimport Beam -from thinc.api import chain, clone -from thinc.v2v import Model, Maxout, Affine -from thinc.misc import LayerNorm -from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.util import get_array_module -from thinc.linalg cimport Vec, VecVec -import srsly +from thinc.backends.linalg cimport Vec, VecVec +from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops +from thinc.api import get_array_module, zero_init, set_dropout_rate +from itertools import islice +import srsly +import numpy.random +import numpy + +from ..gold import Example +from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ._parser_model cimport get_c_weights, get_c_sizes from ._parser_model import ParserModel -from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten -from .._ml import link_vectors_to_models, create_default_optimizer +from ..util import link_vectors_to_models, create_default_optimizer from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse @@ -47,6 +41,10 @@ from . import _beam_utils from . import nonproj +from ..ml._layers import PrecomputableAffine +from ..ml.component_models import Tok2Vec + + cdef class Parser: """ Base class of the DependencyParser and EntityRecognizer. 
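The hunks that follow migrate `Parser.Model` and its helpers from the thinc v7 style (attribute access like `model.nO` and `model.W`, `Model.use_device('cpu')`, `begin_training`) to the thinc v8 API (`get_dim`/`set_dim`, `get_param`/`set_param`, `use_ops('numpy')`, `initialize`). As orientation, a minimal sketch of those idioms, assuming thinc v8; the `lower` and `model` names below are illustrative and not part of the patch:

    from thinc.api import Linear, chain, zero_init
    import numpy

    # Dimensions and parameters are reached through getters/setters rather
    # than attributes such as .nO or .W -- the pattern this diff adopts.
    lower = Linear(nO=64, nI=96, init_W=zero_init)  # stands in for the old Affine
    model = chain(lower, Linear(nO=8))
    model.initialize(X=numpy.zeros((4, 96), dtype="f"))
    assert lower.get_dim("nO") == 64                # was: lower.nO
    W = lower.get_param("W")                        # was: lower.W
    lower.set_param("W", W)                         # writes go through set_param

Routing reads and writes through `get_param`/`set_param` is what lets the patch replace direct weight mutation (e.g. `larger.W.fill(0.0)`) with allocate-then-set (`larger.ops.alloc2f(...)` followed by `larger.set_param("W", ...)`).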
@@ -74,23 +72,23 @@ cdef class Parser: parser_maxout_pieces = 1 embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000)) pretrained_vectors = cfg.get('pretrained_vectors', None) - tok2vec = Tok2Vec(token_vector_width, embed_size, + tok2vec = Tok2Vec(width=token_vector_width, + embed_size=embed_size, conv_depth=conv_depth, - conv_window=conv_window, + window_size=conv_window, cnn_maxout_pieces=t2v_pieces, subword_features=subword_features, pretrained_vectors=pretrained_vectors, bilstm_depth=bilstm_depth) - tok2vec = chain(tok2vec, flatten) - tok2vec.nO = token_vector_width + tok2vec = chain(tok2vec, list2array()) + tok2vec.set_dim("nO", token_vector_width) lower = PrecomputableAffine(hidden_width, nF=nr_feature_tokens, nI=token_vector_width, nP=parser_maxout_pieces) - lower.nP = parser_maxout_pieces + lower.set_dim("nP", parser_maxout_pieces) if depth == 1: - with Model.use_device('cpu'): - upper = Affine(nr_class, hidden_width, drop_factor=0.0) - upper.W *= 0 + with use_ops('numpy'): + upper = Linear(nr_class, hidden_width, init_W=zero_init) else: upper = None @@ -105,11 +103,13 @@ cdef class Parser: 'bilstm_depth': bilstm_depth, 'self_attn_depth': self_attn_depth, 'conv_depth': conv_depth, - 'conv_window': conv_window, + 'window_size': conv_window, 'embed_size': embed_size, 'cnn_maxout_pieces': t2v_pieces } - return ParserModel(tok2vec, lower, upper), cfg + model = ParserModel(tok2vec, lower, upper) + model.initialize() + return model, cfg name = 'base_parser' @@ -201,7 +201,7 @@ cdef class Parser: # Defined in subclasses, to avoid circular import raise NotImplementedError - def init_multitask_objectives(self, get_gold_tuples, pipeline, **cfg): + def init_multitask_objectives(self, get_examples, pipeline, **cfg): '''Setup models for secondary objectives, to benefit from multi-task learning. This method is intended to be overridden by subclasses. @@ -211,9 +211,9 @@ cdef class Parser: ''' pass - def preprocess_gold(self, docs_golds): - for doc, gold in docs_golds: - yield doc, gold + def preprocess_gold(self, examples): + for ex in examples: + yield ex def use_params(self, params): # Can't decorate cdef class :(. Workaround. @@ -234,7 +234,8 @@ cdef class Parser: self.set_annotations([doc], states, tensors=None) return doc - def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None): + def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None, + as_example=False): """Process a stream of documents. stream: The sequence of documents to process. 
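The next hunk wires up the `as_example` flag introduced in the `pipe()` signature above: the batch is first mapped through `self._get_doc(ex)`, so callers may pass either `Doc` or `Example` objects, and with `as_example=True` the annotated `Example`s are yielded back instead of plain `Doc`s. A hedged usage sketch; `parser`, `docs` and `examples` are assumed to exist and are not defined in this patch:

    # Default behaviour is unchanged: Doc objects out, annotations set.
    for doc in parser.pipe(docs):
        pass
    # New: Example objects are round-tripped, with the processed Doc
    # attached to each example before it is yielded.
    for ex in parser.pipe(examples, as_example=True):
        assert ex.doc is not None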
@@ -247,14 +248,21 @@ cdef class Parser: cdef Doc doc for batch in util.minibatch(docs, size=batch_size): batch_in_order = list(batch) - by_length = sorted(batch_in_order, key=lambda doc: len(doc)) + docs = [self._get_doc(ex) for ex in batch_in_order] + by_length = sorted(docs, key=lambda doc: len(doc)) for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): subbatch = list(subbatch) parse_states = self.predict(subbatch, beam_width=beam_width, beam_density=beam_density) self.set_annotations(subbatch, parse_states, tensors=None) - for doc in batch_in_order: - yield doc + if as_example: + annotated_examples = [] + for ex, doc in zip(batch_in_order, docs): + ex.doc = doc + annotated_examples.append(ex) + yield from annotated_examples + else: + yield from batch_in_order def require_model(self): """Raise an error if the component's model is not initialized.""" @@ -278,12 +286,13 @@ cdef class Parser: def greedy_parse(self, docs, drop=0.): cdef vector[StateC*] states cdef StateClass state + set_dropout_rate(self.model, drop) batch = self.moves.init_batch(docs) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. self._resize() - model = self.model(docs) + model = self.model.predict(docs) weights = get_c_weights(model) for state in batch: if not state.is_final(): @@ -298,18 +307,19 @@ cdef class Parser: cdef Beam beam cdef Doc doc cdef np.ndarray token_ids + set_dropout_rate(self.model, drop) beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. self._resize() - model = self.model(docs) + model = self.model.predict(docs) token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), dtype='i', order='C') cdef int* c_ids cdef int nr_feature = self.cfg["nr_feature_tokens"] cdef int n_states - model = self.model(docs) + model = self.model.predict(docs) todo = [beam for beam in beams if not beam.is_done] while todo: token_ids.fill(-1) @@ -326,8 +336,8 @@ cdef class Parser: n_states += 1 if n_states == 0: break - vectors = model.state2vec(token_ids[:n_states]) - scores = model.vec2scores(vectors) + vectors = model.state2vec.predict(token_ids[:n_states]) + scores = model.vec2scores.predict(vectors) todo = self.transition_beams(todo, scores) return beams @@ -419,75 +429,80 @@ cdef class Parser: beam.check_done(_beam_utils.check_final_state, NULL) return [b for b in beams if not b.is_done] - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): self.require_model() - if isinstance(docs, Doc) and isinstance(golds, GoldParse): - docs = [docs] - golds = [golds] - if len(docs) != len(golds): - raise ValueError(Errors.E077.format(value='update', n_docs=len(docs), - n_golds=len(golds))) + examples = Example.to_example_objects(examples) + if losses is None: losses = {} losses.setdefault(self.name, 0.) 
for multitask in self._multitasks: - multitask.update(docs, golds, drop=drop, sgd=sgd) + multitask.update(examples, drop=drop, sgd=sgd) # The probability we use beam update, instead of falling back to # a greedy update beam_update_prob = self.cfg.get('beam_update_prob', 0.5) if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob: - return self.update_beam(docs, golds, self.cfg.get('beam_width', 1), - drop=drop, sgd=sgd, losses=losses, + return self.update_beam(examples, self.cfg.get('beam_width', 1), + drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations, beam_density=self.cfg.get('beam_density', 0.001)) + + set_dropout_rate(self.model, drop) # Chop sequences into lengths of this many transitions, to make the # batch uniform length. cut_gold = numpy.random.choice(range(20, 100)) - states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold) + states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold) states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final() and g is not None] # Prepare the stepwise model, and get the callback for finishing the batch - model, finish_update = self.model.begin_update(docs, drop=drop) + model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples]) + all_states = list(states) for _ in range(max_steps): if not states_golds: break states, golds = zip(*states_golds) - scores, backprop = model.begin_update(states, drop=drop) + scores, backprop = model.begin_update(states) d_scores = self.get_batch_loss(states, golds, scores, losses) - backprop(d_scores, sgd=sgd) + backprop(d_scores) # Follow the predicted action self.transition_states(states, scores) states_golds = [eg for eg in states_golds if not eg[0].is_final()] - # Do the backprop - finish_update(golds, sgd=sgd) + backprop_tok2vec(golds) + if sgd is not None: + self.model.finish_update(sgd) + if set_annotations: + docs = [ex.doc for ex in examples] + self.set_annotations(docs, all_states) return losses - def rehearse(self, docs, sgd=None, losses=None, **cfg): + def rehearse(self, examples, sgd=None, losses=None, **cfg): """Perform a "rehearsal" update, to prevent catastrophic forgetting.""" - if isinstance(docs, Doc): - docs = [docs] + examples = Example.to_example_objects(examples) if losses is None: losses = {} for multitask in self._multitasks: if hasattr(multitask, 'rehearse'): - multitask.rehearse(docs, losses=losses, sgd=sgd) + multitask.rehearse(examples, losses=losses, sgd=sgd) if self._rehearsal_model is None: return None losses.setdefault(self.name, 0.) + docs = [ex.doc for ex in examples] states = self.moves.init_batch(docs) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. self._resize() # Prepare the stepwise model, and get the callback for finishing the batch - tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0) - model, finish_update = self.model.begin_update(docs, drop=0.0) + set_dropout_rate(self._rehearsal_model, 0.0) + set_dropout_rate(self.model, 0.0) + tutor, _ = self._rehearsal_model.begin_update(docs) + model, finish_update = self.model.begin_update(docs) n_scores = 0. loss = 0. while states: - targets, _ = tutor.begin_update(states, drop=0.) - guesses, backprop = model.begin_update(states, drop=0.) 
+ targets, _ = tutor.begin_update(states) + guesses, backprop = model.begin_update(states) d_scores = (guesses - targets) / targets.shape[0] # If all weights for an output are 0 in the original model, don't # supervise that output. This allows us to add classes. @@ -498,25 +513,33 @@ cdef class Parser: states = [state for state in states if not state.is_final()] n_scores += d_scores.size # Do the backprop - finish_update(docs, sgd=sgd) + finish_update(docs) + if sgd is not None: + self.model.finish_update(sgd) losses[self.name] += loss / n_scores return losses - def update_beam(self, docs, golds, width, drop=0., sgd=None, losses=None, - beam_density=0.0): + def update_beam(self, examples, width, drop=0., sgd=None, losses=None, + set_annotations=False, beam_density=0.0): + examples = Example.to_example_objects(examples) + docs = [ex.doc for ex in examples] + golds = [ex.gold for ex in examples] + new_golds = [] lengths = [len(d) for d in docs] states = self.moves.init_batch(docs) for gold in golds: self.moves.preprocess_gold(gold) - model, finish_update = self.model.begin_update(docs, drop=drop) + new_golds.append(gold) + set_dropout_rate(self.model, drop) + model, backprop_tok2vec = self.model.begin_update(docs) states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec, - model.vec2scores, width, drop=drop, losses=losses, + self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, + model.state2vec, model.vec2scores, width, losses=losses, beam_density=beam_density) for i, d_scores in enumerate(states_d_scores): losses[self.name] += (d_scores**2).mean() ids, bp_vectors, bp_scores = backprops[i] - d_vector = bp_scores(d_scores, sgd=sgd) + d_vector = bp_scores(d_scores) if isinstance(model.ops, CupyOps) \ and not isinstance(ids, model.state2vec.ops.xp.ndarray): model.backprops.append(( @@ -525,12 +548,35 @@ cdef class Parser: bp_vectors)) else: model.backprops.append((ids, d_vector, bp_vectors)) - model.make_updates(sgd) + backprop_tok2vec(golds) + if sgd is not None: + self.model.finish_update(sgd) + if set_annotations: + self.set_annotations(docs, beams) cdef Beam beam for beam in beams: _beam_utils.cleanup_beam(beam) - def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500): + def get_gradients(self): + """Get non-zero gradients of the model's parameters, as a dictionary + keyed by the parameter ID. The values are (weights, gradients) tuples. + """ + gradients = {} + if self.model in (None, True, False): + return gradients + queue = [self.model] + seen = set() + for node in queue: + if node.id in seen: + continue + seen.add(node.id) + if hasattr(node, "_mem") and node._mem.gradient.any(): + gradients[node.id] = [node._mem.weights, node._mem.gradient] + if hasattr(node, "_layers"): + queue.extend(node._layers) + return gradients + + def _init_gold_batch(self, whole_examples, min_length=5, max_length=500): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. Let's say we have a doc of length 2*N, where N is the shortest doc. 
We'll make two states, one representing @@ -538,6 +584,8 @@ cdef class Parser: cdef: StateClass state Transition action + whole_docs = [ex.doc for ex in whole_examples] + whole_golds = [ex.gold for ex in whole_examples] whole_states = self.moves.init_batch(whole_docs) max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs]))) max_moves = 0 @@ -597,17 +645,16 @@ cdef class Parser: return d_scores def create_optimizer(self): - return create_default_optimizer(self.model.ops, - **self.cfg.get('optimizer', {})) + return create_default_optimizer() - def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): + def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg): if 'model' in cfg: self.model = cfg['model'] - if not hasattr(get_gold_tuples, '__call__'): - gold_tuples = get_gold_tuples - get_gold_tuples = lambda: gold_tuples + if not hasattr(get_examples, '__call__'): + gold_tuples = get_examples + get_examples = lambda: gold_tuples cfg.setdefault('min_action_freq', 30) - actions = self.moves.get_actions(gold_parses=get_gold_tuples(), + actions = self.moves.get_actions(gold_parses=get_examples(), min_freq=cfg.get('min_action_freq', 30), learn_tokens=self.cfg.get("learn_tokens", False)) for action, labels in self.moves.labels.items(): @@ -623,23 +670,30 @@ cdef class Parser: sgd = self.create_optimizer() doc_sample = [] gold_sample = [] - for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): - for annots, brackets in annots_brackets: - ids, words, tags, heads, deps, ents = annots - doc_sample.append(Doc(self.vocab, words=words)) - gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags, - heads=heads, deps=deps, entities=ents)) - self.model.begin_training(doc_sample, gold_sample) + for example in islice(get_examples(), 1000): + parses = example.get_gold_parses(merge=False, vocab=self.vocab) + for doc, gold in parses: + doc_sample.append(doc) + gold_sample.append(gold) + self.model.initialize(doc_sample, gold_sample) if pipeline is not None: - self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) + self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) else: if sgd is None: sgd = self.create_optimizer() - self.model.begin_training([]) + if self.model.upper.has_dim("nO") is None: + self.model.upper.set_dim("nO", self.moves.n_moves) + self.model.initialize() self.cfg.update(cfg) return sgd + def _get_doc(self, example): + """ Use this method if the `example` can be either a Doc or an Example """ + if isinstance(example, Doc): + return example + return example.doc + def to_disk(self, path, exclude=tuple(), **kwargs): serializers = { 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), @@ -675,28 +729,28 @@ cdef class Parser: return self def to_bytes(self, exclude=tuple(), **kwargs): - serializers = OrderedDict(( - ('model', lambda: (self.model.to_bytes() if self.model is not True else True)), - ('vocab', lambda: self.vocab.to_bytes()), - ('moves', lambda: self.moves.to_bytes(exclude=["strings"])), - ('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)) - )) + serializers = { + "model": lambda: (self.model.to_bytes() if self.model is not True else True), + "vocab": lambda: self.vocab.to_bytes(), + "moves": lambda: self.moves.to_bytes(exclude=["strings"]), + "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) + } exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return
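The serialization hunks swap OrderedDict literals for plain dicts: with Python 2 support gone, the code can rely on dict preserving insertion order (guaranteed since Python 3.7, and true in CPython 3.6), so the named serializer callbacks still run in declaration order. The driver behind these tables looks roughly like this (a hypothetical stand-in for spaCy's util.to_bytes, not its actual implementation):

    import srsly

    def to_bytes_table(serializers, exclude=()):
        # Call each serializer in declaration order, skip excluded keys,
        # and pack the results into one msgpack payload.
        msg = {key: fn() for key, fn in serializers.items() if key not in exclude}
        return srsly.msgpack_dumps(msg)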
util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): - deserializers = OrderedDict(( - ('vocab', lambda b: self.vocab.from_bytes(b)), - ('moves', lambda b: self.moves.from_bytes(b, exclude=["strings"])), - ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), - ('model', lambda b: None) - )) + deserializers = { + "vocab": lambda b: self.vocab.from_bytes(b), + "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: None + } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) if 'model' not in exclude: # TODO: Remove this once we don't have to handle previous models if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: - self.cfg['pretrained_vectors'] = self.vocab.vectors.name + self.cfg['pretrained_vectors'] = self.vocab.vectors if self.model is True: self.model, cfg = self.Model(**self.cfg) else: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 53e8a9cfe..27516ffd9 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,14 +1,12 @@ -# coding: utf-8 # cython: profile=True # cython: infer_types=True """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. """ -from __future__ import unicode_literals - from copy import copy +from ..gold import Example from ..tokens.doc cimport Doc, set_children_from_heads from ..errors import Errors @@ -77,39 +75,41 @@ def decompose(label): def is_decorated(label): return DELIMITER in label -def count_decorated_labels(gold_tuples): +def count_decorated_labels(gold_data): freqs = {} - for raw_text, sents in gold_tuples: - for (ids, words, tags, heads, labels, iob), ctnts in sents: - proj_heads, deco_labels = projectivize(heads, labels) - # set the label to ROOT for each root dependent - deco_labels = ['ROOT' if head == i else deco_labels[i] - for i, head in enumerate(proj_heads)] - # count label frequencies - for label in deco_labels: - if is_decorated(label): - freqs[label] = freqs.get(label, 0) + 1 + for example in gold_data: + proj_heads, deco_deps = projectivize(example.token_annotation.heads, + example.token_annotation.deps) + # set the label to ROOT for each root dependent + deco_deps = ['ROOT' if head == i else deco_deps[i] + for i, head in enumerate(proj_heads)] + # count label frequencies + for label in deco_deps: + if is_decorated(label): + freqs[label] = freqs.get(label, 0) + 1 return freqs -def preprocess_training_data(gold_tuples, label_freq_cutoff=30): +def preprocess_training_data(gold_data, label_freq_cutoff=30): preprocessed = [] freqs = {} - for raw_text, sents in gold_tuples: - prepro_sents = [] - for (ids, words, tags, heads, labels, iob), ctnts in sents: - proj_heads, deco_labels = projectivize(heads, labels) - # set the label to ROOT for each root dependent - deco_labels = ['ROOT' if head == i else deco_labels[i] - for i, head in enumerate(proj_heads)] - # count label frequencies - if label_freq_cutoff > 0: - for label in deco_labels: - if is_decorated(label): - freqs[label] = freqs.get(label, 0) + 1 - prepro_sents.append( - ((ids, words, tags, proj_heads, deco_labels, iob), ctnts)) - preprocessed.append((raw_text, prepro_sents)) + for example in gold_data: + new_example = Example(doc=example.doc) + proj_heads, deco_deps = 
projectivize(example.token_annotation.heads, + example.token_annotation.deps) + # set the label to ROOT for each root dependent + deco_deps = ['ROOT' if head == i else deco_deps[i] + for i, head in enumerate(proj_heads)] + # count label frequencies + if label_freq_cutoff > 0: + for label in deco_deps: + if is_decorated(label): + freqs[label] = freqs.get(label, 0) + 1 + proj_token_dict = example.token_annotation.to_dict() + proj_token_dict["heads"] = proj_heads + proj_token_dict["deps"] = deco_deps + new_example.set_token_annotation(**proj_token_dict) + preprocessed.append(new_example) if label_freq_cutoff > 0: return _filter_labels(preprocessed, label_freq_cutoff, freqs) return preprocessed @@ -154,8 +154,7 @@ def _decorate(heads, proj_heads, labels): deco_labels = [] for tokenid, head in enumerate(heads): if head != proj_heads[tokenid]: - deco_labels.append( - '%s%s%s' % (labels[tokenid], DELIMITER, labels[head])) + deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}") else: deco_labels.append(labels[tokenid]) return deco_labels @@ -203,20 +202,20 @@ def _find_new_head(token, headlabel): return token.head -def _filter_labels(gold_tuples, cutoff, freqs): +def _filter_labels(examples, cutoff, freqs): # throw away infrequent decorated labels # can't learn them reliably anyway and keeps label set smaller filtered = [] - for raw_text, sents in gold_tuples: - filtered_sents = [] - for (ids, words, tags, heads, labels, iob), ctnts in sents: - filtered_labels = [] - for label in labels: - if is_decorated(label) and freqs.get(label, 0) < cutoff: - filtered_labels.append(decompose(label)[0]) - else: - filtered_labels.append(label) - filtered_sents.append( - ((ids, words, tags, heads, filtered_labels, iob), ctnts)) - filtered.append((raw_text, filtered_sents)) + for example in examples: + new_example = Example(doc=example.doc) + filtered_labels = [] + for label in example.token_annotation.deps: + if is_decorated(label) and freqs.get(label, 0) < cutoff: + filtered_labels.append(decompose(label)[0]) + else: + filtered_labels.append(label) + filtered_token_dict = example.token_annotation.to_dict() + filtered_token_dict["deps"] = filtered_labels + new_example.set_token_annotation(**filtered_token_dict) + filtered.append(new_example) return filtered diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 2a15a2de1..e472e9861 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -1,7 +1,4 @@ -# coding: utf-8 # cython: infer_types=True -from __future__ import unicode_literals - import numpy from ..tokens.doc cimport Doc @@ -49,9 +46,9 @@ cdef class StateClass: def print_state(self, words): words = list(words) + ['_'] - top = words[self.S(0)] + '_%d' % self.S_(0).head - second = words[self.S(1)] + '_%d' % self.S_(1).head - third = words[self.S(2)] + '_%d' % self.S_(2).head + top = f"{words[self.S(0)]}_{self.S_(0).head}" + second = f"{words[self.S(1)]}_{self.S_(1).head}" + third = f"{words[self.S(2)]}_{self.S_(2).head}" n0 = words[self.B(0)] n1 = words[self.B(1)] return ' '.join((third, second, top, '|', n0, n1)) diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index a5fe55918..bd706a997 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -1,7 +1,6 @@ from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t -from ..typedefs cimport attr_t +from ..typedefs cimport attr_t, weight_t from ..structs cimport TokenC from ..gold cimport GoldParse from ..gold cimport 
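The rewritten nonproj helpers consume Example objects, but the underlying Nivre & Nilsson (2005) scheme is unchanged: projectivize lifts non-projective arcs, and each lifted dependent's label is decorated with its original head's label so deprojectivization can restore the arc after parsing. A toy round-trip of the decoration (DELIMITER is the module's separator, "||"):

    DELIMITER = "||"

    def decorate(dep, head_dep):
        # Mark a lifted arc: keep the dependent's label plus its original head's label.
        return f"{dep}{DELIMITER}{head_dep}"

    def decompose(label):
        # "amod||nsubj" -> ("amod", "nsubj"); undecorated labels yield ("amod", "").
        return label.partition(DELIMITER)[::2]

    assert decorate("amod", "nsubj") == "amod||nsubj"
    assert decompose("amod||nsubj") == ("amod", "nsubj")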
GoldParseC diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 65097f114..6ab83436e 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,12 +1,9 @@ # cython: infer_types=True -# coding: utf-8 -from __future__ import unicode_literals - from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t +from ..typedefs cimport weight_t from thinc.extra.search cimport Beam -from collections import OrderedDict, Counter +from collections import Counter import srsly from . cimport _beam_utils diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 816970e61..b391dd88e 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class @@ -17,11 +14,11 @@ def pytest_runtest_setup(item): # recognize the option we're asking about. To avoid this, we need to # pass a default value. We default to False, i.e., we act like all the # options weren't given. - return item.config.getoption("--%s" % opt, False) + return item.config.getoption(f"--{opt}", False) for opt in ["slow"]: if opt in item.keywords and not getopt(opt): - pytest.skip("need --%s option to run" % opt) + pytest.skip(f"need --{opt} option to run") # Fixtures for language tokenizers (languages sorted alphabetically) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 6c69e699a..766dcb739 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.pipeline import EntityRecognizer from spacy.tokens import Span import pytest diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 7b513cfab..6be6e3867 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 120fb6e28..d986d160c 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.tokens import Doc diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 86c7fbf72..4323bb736 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,13 +1,9 @@ -# coding: utf-8 -from __future__ import unicode_literals - - import pytest import numpy from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.errors import ModelsWarning -from spacy.attrs import ENT_TYPE, ENT_IOB +from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP from ..util import get_doc @@ -274,6 +270,51 @@ def test_doc_is_nered(en_vocab): assert new_doc.is_nered +def test_doc_from_array_sent_starts(en_vocab): + words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] + heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + deps = [ + "ROOT", + "dep", + "dep", + "dep", + "dep", + "dep", + "ROOT", + "dep", + "dep", + "dep", + "dep", + ] + doc = Doc(en_vocab, words=words) + for i, (dep, head) in enumerate(zip(deps, heads)): + doc[i].dep_ = dep + doc[i].head = doc[head] + if head == i: + 
doc[i].is_sent_start = True + doc.is_parsed + + attrs = [SENT_START, HEAD] + arr = doc.to_array(attrs) + new_doc = Doc(en_vocab, words=words) + with pytest.raises(ValueError): + new_doc.from_array(attrs, arr) + + attrs = [SENT_START, DEP] + arr = doc.to_array(attrs) + new_doc = Doc(en_vocab, words=words) + new_doc.from_array(attrs, arr) + assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] + assert not new_doc.is_parsed + + attrs = [HEAD, DEP] + arr = doc.to_array(attrs) + new_doc = Doc(en_vocab, words=words) + new_doc.from_array(attrs, arr) + assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] + assert new_doc.is_parsed + + def test_doc_lang(en_vocab): doc = Doc(en_vocab, words=["Hello", "world"]) assert doc.lang_ == "en" diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 5d570af53..221b6f683 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest @@ -12,22 +9,54 @@ def i_has(en_tokenizer): return doc -def test_token_morph_id(i_has): - assert i_has[0].morph.id - assert i_has[1].morph.id != 0 - assert i_has[0].morph.id != i_has[1].morph.id +def test_token_morph_eq(i_has): + assert i_has[0].morph is not i_has[0].morph + assert i_has[0].morph == i_has[0].morph + assert i_has[0].morph != i_has[1].morph + + +def test_token_morph_key(i_has): + assert i_has[0].morph.key != 0 + assert i_has[1].morph.key != 0 + assert i_has[0].morph.key == i_has[0].morph.key + assert i_has[0].morph.key != i_has[1].morph.key def test_morph_props(i_has): - assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"] - assert i_has[0].morph.pron_type_ == "PronType_prs" - assert i_has[1].morph.pron_type == 0 + assert i_has[0].morph.get("PronType") == ["PronType=prs"] + assert i_has[1].morph.get("PronType") == [] def test_morph_iter(i_has): - assert list(i_has[0].morph) == ["PronType_prs"] - assert list(i_has[1].morph) == ["Number_sing", "Person_three", "VerbForm_fin"] + assert set(i_has[0].morph) == set(["PronType=prs"]) + assert set(i_has[1].morph) == set( + ["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"] + ) def test_morph_get(i_has): - assert i_has[0].morph.get("pron_type") == "PronType_prs" + assert i_has[0].morph.get("PronType") == ["PronType=prs"] + + +def test_morph_set(i_has): + assert i_has[0].morph.get("PronType") == ["PronType=prs"] + # set by string + i_has[0].morph_ = "PronType=unk" + assert i_has[0].morph.get("PronType") == ["PronType=unk"] + # set by string, fields are alphabetized + i_has[0].morph_ = "PronType=123|NounType=unk" + assert i_has[0].morph_ == "NounType=unk|PronType=123" + # set by dict + i_has[0].morph_ = {"AType": "123", "BType": "unk", "POS": "ADJ"} + assert i_has[0].morph_ == "AType=123|BType=unk|POS=ADJ" + # set by string with multiple values, fields and values are alphabetized + i_has[0].morph_ = "BType=c|AType=b,a" + assert i_has[0].morph_ == "AType=a,b|BType=c" + # set by dict with multiple values, fields and values are alphabetized + i_has[0].morph_ = {"AType": "b,a", "BType": "c"} + assert i_has[0].morph_ == "AType=a,b|BType=c" + + +def test_morph_str(i_has): + assert str(i_has[0].morph) == "PronType=prs" + assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin" diff --git a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py index 2b6970a38..28cb66714 100644 --- 
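The rewritten morphology tests capture the move from underscore-joined feature names like PronType_prs to UD-style Field=Value pairs: token.morph_ now accepts either a feature string or a dict, and normalizes field and value order. A short usage sketch mirroring the assertions in test_morph_set (tokenizer-only pipeline; this is the development-branch API):

    from spacy.lang.en import English

    nlp = English()
    token = nlp("I has")[0]

    token.morph_ = {"PronType": "prs"}    # set from a dict
    assert token.morph_ == "PronType=prs"
    assert token.morph.get("PronType") == ["PronType=prs"]

    token.morph_ = "BType=c|AType=b,a"    # fields and values are alphabetized
    assert token.morph_ == "AType=a,b|BType=c"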
a/spacy/tests/doc/test_pickle_doc.py +++ b/spacy/tests/doc/test_pickle_doc.py @@ -1,8 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.language import Language -from spacy.compat import pickle, unicode_ +from spacy.compat import pickle def test_pickle_single_doc(): @@ -16,9 +13,9 @@ def test_pickle_single_doc(): def test_list_of_docs_pickles_efficiently(): nlp = Language() for i in range(10000): - _ = nlp.vocab[unicode_(i)] # noqa: F841 + _ = nlp.vocab[str(i)] # noqa: F841 one_pickled = pickle.dumps(nlp("0"), -1) - docs = list(nlp.pipe(unicode_(i) for i in range(100))) + docs = list(nlp.pipe(str(i) for i in range(100))) many_pickled = pickle.dumps(docs, -1) assert len(many_pickled) < (len(one_pickled) * 2) many_unpickled = pickle.loads(many_pickled) diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 5bdf78f39..5e564d1f2 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import LEMMA from spacy.vocab import Vocab @@ -11,7 +8,12 @@ from ..util import get_doc def test_doc_retokenize_merge(en_tokenizer): text = "WKRO played songs by the beach boys all night" - attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} + attrs = { + "tag": "NAMED", + "lemma": "LEMMA", + "ent_type": "TYPE", + "morph": "Number=Plur", + } doc = en_tokenizer(text) assert len(doc) == 9 with doc.retokenize() as retokenizer: @@ -21,9 +23,11 @@ def test_doc_retokenize_merge(en_tokenizer): assert doc[4].text == "the beach boys" assert doc[4].text_with_ws == "the beach boys " assert doc[4].tag_ == "NAMED" + assert doc[4].morph_ == "Number=Plur" assert doc[5].text == "all night" assert doc[5].text_with_ws == "all night" assert doc[5].tag_ == "NAMED" + assert doc[5].morph_ == "Number=Plur" def test_doc_retokenize_merge_children(en_tokenizer): diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index d074fddc6..5f40da425 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.tokens import Doc, Token @@ -25,15 +22,18 @@ def test_doc_retokenize_split(en_vocab): "tag": ["NNP"] * 2, "lemma": ["Los", "Angeles"], "ent_type": ["GPE"] * 2, + "morph": ["Number=Sing"] * 2, }, ) assert len(doc) == 4 assert doc[0].text == "Los" assert doc[0].head.text == "Angeles" assert doc[0].idx == 0 + assert doc[0].morph_ == "Number=Sing" assert doc[1].idx == 3 assert doc[1].text == "Angeles" assert doc[1].head.text == "start" + assert doc[1].morph_ == "Number=Sing" assert doc[2].text == "start" assert doc[2].head.text == "." assert doc[3].text == "." 
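The retokenizer tests above now thread a morph value through both merge and split alongside tag, lemma and ent_type, checking that morphological features survive retokenization. Condensed usage in the same attrs convention as those tests (again development-branch API):

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("WKRO played the beach boys")
    with doc.retokenize() as retokenizer:
        # Merge a span into one token; attrs are applied to the merged token.
        retokenizer.merge(doc[2:5], attrs={"lemma": "the beach boys", "morph": "Number=Plur"})
    assert doc[2].text == "the beach boys"
    assert doc[2].morph_ == "Number=Plur"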
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 01bb93c50..d7b91d476 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import ORTH, LENGTH from spacy.tokens import Doc, Span @@ -279,3 +276,12 @@ def test_filter_spans(doc): assert len(filtered[1]) == 5 assert filtered[0].start == 1 and filtered[0].end == 4 assert filtered[1].start == 5 and filtered[1].end == 10 + + +def test_span_eq_hash(doc, doc_not_parsed): + assert doc[0:2] == doc[0:2] + assert doc[0:2] != doc[1:3] + assert doc[0:2] != doc_not_parsed[0:2] + assert hash(doc[0:2]) == hash(doc[0:2]) + assert hash(doc[0:2]) != hash(doc[1:3]) + assert hash(doc[0:2]) != hash(doc_not_parsed[0:2]) diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index a063a6569..da3bc7dbb 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -1,9 +1,4 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest -from spacy.cli._schemas import TRAINING_SCHEMA -from spacy.util import get_json_validator, validate_json from spacy.tokens import Doc from ..util import get_doc @@ -58,10 +53,3 @@ def test_doc_to_json_underscore_error_serialize(doc): Doc.set_extension("json_test4", method=lambda doc: doc.text) with pytest.raises(ValueError): doc.to_json(underscore=["json_test4"]) - - -def test_doc_to_json_valid_training(doc): - json_doc = doc.to_json() - validator = get_json_validator(TRAINING_SCHEMA) - errors = validate_json([json_doc], validator) - assert not errors diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index bff2a95c6..cff1d3327 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py index 2877bfeea..352460581 100644 --- a/spacy/tests/doc/test_underscore.py +++ b/spacy/tests/doc/test_underscore.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from mock import Mock from spacy.tokens import Doc, Span, Token diff --git a/spacy/tests/lang/ar/test_exceptions.py b/spacy/tests/lang/ar/test_exceptions.py index 3cfc380d2..125220caf 100644 --- a/spacy/tests/lang/ar/test_exceptions.py +++ b/spacy/tests/lang/ar/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ar/test_text.py b/spacy/tests/lang/ar/test_text.py index 109c3721a..c5ab376f1 100644 --- a/spacy/tests/lang/ar/test_text.py +++ b/spacy/tests/lang/ar/test_text.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - def test_ar_tokenizer_handles_long_text(ar_tokenizer): text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين. 
ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها، diff --git a/spacy/tests/lang/bn/test_tokenizer.py b/spacy/tests/lang/bn/test_tokenizer.py index 62dd52778..5b18c5269 100644 --- a/spacy/tests/lang/bn/test_tokenizer.py +++ b/spacy/tests/lang/bn/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py index 56156c328..71098f094 100644 --- a/spacy/tests/lang/ca/test_exception.py +++ b/spacy/tests/lang/ca/test_exception.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py index 4583a62b9..83a75f056 100644 --- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py index 1506016d4..38f5fc708 100644 --- a/spacy/tests/lang/ca/test_text.py +++ b/spacy/tests/lang/ca/test_text.py @@ -1,10 +1,4 @@ -# coding: utf-8 - """Test that longer and mixed texts are tokenized correctly.""" - - -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index a522ab5e8..603378ea7 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/da/test_prefix_suffix_infix.py b/spacy/tests/lang/da/test_prefix_suffix_infix.py index 8b43bf360..e36b3cdb9 100644 --- a/spacy/tests/lang/da/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/da/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/da/test_text.py b/spacy/tests/lang/da/test_text.py index 07b134e2d..3c6cca5ac 100644 --- a/spacy/tests/lang/da/test_text.py +++ b/spacy/tests/lang/da/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.da.lex_attrs import like_num diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py index 2e065870e..a4614f6c4 100644 --- a/spacy/tests/lang/de/test_exceptions.py +++ b/spacy/tests/lang/de/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/de/test_parser.py b/spacy/tests/lang/de/test_parser.py index 5c8694da3..c897dcf2f 100644 --- a/spacy/tests/lang/de/test_parser.py +++ b/spacy/tests/lang/de/test_parser.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...util import get_doc diff --git a/spacy/tests/lang/de/test_prefix_suffix_infix.py b/spacy/tests/lang/de/test_prefix_suffix_infix.py index 13e109395..82bd8ed69 100644 --- a/spacy/tests/lang/de/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/de/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/de/test_text.py b/spacy/tests/lang/de/test_text.py index b3fb1eaa5..22711763e 100644 --- a/spacy/tests/lang/de/test_text.py +++ b/spacy/tests/lang/de/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from 
__future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/el/test_exception.py b/spacy/tests/lang/el/test_exception.py index b8d10fb69..a4656ea98 100644 --- a/spacy/tests/lang/el/test_exception.py +++ b/spacy/tests/lang/el/test_exception.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/el/test_text.py b/spacy/tests/lang/el/test_text.py index a6395ab4a..1b3ef6182 100644 --- a/spacy/tests/lang/el/test_text.py +++ b/spacy/tests/lang/el/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py index 7f939011f..f5302cb31 100644 --- a/spacy/tests/lang/en/test_customized_tokenizer.py +++ b/spacy/tests/lang/en/test_customized_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from spacy.lang.en import English diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index 6285a9408..b2e941dab 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/en/test_indices.py b/spacy/tests/lang/en/test_indices.py index 8a7bc0323..93daeec30 100644 --- a/spacy/tests/lang/en/test_indices.py +++ b/spacy/tests/lang/en/test_indices.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - def test_en_simple_punct(en_tokenizer): text = "to walk, do foo" tokens = en_tokenizer(text) diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 7dc47f9cc..6739b5137 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import numpy from spacy.attrs import HEAD, DEP from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root diff --git a/spacy/tests/lang/en/test_parser.py b/spacy/tests/lang/en/test_parser.py index ce696bc25..057143696 100644 --- a/spacy/tests/lang/en/test_parser.py +++ b/spacy/tests/lang/en/test_parser.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...util import get_doc diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py index 3dccd6bcf..9efcc1015 100644 --- a/spacy/tests/lang/en/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest @@ -111,7 +108,6 @@ def test_en_tokenizer_splits_double_hyphen_infix(en_tokenizer): assert tokens[9].text == "people" -@pytest.mark.xfail def test_en_tokenizer_splits_period_abbr(en_tokenizer): text = "Today is Tuesday.Mr." 
tokens = en_tokenizer(text) diff --git a/spacy/tests/lang/en/test_punct.py b/spacy/tests/lang/en/test_punct.py index 61274cf14..1d10478a1 100644 --- a/spacy/tests/lang/en/test_punct.py +++ b/spacy/tests/lang/en/test_punct.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import compile_prefix_regex from spacy.lang.punctuation import TOKENIZER_PREFIXES @@ -82,7 +79,6 @@ def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text): assert tokens[0].text == "'" -@pytest.mark.xfail @pytest.mark.parametrize("text", ["Hello''"]) def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text): tokens = en_tokenizer(text) diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 40bd110e8..ba7b2f2cf 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ...util import get_doc, apply_transition_sequence diff --git a/spacy/tests/lang/en/test_tagger.py b/spacy/tests/lang/en/test_tagger.py index 567fd5a44..d9eced2ff 100644 --- a/spacy/tests/lang/en/test_tagger.py +++ b/spacy/tests/lang/en/test_tagger.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...util import get_doc diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py index a7ebde989..c5d56d885 100644 --- a/spacy/tests/lang/en/test_text.py +++ b/spacy/tests/lang/en/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.en.lex_attrs import like_num diff --git a/spacy/tests/lang/es/test_exception.py b/spacy/tests/lang/es/test_exception.py index 8d6164058..90d897a4c 100644 --- a/spacy/tests/lang/es/test_exception.py +++ b/spacy/tests/lang/es/test_exception.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index acd572b48..af7b0212d 100644 --- a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fi/test_text.py b/spacy/tests/lang/fi/test_text.py index 2dd92597e..dbb67ad7a 100644 --- a/spacy/tests/lang/fi/test_text.py +++ b/spacy/tests/lang/fi/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index aab063982..bcd62f239 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest @@ -22,16 +19,10 @@ HYPHENATED_TESTS = [ ABBREVIATION_INFLECTION_TESTS = [ ( "VTT:ssa ennen v:ta 2010 suoritetut mittaukset", - ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"] + ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"], ), - ( - "ALV:n osuus on 24 %.", - ["ALV:n", "osuus", "on", "24", "%", "."] - ), - ( - "Hiihtäjä oli kilpailun 14:s.", - ["Hiihtäjä", "oli", "kilpailun", "14:s", "."] - ) + ("ALV:n osuus on 24 %.", ["ALV:n", "osuus", "on", "24", "%", "."]), + ("Hiihtäjä oli kilpailun 14:s.", ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]), ] diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py index 93dbf0993..98d318f6e 100644 --- 
a/spacy/tests/lang/fr/test_exceptions.py +++ b/spacy/tests/lang/fr/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fr/test_prefix_suffix_infix.py b/spacy/tests/lang/fr/test_prefix_suffix_infix.py index ca6bdbd87..01d50b0a6 100644 --- a/spacy/tests/lang/fr/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/fr/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.language import Language from spacy.lang.punctuation import TOKENIZER_INFIXES diff --git a/spacy/tests/lang/fr/test_text.py b/spacy/tests/lang/fr/test_text.py index 24b4c4532..01231f593 100644 --- a/spacy/tests/lang/fr/test_text.py +++ b/spacy/tests/lang/fr/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.fr.lex_attrs import like_num diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index 29bc1c759..78127ef7c 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/he/test_tokenizer.py b/spacy/tests/lang/he/test_tokenizer.py index f138ec6e7..3131014a3 100644 --- a/spacy/tests/lang/he/test_tokenizer.py +++ b/spacy/tests/lang/he/test_tokenizer.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py index 2fceece49..fd3acd0a0 100644 --- a/spacy/tests/lang/hu/test_tokenizer.py +++ b/spacy/tests/lang/hu/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest @@ -297,12 +294,7 @@ WIKI_TESTS = [ ] EXTRA_TESTS = ( - DOT_TESTS - + QUOTE_TESTS - + NUMBER_TESTS - + HYPHEN_TESTS - + WIKI_TESTS - + TYPO_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS ) # normal: default tests + 10% of extra tests @@ -311,7 +303,14 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0]) # slow: remaining 90% of extra tests SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0] -TESTS.extend([pytest.param(x[0], x[1], marks=pytest.mark.slow()) if not isinstance(x[0], tuple) else x for x in SLOW_TESTS]) +TESTS.extend( + [ + pytest.param(x[0], x[1], marks=pytest.mark.slow()) + if not isinstance(x[0], tuple) + else x + for x in SLOW_TESTS + ] +) @pytest.mark.parametrize("text,expected_tokens", TESTS) diff --git a/spacy/tests/lang/id/test_prefix_suffix_infix.py b/spacy/tests/lang/id/test_prefix_suffix_infix.py index e86a98ee3..2a81dab01 100644 --- a/spacy/tests/lang/id/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/id/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/id/test_text.py b/spacy/tests/lang/id/test_text.py index 915d268ae..ed6487b68 100644 --- a/spacy/tests/lang/id/test_text.py +++ b/spacy/tests/lang/id/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.id.lex_attrs import like_num diff --git a/spacy/tests/lang/it/test_prefix_suffix_infix.py b/spacy/tests/lang/it/test_prefix_suffix_infix.py index f84351fd7..46f66b5e6 100644 --- a/spacy/tests/lang/it/test_prefix_suffix_infix.py +++ 
b/spacy/tests/lang/it/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index cfff0fcfe..4cb3110b3 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ad8bfaa00..481f346bb 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py index 42c306c11..7782ca4bc 100644 --- a/spacy/tests/lang/ko/test_lemmatization.py +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index b8fe7959c..eac309857 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest # fmt: off diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py index 7ca2394b7..5b5005ae7 100644 --- a/spacy/tests/lang/lb/test_exceptions.py +++ b/spacy/tests/lang/lb/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/lb/test_prefix_suffix_infix.py b/spacy/tests/lang/lb/test_prefix_suffix_infix.py index d85f932be..3958d1543 100644 --- a/spacy/tests/lang/lb/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/lb/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/lb/test_text.py b/spacy/tests/lang/lb/test_text.py index 36464b379..b0ba76b6b 100644 --- a/spacy/tests/lang/lb/test_text.py +++ b/spacy/tests/lang/lb/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py index cac32aa4d..8d9201cd9 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/nb/test_tokenizer.py b/spacy/tests/lang/nb/test_tokenizer.py index f72d310e8..2da6e8d40 100644 --- a/spacy/tests/lang/nb/test_tokenizer.py +++ b/spacy/tests/lang/nb/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/nl/test_text.py b/spacy/tests/lang/nl/test_text.py index 4045b1c39..8bc72cc6d 100644 --- a/spacy/tests/lang/nl/test_text.py +++ b/spacy/tests/lang/nl/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.nl.lex_attrs import like_num diff --git a/spacy/tests/lang/pl/test_text.py b/spacy/tests/lang/pl/test_text.py index ec9b18084..e8654a498 100644 --- a/spacy/tests/lang/pl/test_text.py +++ b/spacy/tests/lang/pl/test_text.py @@ -1,9 +1,4 @@ -# coding: utf-8 """Words like numbers are recognized correctly.""" - - -from __future__ import 
unicode_literals - import pytest diff --git a/spacy/tests/lang/pl/test_tokenizer.py b/spacy/tests/lang/pl/test_tokenizer.py index 9d0034589..a04b4fdcb 100644 --- a/spacy/tests/lang/pl/test_tokenizer.py +++ b/spacy/tests/lang/pl/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest DOT_TESTS = [ diff --git a/spacy/tests/lang/pt/test_text.py b/spacy/tests/lang/pt/test_text.py index 39dfff2c1..3a9162b80 100644 --- a/spacy/tests/lang/pt/test_text.py +++ b/spacy/tests/lang/pt/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.pt.lex_attrs import like_num diff --git a/spacy/tests/lang/ro/test_tokenizer.py b/spacy/tests/lang/ro/test_tokenizer.py index a327174e5..64c072470 100644 --- a/spacy/tests/lang/ro/test_tokenizer.py +++ b/spacy/tests/lang/ro/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ru/test_exceptions.py b/spacy/tests/lang/ru/test_exceptions.py index a8f0c3429..4fb417df8 100644 --- a/spacy/tests/lang/ru/test_exceptions.py +++ b/spacy/tests/lang/ru/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index b228fded8..40dcf4cf8 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ...util import get_doc diff --git a/spacy/tests/lang/ru/test_text.py b/spacy/tests/lang/ru/test_text.py index c5bff6973..b0eaf66bb 100644 --- a/spacy/tests/lang/ru/test_text.py +++ b/spacy/tests/lang/ru/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.ru.lex_attrs import like_num diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py index 5507f9f09..1cfdc50ee 100644 --- a/spacy/tests/lang/ru/test_tokenizer.py +++ b/spacy/tests/lang/ru/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest @@ -80,7 +77,6 @@ def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text): assert tokens[0].text == "'" -@pytest.mark.xfail @pytest.mark.parametrize("text", ["Тест''"]) def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text): tokens = ru_tokenizer(text) diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py index 285e99996..fa92e5e2d 100644 --- a/spacy/tests/lang/sr/test_exceptions.py +++ b/spacy/tests/lang/sr/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sr/test_tokenizer.py b/spacy/tests/lang/sr/test_tokenizer.py index c4672b3ef..fdcf790d8 100644 --- a/spacy/tests/lang/sr/test_tokenizer.py +++ b/spacy/tests/lang/sr/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest @@ -80,7 +77,6 @@ def test_sr_tokenizer_splits_open_appostrophe(sr_tokenizer, text): assert tokens[0].text == "'" -@pytest.mark.xfail @pytest.mark.parametrize("text", ["Тест''"]) def test_sr_tokenizer_splits_double_end_quote(sr_tokenizer, text): tokens = sr_tokenizer(text) diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py index c977a4183..5d3acf3d5 100644 --- 
a/spacy/tests/lang/sv/test_exceptions.py +++ b/spacy/tests/lang/sv/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index ac7c066ba..ad335c317 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ...util import get_doc diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py index f3fdd9a9e..bbb0ff415 100644 --- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_text.py b/spacy/tests/lang/sv/test_text.py index 9ea1851ae..1e26c45bc 100644 --- a/spacy/tests/lang/sv/test_text.py +++ b/spacy/tests/lang/sv/test_text.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - def test_sv_tokenizer_handles_long_text(sv_tokenizer): text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön, höet var uppställt i stackar nere vid den gröna ängen, och där gick storken på sina långa, diff --git a/spacy/tests/lang/sv/test_tokenizer.py b/spacy/tests/lang/sv/test_tokenizer.py index 894b5aa6a..8871f4414 100644 --- a/spacy/tests/lang/sv/test_tokenizer.py +++ b/spacy/tests/lang/sv/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 4bb5aac70..b39109455 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 5c701fc22..de1871e64 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class diff --git a/spacy/tests/lang/th/test_tokenizer.py b/spacy/tests/lang/th/test_tokenizer.py index 265c7753d..1e1ba52dc 100644 --- a/spacy/tests/lang/th/test_tokenizer.py +++ b/spacy/tests/lang/th/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/tt/test_tokenizer.py b/spacy/tests/lang/tt/test_tokenizer.py index f6c68a401..246d2824d 100644 --- a/spacy/tests/lang/tt/test_tokenizer.py +++ b/spacy/tests/lang/tt/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/uk/test_tokenizer.py b/spacy/tests/lang/uk/test_tokenizer.py index f744b32b0..eb647a041 100644 --- a/spacy/tests/lang/uk/test_tokenizer.py +++ b/spacy/tests/lang/uk/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/uk/test_tokenizer_exc.py b/spacy/tests/lang/uk/test_tokenizer_exc.py index 328e1d287..4fb4a6b31 100644 --- a/spacy/tests/lang/uk/test_tokenizer_exc.py +++ 
b/spacy/tests/lang/uk/test_tokenizer_exc.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ur/test_prefix_suffix_infix.py b/spacy/tests/lang/ur/test_prefix_suffix_infix.py index de11c9b34..e9f3272f4 100644 --- a/spacy/tests/lang/ur/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ur/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ur/test_text.py b/spacy/tests/lang/ur/test_text.py index 546e79182..5da831cf8 100644 --- a/spacy/tests/lang/ur/test_text.py +++ b/spacy/tests/lang/ur/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/yo/test_text.py b/spacy/tests/lang/yo/test_text.py index ce6408b67..48b689f3d 100644 --- a/spacy/tests/lang/yo/test_text.py +++ b/spacy/tests/lang/yo/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.yo.lex_attrs import like_num diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py index 235f597a5..d9a65732e 100644 --- a/spacy/tests/lang/zh/test_text.py +++ b/spacy/tests/lang/zh/test_text.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - import pytest diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index 36d94beb5..f71785337 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e4584d03a..3900f1e68 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from mock import Mock @@ -182,7 +179,7 @@ def test_matcher_match_one_plus(matcher): doc = Doc(control.vocab, words=["Philippe", "Philippe"]) m = control(doc) assert len(m) == 2 - pattern = [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}] + pattern = [{"ORTH": "Philippe"}, {"ORTH": "Philippe", "OP": "+"}] matcher.add("KleenePhilippe", [pattern]) m = matcher(doc) assert len(m) == 1 diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 240ace537..a2b2cd83f 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re @@ -9,18 +6,18 @@ from spacy.matcher import Matcher from spacy.tokens import Doc, Span -pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}] -pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A", "OP": "1"}] -pattern3 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "1"}] +pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}] +pattern2 = [{"ORTH": "A"}, {"ORTH": "A"}] +pattern3 = [{"ORTH": "A"}, {"ORTH": "A"}] pattern4 = [ - {"ORTH": "B", "OP": "1"}, + {"ORTH": "B"}, {"ORTH": "A", "OP": "*"}, - {"ORTH": "B", "OP": "1"}, + {"ORTH": "B"}, ] pattern5 = [ {"ORTH": "B", "OP": "*"}, {"ORTH": "A", "OP": "*"}, - {"ORTH": "B", "OP": "1"}, + {"ORTH": "B"}, ] re_pattern1 = "AA*" diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 2db2f9eb3..ade724d05 100644 
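The matcher-pattern hunks above drop "OP": "1" from token patterns: a pattern with no OP already matches exactly one token, and the remaining operators keep their regex-quantifier meanings ("?" optional, "+" one or more, "*" zero or more, "!" negation). A small equivalence sketch:

    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    matcher = Matcher(nlp.vocab)
    # Equivalent to the regex "AA*": exactly one "A", then any number of further "A"s.
    matcher.add("AS", [[{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}]])
    doc = nlp("A A A")
    assert matcher(doc)  # (match_id, start, end) triples, one per matching span

Relatedly, test_pattern_validation below switches from the old JSON-schema validator to spacy.schemas.validate_token_pattern, which returns a plain list of error messages, so the test simply compares len(errors) against the expected count.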
--- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -1,11 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.matcher import Matcher -from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA from spacy.errors import MatchPatternError -from spacy.util import get_json_validator, validate_json +from spacy.schemas import validate_token_pattern # (pattern, num errors with validation, num errors identified with minimal # checks) @@ -18,12 +14,12 @@ TEST_PATTERNS = [ ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), ([1, 2, 3], 3, 1), # Bad patterns flagged outside of Matcher - ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0), + ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) # Bad patterns not flagged with minimal checks ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0), - ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0), - ([{"LENGTH": {"VALUE": 5}}], 1, 0), - ([{"TEXT": {"VALUE": "foo"}}], 1, 0), + ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 4, 0), # prev: (2, 0) + ([{"LENGTH": {"VALUE": 5}}], 2, 0), # prev: (1, 0) + ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0) ([{"IS_DIGIT": -1}], 1, 0), ([{"ORTH": -1}], 1, 0), # Good patterns @@ -34,15 +30,9 @@ TEST_PATTERNS = [ ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0), ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0), ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0), + ([{"orth": "foo"}], 0, 0), # prev: xfail ] -XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)] - - -@pytest.fixture -def validator(): - return get_json_validator(TOKEN_PATTERN_SCHEMA) - @pytest.mark.parametrize( "pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]] @@ -54,15 +44,8 @@ def test_matcher_pattern_validation(en_vocab, pattern): @pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS) -def test_pattern_validation(validator, pattern, n_errors, _): - errors = validate_json(pattern, validator) - assert len(errors) == n_errors - - -@pytest.mark.xfail -@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS) -def test_xfail_pattern_validation(validator, pattern, n_errors, _): - errors = validate_json(pattern, validator) +def test_pattern_validation(pattern, n_errors, _): + errors = validate_token_pattern(pattern) assert len(errors) == n_errors diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 7a6585e06..23cd80d1d 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from mock import Mock from spacy.matcher import PhraseMatcher diff --git a/spacy/tests/morphology/test_morph_converters.py b/spacy/tests/morphology/test_morph_converters.py new file mode 100644 index 000000000..9486cad45 --- /dev/null +++ b/spacy/tests/morphology/test_morph_converters.py @@ -0,0 +1,25 @@ +from spacy.morphology import Morphology + + +def test_feats_converters(): + feats = "Case=dat,gen|Number=sing" + feats_dict = {"Case": "dat,gen", "Number": "sing"} + feats_list = feats.split(Morphology.FEATURE_SEP) + + # simple conversions + assert Morphology.list_to_feats(feats_list) == feats + assert Morphology.dict_to_feats(feats_dict) == feats + assert Morphology.feats_to_dict(feats) == feats_dict + + # roundtrips + assert Morphology.dict_to_feats(Morphology.feats_to_dict(feats)) == 
feats + assert Morphology.feats_to_dict(Morphology.dict_to_feats(feats_dict)) == feats_dict + + # unsorted input is normalized + unsorted_feats = "Number=sing|Case=gen,dat" + unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"} + unsorted_feats_list = unsorted_feats.split(Morphology.FEATURE_SEP) + assert Morphology.feats_to_dict(unsorted_feats) == feats_dict + assert Morphology.dict_to_feats(unsorted_feats_dict) == feats + assert Morphology.list_to_feats(unsorted_feats_list) == feats + assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index 41f807143..f644a5867 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.morphology import Morphology from spacy.strings import StringStore, get_string_id @@ -19,32 +16,37 @@ def test_init(morphology): def test_add_morphology_with_string_names(morphology): - morphology.add({"Case_gen", "Number_sing"}) + morphology.add({"Case": "gen", "Number": "sing"}) def test_add_morphology_with_int_ids(morphology): - morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")}) + morphology.strings.add("Case") + morphology.strings.add("gen") + morphology.strings.add("Number") + morphology.strings.add("sing") + morphology.add( + { + get_string_id("Case"): get_string_id("gen"), + get_string_id("Number"): get_string_id("sing"), + } + ) def test_add_morphology_with_mix_strings_and_ints(morphology): - morphology.add({get_string_id("PunctSide_ini"), "VerbType_aux"}) + morphology.strings.add("PunctSide") + morphology.strings.add("ini") + morphology.add( + {get_string_id("PunctSide"): get_string_id("ini"), "VerbType": "aux"} + ) def test_morphology_tags_hash_distinctly(morphology): - tag1 = morphology.add({"PunctSide_ini", "VerbType_aux"}) - tag2 = morphology.add({"Case_gen", "Number_sing"}) + tag1 = morphology.add({"PunctSide": "ini", "VerbType": "aux"}) + tag2 = morphology.add({"Case": "gen", "Number": "sing"}) assert tag1 != tag2 def test_morphology_tags_hash_independent_of_order(morphology): - tag1 = morphology.add({"Case_gen", "Number_sing"}) - tag2 = morphology.add({"Number_sing", "Case_gen"}) + tag1 = morphology.add({"Case": "gen", "Number": "sing"}) + tag2 = morphology.add({"Number": "sing", "Case": "gen"}) assert tag1 == tag2 - - -def test_update_morphology_tag(morphology): - tag1 = morphology.add({"Case_gen"}) - tag2 = morphology.update(tag1, {"Number_sing"}) - assert tag1 != tag2 - tag3 = morphology.add({"Number_sing", "Case_gen"}) - assert tag2 == tag3 diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 4ab9c1e70..fe847a6ae 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,9 +1,5 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest -from thinc.neural.optimizers import Adam -from thinc.neural.ops import NumpyOps +from thinc.api import Adam, NumpyOps from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab @@ -31,27 +27,27 @@ def _train_parser(parser): fix_random_seed(1) parser.add_label("left") parser.begin_training([], **parser.cfg) - sgd = Adam(NumpyOps(), 0.001) + sgd = Adam(0.001, ops=NumpyOps()) for i in range(5): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) gold = GoldParse(doc, heads=[1,
1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) - parser.update([doc], [gold], sgd=sgd, losses=losses) + parser.update((doc, gold), sgd=sgd, losses=losses) return parser def test_add_label(parser): parser = _train_parser(parser) parser.add_label("right") - sgd = Adam(NumpyOps(), 0.001) - for i in range(10): + sgd = Adam(0.001, ops=NumpyOps()) + for i in range(100): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) gold = GoldParse( doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"] ) - parser.update([doc], [gold], sgd=sgd, losses=losses) + parser.update((doc, gold), sgd=sgd, losses=losses) doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert doc[0].dep_ == "right" diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 41b7a4861..dd593f7d3 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.pipeline import DependencyParser @@ -130,18 +127,25 @@ annot_tuples = [ def test_get_oracle_actions(): + ids, words, tags, heads, deps, ents = [], [], [], [], [], [] + for id_, word, tag, head, dep, ent in annot_tuples: + ids.append(id_) + words.append(word) + tags.append(tag) + heads.append(head) + deps.append(dep) + ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) parser = DependencyParser(doc.vocab) parser.moves.add_action(0, "") parser.moves.add_action(1, "") parser.moves.add_action(1, "") parser.moves.add_action(4, "ROOT") - for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples): + for i, (head, dep) in enumerate(zip(heads, deps)): if head > i: parser.moves.add_action(2, dep) elif head < i: parser.moves.add_action(3, dep) - ids, words, tags, heads, deps, ents = zip(*annot_tuples) heads, deps = projectivize(heads, deps) gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps) parser.moves.preprocess_gold(gold) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 8329391ca..9a4d21a8d 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.en import English @@ -10,6 +7,11 @@ from spacy.syntax.ner import BiluoPushDown from spacy.gold import GoldParse from spacy.tokens import Doc +TRAIN_DATA = [ + ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), + ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), +] + @pytest.fixture def vocab(): @@ -266,7 +268,7 @@ def test_change_number_features(): nlp.add_pipe(ner) ner.add_label("PERSON") nlp.begin_training() - assert ner.model.lower.nF == ner.nr_feature + assert ner.model.lower.get_dim("nF") == ner.nr_feature # Test we can change it nlp = English() ner = nlp.create_pipe("ner") @@ -275,11 +277,36 @@ def test_change_number_features(): nlp.begin_training( component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}} ) - assert ner.model.lower.nF == 3 + assert ner.model.lower.get_dim("nF") == 3 # Test the model runs nlp("hello world") +def test_overfitting(): + # Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly + nlp = English() + ner = nlp.create_pipe("ner") + for _, annotations in TRAIN_DATA: + for ent in annotations.get("entities"): + ner.add_label(ent[2]) + nlp.add_pipe(ner) + 
optimizer = nlp.begin_training() + + for i in range(50): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + assert losses["ner"] < 0.00001 + + # test the trained model + test_text = "I like London." + doc = nlp(test_text) + ents = doc.ents + + assert len(ents) == 1 + assert ents[0].text == "London" + assert ents[0].label_ == "LOC" + + + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 062c76ae3..2470982d3 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,8 +1,5 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest -from spacy._ml import Tok2Vec +from spacy.ml.component_models import Tok2Vec from spacy.vocab import Vocab from spacy.syntax.arc_eager import ArcEager from spacy.syntax.nn_parser import Parser @@ -23,7 +20,9 @@ def arc_eager(vocab): @pytest.fixture def tok2vec(): - return Tok2Vec(8, 100) + tok2vec = Tok2Vec(8, 100) + tok2vec.initialize() + return tok2vec @pytest.fixture @@ -33,7 +32,7 @@ def parser(vocab, arc_eager): @pytest.fixture def model(arc_eager, tok2vec): - return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0] + return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.get_dim("nO"))[0] @pytest.fixture @@ -56,7 +55,7 @@ def test_build_model(parser): def test_predict_doc(parser, tok2vec, model, doc): - doc.tensor = tok2vec([doc])[0] + doc.tensor = tok2vec.predict([doc])[0] parser.model = model parser(doc) @@ -64,10 +63,11 @@ def test_predict_doc(parser, tok2vec, model, doc): def test_update_doc(parser, model, doc, gold): parser.model = model - def optimize(weights, gradient, key=None): + def optimize(key, weights, gradient): weights -= 0.001 * gradient + return weights, gradient - parser.update([doc], [gold], sgd=optimize) + parser.update((doc, gold), sgd=optimize) @pytest.mark.xfail @@ -83,4 +83,4 @@ def test_update_doc_beam(parser, model, doc, gold): def optimize(weights, gradient, key=None): weights -= 0.001 * gradient - parser.update_beam([doc], [gold], sgd=optimize) + parser.update_beam((doc, gold), sgd=optimize) diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index 9dca99255..24997e47c 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import numpy from spacy.vocab import Vocab diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 8bf8111c1..86d9a0180 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc from spacy.syntax.nonproj import is_nonproj_tree diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index fb5301718..1d3f522c9 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,10 +1,25 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest +from spacy.lang.en import English from ..util import get_doc, apply_transition_sequence +TRAIN_DATA = [ + ( + "They trade mortgage-backed securities.", + { + "heads": [1, 1, 4, 4, 5, 1, 1], + "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], + }, + ), + ( + "I like London and Berlin.", + { + "heads": [1, 1, 1, 2, 2, 1], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + }, + ), +] + def test_parser_root(en_tokenizer): text = "i don't have other assistance" @@ -165,3 +180,27 @@ def test_parser_set_sent_starts(en_vocab): for sent in doc.sents: for token in sent: assert token.head in sent + + +def test_overfitting(): + # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly + nlp = English() + parser = nlp.create_pipe("parser") + for _, annotations in TRAIN_DATA: + for dep in annotations.get("deps", []): + parser.add_label(dep) + nlp.add_pipe(parser) + optimizer = nlp.begin_training() + + for i in range(50): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + assert losses["parser"] < 0.00001 + + # test the trained model + test_text = "I like securities." + doc = nlp(test_text) + + assert doc[0].dep_ == "nsubj" + assert doc[2].dep_ == "dobj" + assert doc[3].dep_ == "punct" diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index eb206458e..ed95718f1 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ..util import get_doc diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 70beb2f60..c6c1240a8 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -1,9 +1,5 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest -from thinc.neural.optimizers import Adam -from thinc.neural.ops import NumpyOps +from thinc.api import Adam from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab @@ -24,13 +20,13 @@ def parser(vocab): # parser.add_label('right') parser.add_label("left") parser.begin_training([], **parser.cfg) - sgd = Adam(NumpyOps(), 0.001) + sgd = Adam(0.001) for i in range(10): losses = {} doc = Doc(vocab, words=["a", "b", "c", "d"]) gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) - parser.update([doc], [gold], sgd=sgd, losses=losses) + parser.update((doc, gold), sgd=sgd, losses=losses) return parser diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py index 945173faf..59ae4e629 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.tokens.doc import Doc diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index 198f11bcd..5c246538c 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -1,11 +1,7 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy.language from spacy.language import Language, component from spacy.analysis import print_summary, validate_attrs from spacy.analysis import get_assigns_for_attr, get_requires_for_attr -from spacy.compat import is_python2 from mock import Mock, ANY import pytest @@ -17,8 +13,7 @@ def test_component_decorator_function(): return doc assert test_component.name == "test" - if not is_python2: - assert test_component.__doc__ == "docstring" + assert test_component.__doc__ == "docstring" assert test_component("foo") == "foo" @@ -45,13 +40,12 @@ def test_component_decorator_class(): assert 
test_component("foo") == "foo" assert hasattr(test_component, "custom") assert test_component.custom("bar") == "bar" - if not is_python2: - assert TestComponent.__doc__ == "docstring1" - assert TestComponent.__call__.__doc__ == "docstring2" - assert TestComponent.custom.__doc__ == "docstring3" - assert test_component.__doc__ == "docstring1" - assert test_component.__call__.__doc__ == "docstring2" - assert test_component.custom.__doc__ == "docstring3" + assert TestComponent.__doc__ == "docstring1" + assert TestComponent.__call__.__doc__ == "docstring2" + assert TestComponent.custom.__doc__ == "docstring3" + assert test_component.__doc__ == "docstring1" + assert test_component.__call__.__doc__ == "docstring2" + assert test_component.custom.__doc__ == "docstring3" def test_component_decorator_assigns(): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 8023f72a6..9ff5f8194 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.kb import KnowledgeBase diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 3b46baa9b..b04569e22 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Span from spacy.language import Language @@ -152,10 +149,5 @@ def test_entity_ruler_validate(nlp): def test_entity_ruler_properties(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - assert sorted(ruler.labels) == sorted([ - "HELLO", - "BYE", - "COMPLEX", - "TECH_ORG" - ]) + assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"]) assert sorted(ruler.ent_ids) == ["a1", "a2"] diff --git a/spacy/tests/pipeline/test_factories.py b/spacy/tests/pipeline/test_factories.py index 5efcc319a..0a9a4d3c9 100644 --- a/spacy/tests/pipeline/test_factories.py +++ b/spacy/tests/pipeline/test_factories.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language from spacy.tokens import Span diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index 5b5fcd2fd..ca983267f 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.pipeline.functions import merge_subtokens from ..util import get_doc diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 27fb57b18..e2fb02a2a 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language @@ -118,7 +115,7 @@ def test_disable_pipes_list_arg(nlp): @pytest.mark.parametrize("n_pipes", [100]) def test_add_lots_of_pipes(nlp, n_pipes): for i in range(n_pipes): - nlp.add_pipe(lambda doc: doc, name="pipe_%d" % i) + nlp.add_pipe(lambda doc: doc, name=f"pipe_{i}") assert len(nlp.pipe_names) == n_pipes diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index d690958cc..0432b00e0 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ 
b/spacy/tests/pipeline/test_sentencizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import spacy from spacy.pipeline import Sentencizer @@ -27,6 +24,12 @@ def test_sentencizer_pipe(): sent_starts = [t.is_sent_start for t in doc] assert sent_starts == [True, False, True, False, False, False, False] assert len(list(doc.sents)) == 2 + for ex in nlp.pipe(texts, as_example=True): + doc = ex.doc + assert doc.is_sentenced + sent_starts = [t.is_sent_start for t in doc] + assert sent_starts == [True, False, True, False, False, False, False] + assert len(list(doc.sents)) == 2 def test_sentencizer_empty_docs(): diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index a5bda9090..366cd4f1a 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language @@ -11,3 +8,35 @@ def test_label_types(): nlp.get_pipe("tagger").add_label("A") with pytest.raises(ValueError): nlp.get_pipe("tagger").add_label(9) + + +TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}} + +TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), +] + + +def test_overfitting(): + # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly + nlp = Language() + tagger = nlp.create_pipe("tagger") + for tag, values in TAG_MAP.items(): + tagger.add_label(tag, values) + nlp.add_pipe(tagger) + optimizer = nlp.begin_training() + + for i in range(50): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + assert losses["tagger"] < 0.00001 + + # test the trained model + test_text = "I like blue eggs" + doc = nlp(test_text) + + assert doc[0].tag_ == "N" + assert doc[1].tag_ == "V" + assert doc[2].tag_ == "J" + assert doc[3].tag_ == "N" diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index b7db85056..558d09e40 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import random import numpy.random @@ -9,6 +6,11 @@ from spacy.pipeline import TextCategorizer from spacy.tokens import Doc from spacy.gold import GoldParse +TRAIN_DATA = [ + ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), + ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), +] + @pytest.mark.skip(reason="Test is flakey when run with others") def test_simple_train(): @@ -24,7 +26,7 @@ def test_simple_train(): ("bbbbbbbbb", 0.0), ("aaaaaa", 1), ]: - nlp.update([text], [{"cats": {"answer": answer}}]) + nlp.update((text, {"cats": {"answer": answer}})) doc = nlp("aaa") assert "answer" in doc.cats assert doc.cats["answer"] >= 0.5 @@ -70,3 +72,26 @@ def test_label_types(): nlp.get_pipe("textcat").add_label("answer") with pytest.raises(ValueError): nlp.get_pipe("textcat").add_label(9) + + +def test_overfitting(): + # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly + nlp = Language() + textcat = nlp.create_pipe("textcat") + for _, annotations in TRAIN_DATA: + for label, value in annotations.get("cats").items(): + textcat.add_label(label) + nlp.add_pipe(textcat) + optimizer = nlp.begin_training() + + for i in range(50): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + 
assert losses["textcat"] < 0.00001 + + # test the trained model + test_text = "I am happy." + doc = nlp(test_text) + cats = doc.cats + assert cats["POSITIVE"] > 0.9 + assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001) diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 6d88d68c2..bfca72853 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -1,11 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import random from spacy.matcher import Matcher from spacy.attrs import IS_PUNCT, ORTH, LOWER -from spacy.symbols import POS, VERB, VerbForm_inf +from spacy.symbols import POS, VERB from spacy.vocab import Vocab from spacy.language import Language from spacy.lemmatizer import Lemmatizer @@ -167,7 +164,7 @@ def test_issue590(en_vocab): def test_issue595(): """Test lemmatization of base forms""" words = ["Do", "n't", "feed", "the", "dog"] - tag_map = {"VB": {POS: VERB, VerbForm_inf: True}} + tag_map = {"VB": {POS: VERB, "VerbForm": "inf"}} lookups = Lookups() lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) lookups.add_table("lemma_index", {"verb": {}}) @@ -451,7 +448,7 @@ def test_issue999(train_data): for itn in range(100): random.shuffle(TRAIN_DATA) for raw_text, entity_offsets in TRAIN_DATA: - nlp.update([raw_text], [{"entities": entity_offsets}]) + nlp.update((raw_text, {"entities": entity_offsets})) with make_tempdir() as model_dir: nlp.to_disk(model_dir) diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index 924c5aa3e..aaff951e5 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from spacy.tokens import Doc @@ -11,7 +8,7 @@ from spacy.matcher import Matcher from spacy.tokenizer import Tokenizer from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups -from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part +from spacy.symbols import ORTH, LEMMA, POS, VERB def test_issue1061(): @@ -91,7 +88,7 @@ def test_issue1375(): def test_issue1387(): - tag_map = {"VBG": {POS: VERB, VerbForm_part: True}} + tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}} lookups = Lookups() lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index e498417d1..2bfdbd7c3 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -1,10 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import gc import numpy import copy + +from spacy.gold import Example from spacy.lang.en import English from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.lex_attrs import is_stop @@ -270,9 +269,11 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): ner = EntityRecognizer(Vocab()) - entry = ([0], ["word"], ["tag"], [0], ["dep"], [label]) - gold_parses = [(None, [(entry, None)])] - ner.moves.get_actions(gold_parses=gold_parses) + example = Example(doc=None) + example.set_token_annotation( + ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label] + ) + 
ner.moves.get_actions(gold_parses=[example]) def test_issue1971(en_vocab): diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index e95c1a9b9..2c25b6d73 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import numpy from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 73ff7376a..c4f5e8599 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy import displacy from spacy.lang.en import English @@ -11,7 +8,7 @@ from spacy.matcher import Matcher from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.compat import pickle -from spacy._ml import link_vectors_to_models +from spacy.util import link_vectors_to_models import numpy import random @@ -157,7 +154,7 @@ def test_issue2800(): losses = {} random.shuffle(train_data) for statement, entities in train_data: - nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5) + nlp.update((statement, entities), sgd=optimizer, losses=losses, drop=0.5) def test_issue2822(it_tokenizer): diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index d05759c31..cc893e472 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.en import English from spacy.lang.de import German @@ -9,11 +6,10 @@ from spacy.matcher import Matcher, PhraseMatcher from spacy.tokens import Doc from spacy.vocab import Vocab from spacy.attrs import ENT_IOB, ENT_TYPE -from spacy.compat import pickle, is_python2, unescape_unicode +from spacy.compat import pickle from spacy import displacy from spacy.util import decaying import numpy -import re from spacy.vectors import Vectors from ..util import get_doc @@ -211,73 +207,6 @@ def test_issue3345(): assert ner.moves.is_valid(state, "B-GPE") -if is_python2: - # If we have this test in Python 3, pytest chokes, as it can't print the - # string above in the xpass message. - prefix_search = ( - b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" - b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" 
- b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" - b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" - b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" - b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" - b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" - b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" - b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" - b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" - b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" - b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" - b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" - b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" - b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" - b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" - b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" - b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" - b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" - b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" - b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" - b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" - b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" - b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" - b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" - b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" - b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" - b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" - b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" - b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" - b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" - b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" - b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" - b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0" - b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77" - b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137" - b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E" - b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877" - b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45" - b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129" - b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C" - b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245" - b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A" - b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86" - b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0" - b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1" - b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6" - b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250" - b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400" - b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700" - b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810" - b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890" - 
b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940" - b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2" - b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF" - b"\\U0001FA60-\\U0001FA6D]" - ) - - def test_issue3356(): - pattern = re.compile(unescape_unicode(prefix_search.decode("utf8"))) - assert not pattern.search("hello") - - def test_issue3410(): texts = ["Hello world", "This is a test"] nlp = English() diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py index 35731ac12..3d8ee9922 100644 --- a/spacy/tests/regression/test_issue3521.py +++ b/spacy/tests/regression/test_issue3521.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py index c6f513730..aa77028fb 100644 --- a/spacy/tests/regression/test_issue3526.py +++ b/spacy/tests/regression/test_issue3526.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Span from spacy.language import Language diff --git a/spacy/tests/regression/test_issue3531.py b/spacy/tests/regression/test_issue3531.py index 7b9d0bd2a..4c65a5bfe 100644 --- a/spacy/tests/regression/test_issue3531.py +++ b/spacy/tests/regression/test_issue3531.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy import displacy diff --git a/spacy/tests/regression/test_issue3540.py b/spacy/tests/regression/test_issue3540.py index 19d89c797..be9e04b0b 100644 --- a/spacy/tests/regression/test_issue3540.py +++ b/spacy/tests/regression/test_issue3540.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.tokens import Doc import numpy as np diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py index 587b3a857..b3af59c2e 100644 --- a/spacy/tests/regression/test_issue3549.py +++ b/spacy/tests/regression/test_issue3549.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.matcher import Matcher from spacy.errors import MatchPatternError diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py index 8444f11f2..de047bcbc 100644 --- a/spacy/tests/regression/test_issue3555.py +++ b/spacy/tests/regression/test_issue3555.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc, Token from spacy.matcher import Matcher diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py index 3c4836264..120cea1d2 100644 --- a/spacy/tests/regression/test_issue3611.py +++ b/spacy/tests/regression/test_issue3611.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy from spacy.util import minibatch, compounding @@ -35,17 +32,12 @@ def test_issue3611(): # training the network with nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]): - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(X=x_train, Y=y_train) for i in range(3): losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) nlp.update( - docs=texts, - golds=annotations, - sgd=optimizer, - drop=0.1, - losses=losses, + examples=batch, sgd=optimizer, drop=0.1, losses=losses, ) diff --git 
a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py index d935db17f..51561b3ac 100644 --- a/spacy/tests/regression/test_issue3625.py +++ b/spacy/tests/regression/test_issue3625.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.hi import Hindi diff --git a/spacy/tests/regression/test_issue3803.py b/spacy/tests/regression/test_issue3803.py index 37d15a5cf..ab5250edf 100644 --- a/spacy/tests/regression/test_issue3803.py +++ b/spacy/tests/regression/test_issue3803.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.es import Spanish diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py index fe722a681..27b1f5f29 100644 --- a/spacy/tests/regression/test_issue3839.py +++ b/spacy/tests/regression/test_issue3839.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py index 62e8eabd6..0a851e869 100644 --- a/spacy/tests/regression/test_issue3869.py +++ b/spacy/tests/regression/test_issue3869.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.attrs import IS_ALPHA from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py index 5cd245231..8500c09aa 100644 --- a/spacy/tests/regression/test_issue3879.py +++ b/spacy/tests/regression/test_issue3879.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py index c060473f5..6e8ab6f43 100644 --- a/spacy/tests/regression/test_issue3880.py +++ b/spacy/tests/regression/test_issue3880.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English import pytest diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py index 1b2dcea25..fa616db1d 100644 --- a/spacy/tests/regression/test_issue3882.py +++ b/spacy/tests/regression/test_issue3882.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.displacy import parse_deps from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py index 33230112f..6e4c9eeaa 100644 --- a/spacy/tests/regression/test_issue3951.py +++ b/spacy/tests/regression/test_issue3951.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3959.py b/spacy/tests/regression/test_issue3959.py index c1f7fe100..7db28a31f 100644 --- a/spacy/tests/regression/test_issue3959.py +++ b/spacy/tests/regression/test_issue3959.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from ..util import make_tempdir diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py index ae60fa0fa..971c9b08e 100644 --- a/spacy/tests/regression/test_issue3962.py +++ b/spacy/tests/regression/test_issue3962.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from ..util import get_doc diff 
--git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py index 22b8d486e..fe5388950 100644 --- a/spacy/tests/regression/test_issue3972.py +++ b/spacy/tests/regression/test_issue3972.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import PhraseMatcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py index d075128aa..3ac26d3ab 100644 --- a/spacy/tests/regression/test_issue4002.py +++ b/spacy/tests/regression/test_issue4002.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import PhraseMatcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py index ed219573f..7158d9b21 100644 --- a/spacy/tests/regression/test_issue4030.py +++ b/spacy/tests/regression/test_issue4030.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy from spacy.util import minibatch, compounding @@ -41,13 +38,8 @@ def test_issue4030(): batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) nlp.update( - docs=texts, - golds=annotations, - sgd=optimizer, - drop=0.1, - losses=losses, + examples=batch, sgd=optimizer, drop=0.1, losses=losses, ) # processing of an empty doc should result in 0.0 for all categories diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py index 00a8882d3..6644a8eda 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py index cc84cebf8..c52ded395 100644 --- a/spacy/tests/regression/test_issue4054.py +++ b/spacy/tests/regression/test_issue4054.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.vocab import Vocab import spacy from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py index d288f46c4..4849aa238 100644 --- a/spacy/tests/regression/test_issue4120.py +++ b/spacy/tests/regression/test_issue4120.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue4133.py b/spacy/tests/regression/test_issue4133.py index 93262f8cf..a726806d7 100644 --- a/spacy/tests/regression/test_issue4133.py +++ b/spacy/tests/regression/test_issue4133.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.tokens import Doc from spacy.vocab import Vocab diff --git a/spacy/tests/regression/test_issue4190.py b/spacy/tests/regression/test_issue4190.py index eb4eb8648..97d532d2a 100644 --- a/spacy/tests/regression/test_issue4190.py +++ b/spacy/tests/regression/test_issue4190.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.tokenizer import Tokenizer from spacy import util diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py index ef871bf9f..891f03b30 100644 --- 
a/spacy/tests/regression/test_issue4267.py +++ b/spacy/tests/regression/test_issue4267.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.pipeline import EntityRuler diff --git a/spacy/tests/regression/test_issue4272.py b/spacy/tests/regression/test_issue4272.py index c57704d71..4bac97a44 100644 --- a/spacy/tests/regression/test_issue4272.py +++ b/spacy/tests/regression/test_issue4272.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.el import Greek diff --git a/spacy/tests/regression/test_issue4278.py b/spacy/tests/regression/test_issue4278.py index cb09340ff..ffbc41226 100644 --- a/spacy/tests/regression/test_issue4278.py +++ b/spacy/tests/regression/test_issue4278.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language from spacy.pipeline import Pipe diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index c68f745a7..a3f6f69df 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from collections import defaultdict from spacy.pipeline import EntityRecognizer diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py index d2e27d563..4978e0c8e 100644 --- a/spacy/tests/regression/test_issue4348.py +++ b/spacy/tests/regression/test_issue4348.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.util import minibatch, compounding import pytest @@ -21,5 +18,4 @@ def test_issue4348(): losses = {} batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, losses=losses) + nlp.update(batch, sgd=optimizer, losses=losses) diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py index ab6192744..917847a05 100644 --- a/spacy/tests/regression/test_issue4367.py +++ b/spacy/tests/regression/test_issue4367.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.tokens import DocBin diff --git a/spacy/tests/regression/test_issue4373.py b/spacy/tests/regression/test_issue4373.py index 57d7547da..dbde1624e 100644 --- a/spacy/tests/regression/test_issue4373.py +++ b/spacy/tests/regression/test_issue4373.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher, PhraseMatcher from spacy.vocab import Vocab diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index d3b4bdf9a..80d37b1e6 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import srsly from spacy.gold import GoldCorpus from spacy.lang.en import English @@ -11,15 +8,14 @@ from ..util import make_tempdir def test_issue4402(): nlp = English() with make_tempdir() as tmpdir: - print("temp", tmpdir) json_path = tmpdir / "test4402.json" srsly.write_json(json_path, json_data) corpus = GoldCorpus(str(json_path), str(json_path)) - train_docs = list(corpus.train_docs(nlp, gold_preproc=True, max_length=0)) + train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0)) # assert that the 
data got split into 4 sentences - assert len(train_docs) == 4 + assert len(train_data) == 4 json_data = [ diff --git a/spacy/tests/regression/test_issue4528.py b/spacy/tests/regression/test_issue4528.py index 460449003..6f96c9f2d 100644 --- a/spacy/tests/regression/test_issue4528.py +++ b/spacy/tests/regression/test_issue4528.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.tokens import Doc, DocBin diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py index 381957be6..fa962c053 100644 --- a/spacy/tests/regression/test_issue4529.py +++ b/spacy/tests/regression/test_issue4529.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.gold import GoldParse diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py index 8ec9a0bd1..74bb5de10 100644 --- a/spacy/tests/regression/test_issue4590.py +++ b/spacy/tests/regression/test_issue4590.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from mock import Mock from spacy.matcher import DependencyMatcher from ..util import get_doc diff --git a/spacy/tests/regression/test_issue4651.py b/spacy/tests/regression/test_issue4651.py index eb49f4a38..3f6c1a57c 100644 --- a/spacy/tests/regression/test_issue4651.py +++ b/spacy/tests/regression/test_issue4651.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.pipeline import EntityRuler diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py index 8fa4f9259..149e1431b 100644 --- a/spacy/tests/regression/test_issue4674.py +++ b/spacy/tests/regression/test_issue4674.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.kb import KnowledgeBase from spacy.util import ensure_path diff --git a/spacy/tests/regression/test_issue4707.py b/spacy/tests/regression/test_issue4707.py index e710881d7..d9798ef84 100644 --- a/spacy/tests/regression/test_issue4707.py +++ b/spacy/tests/regression/test_issue4707.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.util import load_model_from_path from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py index 834219773..ddbf6f7a0 100644 --- a/spacy/tests/regression/test_issue4849.py +++ b/spacy/tests/regression/test_issue4849.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.pipeline import EntityRuler @@ -9,11 +6,12 @@ def test_issue4849(): nlp = English() ruler = EntityRuler( - nlp, patterns=[ - {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'}, - {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'}, + nlp, + patterns=[ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, ], - phrase_matcher_attr="LOWER" + phrase_matcher_attr="LOWER", ) nlp.add_pipe(ruler) @@ -27,10 +25,10 @@ def test_issue4849(): count_ents = 0 for doc in nlp.pipe([text], n_process=1): count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert(count_ents == 2) + assert count_ents == 2 # USING 2 PROCESSES count_ents = 0 for doc in nlp.pipe([text], n_process=2): count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert (count_ents 
== 2) + assert count_ents == 2 diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py index 8aea2c3d5..5665d6d0f 100644 --- a/spacy/tests/regression/test_issue4924.py +++ b/spacy/tests/regression/test_issue4924.py @@ -1,16 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest - -import spacy +from spacy.language import Language -@pytest.fixture -def nlp(): - return spacy.blank("en") - - -def test_evaluate(nlp): +def test_evaluate(): + nlp = Language() docs_golds = [("", {})] - nlp.evaluate(docs_golds) + with pytest.raises(ValueError): + nlp.evaluate(docs_golds) diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index ef2b1ee89..615bb1cd9 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,13 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import spacy - import pytest - from spacy.lang.en import English from spacy.tokens import Doc, DocBin -from spacy.compat import path2str from ..util import make_tempdir @@ -43,7 +37,7 @@ def test_serialize_doc_roundtrip_disk_str_path(en_vocab): doc = Doc(en_vocab, words=["hello", "world"]) with make_tempdir() as d: file_path = d / "doc" - file_path = path2str(file_path) + file_path = str(file_path) doc.to_disk(file_path) doc_d = Doc(en_vocab).from_disk(file_path) assert doc.to_bytes() == doc_d.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_extension_attrs.py b/spacy/tests/serialize/test_serialize_extension_attrs.py index 45c2e3909..9cfa1a552 100644 --- a/spacy/tests/serialize/test_serialize_extension_attrs.py +++ b/spacy/tests/serialize/test_serialize_extension_attrs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc, Token from spacy.vocab import Vocab @@ -10,9 +7,7 @@ from spacy.vocab import Vocab def doc_w_attrs(en_tokenizer): Doc.set_extension("_test_attr", default=False) Doc.set_extension("_test_prop", getter=lambda doc: len(doc.text)) - Doc.set_extension( - "_test_method", method=lambda doc, arg: "{}{}".format(len(doc.text), arg) - ) + Doc.set_extension("_test_method", method=lambda doc, arg: f"{len(doc.text)}{arg}") doc = en_tokenizer("This is a test.") doc._._test_attr = "test" @@ -28,8 +23,7 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs): assert doc._.has("_test_attr") assert doc._._test_attr == "test" assert doc._._test_prop == len(doc.text) - assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test") - + assert doc._._test_method("test") == f"{len(doc.text)}test" assert doc[0]._._test_token == "t0" assert doc[1]._._test_token == "t1" assert doc[2]._._test_token == "t0" diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index b19c11864..91036a496 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.util import ensure_path from spacy.kb import KnowledgeBase diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index efc5d181c..4089a0d07 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from spacy.language import Language diff --git 
a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index efa7ef625..0ad9bc4d4 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,9 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer -from spacy.pipeline import Tensorizer, TextCategorizer +from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer from ..util import make_tempdir @@ -144,3 +141,10 @@ def test_serialize_pipe_exclude(en_vocab, Parser): parser.to_bytes(cfg=False, exclude=["vocab"]) with pytest.raises(ValueError): get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]), cfg=False) + + +def test_serialize_sentencerecognizer(en_vocab): + sr = SentenceRecognizer(en_vocab) + sr_b = sr.to_bytes() + sr_d = SentenceRecognizer(en_vocab).from_bytes(sr_b) + assert sr.to_bytes() == sr_d.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 9a273980c..f504ed048 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class from spacy.tokenizer import Tokenizer diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 1671845ee..359a0657f 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.strings import StringStore diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py deleted file mode 100644 index d6bbab04e..000000000 --- a/spacy/tests/test_align.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy._align import align, multi_align - - -@pytest.mark.parametrize( - "string1,string2,cost", - [ - ("hello", "hell", 1), - ("rat", "cat", 1), - ("rat", "rat", 0), - ("rat", "catsie", 4), - ("t", "catsie", 5), - ], -) -def test_align_costs(string1, string2, cost): - output_cost, i2j, j2i, matrix = align(string1, string2) - assert output_cost == cost - - -@pytest.mark.parametrize( - "string1,string2,i2j", - [ - ("hello", "hell", [0, 1, 2, 3, -1]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2]), - ("t", "catsie", [2]), - ], -) -def test_align_i2j(string1, string2, i2j): - output_cost, output_i2j, j2i, matrix = align(string1, string2) - assert list(output_i2j) == i2j - - -@pytest.mark.parametrize( - "string1,string2,j2i", - [ - ("hello", "hell", [0, 1, 2, 3]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2, -1, -1, -1]), - ("t", "catsie", [-1, -1, 0, -1, -1, -1]), - ], -) -def test_align_i2j_2(string1, string2, j2i): - output_cost, output_i2j, output_j2i, matrix = align(string1, string2) - assert list(output_j2i) == j2i - - -def test_align_strings(): - words1 = ["hello", "this", "is", "test!"] - words2 = ["hellothis", "is", "test", "!"] - cost, i2j, j2i, matrix = align(words1, words2) - assert cost == 4 - assert list(i2j) == [-1, -1, 1, -1] - assert list(j2i) == [-1, 2, -1, -1] - - -def test_align_many_to_one(): - words1 
= ["a", "b", "c", "d", "e", "f", "g", "h"] - words2 = ["ab", "bc", "e", "fg", "h"] - cost, i2j, j2i, matrix = align(words1, words2) - assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4] - lengths1 = [len(w) for w in words1] - lengths2 = [len(w) for w in words2] - i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2) - assert i2j_multi[0] == 0 - assert i2j_multi[1] == 0 - assert i2j_multi[2] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[5] == 3 - assert i2j_multi[6] == 3 - - assert j2i_multi[0] == 1 - assert j2i_multi[1] == 3 diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py index 77f1af020..31b2a2d2f 100644 --- a/spacy/tests/test_architectures.py +++ b/spacy/tests/test_architectures.py @@ -1,15 +1,12 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy import registry -from thinc.v2v import Affine +from thinc.api import Linear from catalogue import RegistryError @registry.architectures.register("my_test_function") def create_model(nr_in, nr_out): - return Affine(nr_in, nr_out) + return Linear(nr_in, nr_out) def test_get_architecture(): diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 6dce649a9..306adc881 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.en import English @@ -9,7 +6,7 @@ from spacy.cli.pretrain import make_docs def test_cli_converters_conllu2json(): - # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu + # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER", @@ -32,6 +29,74 @@ def test_cli_converters_conllu2json(): assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] +def test_cli_converters_conllu2json_name_ner_map(): + lines = [ + "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", + "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", + "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", + "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", + ] + input_data = "\n".join(lines) + converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) + assert len(converted) == 1 + assert converted[0]["id"] == 0 + assert len(converted[0]["paragraphs"]) == 1 + assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår." 
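+ # NB: no space in "FinnEilertsen" above: token 2 in the input carries SpaceAfter=No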
+ assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 + sent = converted[0]["paragraphs"][0]["sentences"][0] + assert len(sent["tokens"]) == 5 + tokens = sent["tokens"] + assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår", "."] + assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] + assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] + assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] + assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] + + +def test_cli_converters_conllu2json_subtokens(): + # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu + lines = [ + "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", + "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_", + "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER", + "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER", + "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", + ] + input_data = "\n".join(lines) + converted = conllu2json( + input_data, n_sents=1, merge_subtokens=True, append_morphology=True + ) + assert len(converted) == 1 + assert converted[0]["id"] == 0 + assert len(converted[0]["paragraphs"]) == 1 + assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår." + assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 + sent = converted[0]["paragraphs"][0]["sentences"][0] + assert len(sent["tokens"]) == 4 + tokens = sent["tokens"] + assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."] + assert [t["tag"] for t in tokens] == [ + "NOUN__Definite=Ind|Gender=Masc|Number=Sing", + "PROPN_X__Gender=Fem,Masc|Tense=past", + "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin", + "PUNCT", + ] + assert [t["pos"] for t in tokens] == ["NOUN", "PROPN", "VERB", "PUNCT"] + assert [t["morph"] for t in tokens] == [ + "Definite=Ind|Gender=Masc|Number=Sing", + "Gender=Fem,Masc|Tense=past", + "Mood=Ind|Tense=Pres|VerbForm=Fin", + "", + ] + assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] + assert [t["head"] for t in tokens] == [1, 1, 0, -1] + assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] + assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] + + def test_cli_converters_iob2json(): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", @@ -106,7 +171,6 @@ def test_cli_converters_conll_ner2json(): ] input_data = "\n".join(lines) converted = conll_ner2json(input_data, n_sents=10) - print(converted) assert len(converted) == 1 assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 2d1f1bd8f..4436b437f 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy import displacy from spacy.displacy.render import DependencyRenderer @@ -80,10 +77,10 @@ def test_displacy_rtl(): html = displacy.render(doc, page=True, style="dep") assert "direction: rtl" in html assert 'direction="rtl"' in html - assert 'lang="{}"'.format(nlp.lang) in html + assert f'lang="{nlp.lang}"' in html html = displacy.render(doc, page=True, style="ent") assert "direction: rtl" in html - assert 'lang="{}"'.format(nlp.lang) in html + 
 
 
 def test_displacy_render_wrapper(en_vocab):
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index fbdb3155b..7fe8aab73 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,16 +1,98 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
+from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
-from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
-from spacy.gold import GoldCorpus, docs_to_json, align
+from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align
+from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation
 from spacy.lang.en import English
+from spacy.syntax.nonproj import is_nonproj_tree
 from spacy.tokens import Doc
+from spacy.util import compounding, minibatch
 from .util import make_tempdir
 import pytest
 import srsly
 
+
+@pytest.fixture
+def doc():
+    text = "Sarah's sister flew to Silicon Valley via London."
+    tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
+    pos = [
+        "PROPN",
+        "PART",
+        "NOUN",
+        "VERB",
+        "ADP",
+        "PROPN",
+        "PROPN",
+        "ADP",
+        "PROPN",
+        "PUNCT",
+    ]
+    morphs = [
+        "NounType=prop|Number=sing",
+        "Poss=yes",
+        "Number=sing",
+        "Tense=past|VerbForm=fin",
+        "",
+        "NounType=prop|Number=sing",
+        "NounType=prop|Number=sing",
+        "",
+        "NounType=prop|Number=sing",
+        "PunctType=peri",
+    ]
+    # head of '.' is intentionally nonprojective for testing
+    heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
+    deps = [
+        "poss",
+        "case",
+        "nsubj",
+        "ROOT",
+        "prep",
+        "compound",
+        "pobj",
+        "prep",
+        "pobj",
+        "punct",
+    ]
+    lemmas = [
+        "Sarah",
+        "'s",
+        "sister",
+        "fly",
+        "to",
+        "Silicon",
+        "Valley",
+        "via",
+        "London",
+        ".",
+    ]
+    biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
+    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+    nlp = English()
+    doc = nlp(text)
+    for i in range(len(tags)):
+        doc[i].tag_ = tags[i]
+        doc[i].pos_ = pos[i]
+        doc[i].morph_ = morphs[i]
+        doc[i].lemma_ = lemmas[i]
+        doc[i].dep_ = deps[i]
+        doc[i].head = doc[heads[i]]
+    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
+    doc.cats = cats
+    doc.is_tagged = True
+    doc.is_parsed = True
+    return doc
+
+
+@pytest.fixture()
+def merged_dict():
+    return {
+        "ids": [1, 2, 3, 4, 5, 6, 7],
+        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
+        "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
+        "sent_starts": [1, 0, 0, 1, 0, 0, 0],
+    }
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]
@@ -97,35 +179,35 @@ def test_iob_to_biluo():
         iob_to_biluo(bad_iob)
 
 
-def test_roundtrip_docs_to_json():
-    text = "I flew to Silicon Valley via London."
- tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] - heads = [1, 1, 1, 4, 2, 1, 5, 1] - deps = ["nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] - biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] - cats = {"TRAVEL": 1.0, "BAKING": 0.0} +def test_roundtrip_docs_to_json(doc): nlp = English() - doc = nlp(text) - for i in range(len(tags)): - doc[i].tag_ = tags[i] - doc[i].dep_ = deps[i] - doc[i].head = doc[heads[i]] - doc.ents = spans_from_biluo_tags(doc, biluo_tags) - doc.cats = cats - doc.is_tagged = True - doc.is_parsed = True + text = doc.text + tags = [t.tag_ for t in doc] + pos = [t.pos_ for t in doc] + morphs = [t.morph_ for t in doc] + lemmas = [t.lemma_ for t in doc] + deps = [t.dep_ for t in doc] + heads = [t.head.i for t in doc] + biluo_tags = iob_to_biluo( + [t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc] + ) + cats = doc.cats # roundtrip to JSON with make_tempdir() as tmpdir: json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(json_file), str(json_file)) + goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) - reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp)) + reloaded_example = next(goldcorpus.dev_dataset(nlp)) + goldparse = reloaded_example.gold assert len(doc) == goldcorpus.count_train() - assert text == reloaded_doc.text + assert text == reloaded_example.text assert tags == goldparse.tags + assert pos == goldparse.pos + assert morphs == goldparse.morphs + assert lemmas == goldparse.lemmas assert deps == goldparse.labels assert heads == goldparse.heads assert biluo_tags == goldparse.ner @@ -140,11 +222,15 @@ def test_roundtrip_docs_to_json(): srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp)) + reloaded_example = next(goldcorpus.dev_dataset(nlp)) + goldparse = reloaded_example.gold assert len(doc) == goldcorpus.count_train() - assert text == reloaded_doc.text + assert text == reloaded_example.text assert tags == goldparse.tags + assert pos == goldparse.pos + assert morphs == goldparse.morphs + assert lemmas == goldparse.lemmas assert deps == goldparse.labels assert heads == goldparse.heads assert biluo_tags == goldparse.ner @@ -160,16 +246,18 @@ def test_roundtrip_docs_to_json(): srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) # load and rewrite as JSONL tuples - srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples) + srsly.write_jsonl(jsonl_file, goldcorpus.train_examples) goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp)) + reloaded_example = next(goldcorpus.dev_dataset(nlp)) + goldparse = reloaded_example.gold assert len(doc) == goldcorpus.count_train() - assert text == reloaded_doc.text + assert text == reloaded_example.text assert tags == goldparse.tags assert deps == goldparse.labels assert heads == goldparse.heads + assert lemmas == goldparse.lemmas assert biluo_tags == goldparse.ner assert "TRAVEL" in goldparse.cats assert "BAKING" in goldparse.cats @@ -177,13 +265,81 @@ def test_roundtrip_docs_to_json(): assert cats["BAKING"] == goldparse.cats["BAKING"] -@pytest.mark.skip(reason="skip while we have backwards-compatible alignment") +def test_projective_train_vs_nonprojective_dev(doc): + nlp = English() + deps = [t.dep_ for t in doc] + heads = [t.head.i for t in doc] + + 
with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "test.jsonl" + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + train_goldparse = train_reloaded_example.gold + + dev_reloaded_example = next(goldcorpus.dev_dataset(nlp)) + dev_goldparse = dev_reloaded_example.gold + + assert is_nonproj_tree([t.head.i for t in doc]) is True + assert is_nonproj_tree(train_goldparse.heads) is False + assert heads[:-1] == train_goldparse.heads[:-1] + assert heads[-1] != train_goldparse.heads[-1] + assert deps[:-1] == train_goldparse.labels[:-1] + assert deps[-1] != train_goldparse.labels[-1] + + assert heads == dev_goldparse.heads + assert deps == dev_goldparse.labels + + +def test_ignore_misaligned(doc): + nlp = English() + text = doc.text + with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "test.jsonl" + data = [docs_to_json(doc)] + data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, data) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + with pytest.raises(AlignmentError): + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + + with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "test.jsonl" + data = [docs_to_json(doc)] + data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, data) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + # doesn't raise an AlignmentError, but there is nothing to iterate over + # because the only example can't be aligned + train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True)) + assert len(train_reloaded_example) == 0 + + +def test_make_orth_variants(doc): + nlp = English() + with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "test.jsonl" + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + # due to randomness, test only that this runs with no errors for now + train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) + train_goldparse = train_reloaded_example.gold # noqa: F841 + + @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), ( - ["a", "b", "``", "c"], + ["a", "b", '"', "c"], ['ab"', "c"], (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), ), @@ -216,3 +372,111 @@ def test_goldparse_startswith_space(en_tokenizer): assert g.words == [" ", "a"] assert g.ner == [None, "U-DATE"] assert g.labels == [None, "ROOT"] + + +def test_gold_constructor(): + """Test that the GoldParse constructor works fine""" + nlp = English() + doc = nlp("This is a sentence") + gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0}) + + assert gold.cats["cat1"] + assert not gold.cats["cat2"] + assert gold.words == ["This", "is", "a", "sentence"] + + +def test_gold_orig_annot(): + nlp = English() + doc = nlp("This is a sentence") + gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0}) + + assert gold.orig.words == ["This", "is", "a", "sentence"] + assert gold.cats["cat1"] + + doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0}) + gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig) + assert gold2.orig.words == ["This", "is", "a", "sentence"] + assert not gold2.cats["cat1"] + + +def 
test_tuple_format_implicit(): + """Test tuple format with implicit GoldParse creation""" + + train_data = [ + ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}), + ( + "Spotify steps up Asia expansion", + {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]}, + ), + ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}), + ] + + _train(train_data) + + +def test_tuple_format_implicit_invalid(): + """Test that an error is thrown for an implicit invalid GoldParse field""" + + train_data = [ + ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}), + ( + "Spotify steps up Asia expansion", + {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]}, + ), + ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}), + ] + + with pytest.raises(TypeError): + _train(train_data) + + +def _train(train_data): + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("ORG") + ner.add_label("LOC") + nlp.add_pipe(ner) + + optimizer = nlp.begin_training() + for i in range(5): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + + +def test_split_sents(merged_dict): + nlp = English() + example = Example() + example.set_token_annotation(**merged_dict) + assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2 + assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 + + split_examples = example.split_sents() + assert len(split_examples) == 2 + + token_annotation_1 = split_examples[0].token_annotation + assert token_annotation_1.ids == [1, 2, 3] + assert token_annotation_1.words == ["Hi", "there", "everyone"] + assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"] + assert token_annotation_1.sent_starts == [1, 0, 0] + + token_annotation_2 = split_examples[1].token_annotation + assert token_annotation_2.ids == [4, 5, 6, 7] + assert token_annotation_2.words == ["It", "is", "just", "me"] + assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"] + assert token_annotation_2.sent_starts == [1, 0, 0, 0] + + +def test_tuples_to_example(merged_dict): + ex = Example() + ex.set_token_annotation(**merged_dict) + cats = {"TRAVEL": 1.0, "BAKING": 0.0} + ex.set_doc_annotation(cats=cats) + ex_dict = ex.to_dict() + + assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"] + assert ex_dict["token_annotation"]["words"] == merged_dict["words"] + assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] + assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"] + assert ex_dict["doc_annotation"]["cats"] == cats diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py deleted file mode 100644 index 89e797c1a..000000000 --- a/spacy/tests/test_json_schemas.py +++ /dev/null @@ -1,50 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.util import get_json_validator, validate_json, validate_schema -from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA -from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA -import pytest - - -@pytest.fixture(scope="session") -def training_schema_validator(): - return get_json_validator(TRAINING_SCHEMA) - - -def test_validate_schema(): - validate_schema({"type": "object"}) - with pytest.raises(Exception): - validate_schema({"type": lambda x: x}) - - -@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA, TOKEN_PATTERN_SCHEMA]) -def test_schemas(schema): - validate_schema(schema) - - 
-@pytest.mark.parametrize( - "data", - [ - {"text": "Hello world"}, - {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]}, - ], -) -def test_json_schema_training_valid(data, training_schema_validator): - errors = validate_json([data], training_schema_validator) - assert not errors - - -@pytest.mark.parametrize( - "data,n_errors", - [ - ({"spans": []}, 1), - ({"text": "Hello", "ents": [{"start": "0", "end": "5", "label": "TEST"}]}, 2), - ({"text": "Hello", "ents": [{"start": 0, "end": 5}]}, 1), - ({"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "test"}]}, 1), - ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2), - ], -) -def test_json_schema_training_invalid(data, n_errors, training_schema_validator): - errors = validate_json([data], training_schema_validator) - assert len(errors) == n_errors diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 7106cef74..58db0a040 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -1,10 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools - import pytest -from spacy.compat import is_python2 from spacy.gold import GoldParse from spacy.language import Language from spacy.tokens import Doc, Span @@ -31,20 +26,20 @@ def test_language_update(nlp): doc = Doc(nlp.vocab, words=text.split(" ")) gold = GoldParse(doc, **annots) # Update with doc and gold objects - nlp.update([doc], [gold]) + nlp.update((doc, gold)) # Update with text and dict - nlp.update([text], [annots]) + nlp.update((text, annots)) # Update with doc object and dict - nlp.update([doc], [annots]) + nlp.update((doc, annots)) # Update with text and gold object - nlp.update([text], [gold]) + nlp.update((text, gold)) + # Update with empty doc and gold object + nlp.update((None, gold)) # Update badly - with pytest.raises(IndexError): - nlp.update([doc], []) - with pytest.raises(IndexError): - nlp.update([], [gold]) with pytest.raises(ValueError): - nlp.update([text], [wrongkeyannots]) + nlp.update((doc, None)) + with pytest.raises(TypeError): + nlp.update((text, wrongkeyannots)) def test_language_evaluate(nlp): @@ -134,9 +129,6 @@ def test_language_pipe(nlp2, n_process, texts): assert_docs_equal(doc, expected_doc) -@pytest.mark.skipif( - is_python2, reason="python2 seems to be unable to handle iterator properly" -) @pytest.mark.parametrize("n_process", [1, 2]) def test_language_pipe_stream(nlp2, n_process, texts): # check if nlp.pipe can handle infinite length iterator properly. 
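
Note for reviewers: `test_language_update` above captures the new calling convention — `nlp.update` now takes each example as a single `(doc, gold)` tuple (or a batch of such tuples) instead of the old parallel `docs`/`golds` lists. A minimal sketch of the new pattern, condensed from the `_train` helper and the `test_tuple_format_implicit` data in `spacy/tests/test_gold.py` above; the batch schedule and iteration count are illustrative only, not prescriptive:

```python
from spacy.lang.en import English
from spacy.util import minibatch, compounding

# (text, annotations) pairs; the dict is turned into a GoldParse implicitly.
TRAIN_DATA = [
    ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
    ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
]

nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("ORG")
nlp.add_pipe(ner)

optimizer = nlp.begin_training()
for i in range(5):
    losses = {}
    # A batch is now just a list of (doc, gold) tuples, not two parallel lists.
    for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
        nlp.update(batch, sgd=optimizer, losses=losses)
```
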
diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index 701222afc..c2534ca22 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc from spacy.language import Language diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 4075ccf64..6d4e75a31 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -1,41 +1,11 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import os import ctypes from pathlib import Path from spacy import util from spacy import prefer_gpu, require_gpu -from spacy.compat import symlink_to, symlink_remove, path2str, is_windows -from spacy._ml import PrecomputableAffine -from subprocess import CalledProcessError - - -@pytest.fixture -def symlink_target(): - return Path("./foo-target") - - -@pytest.fixture -def symlink(): - return Path("./foo-symlink") - - -@pytest.fixture(scope="function") -def symlink_setup_target(request, symlink_target, symlink): - if not symlink_target.exists(): - os.mkdir(path2str(symlink_target)) - # yield -- need to cleanup even if assertion fails - # https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240 - - def cleanup(): - # Remove symlink only if it was created - if symlink.exists(): - symlink_remove(symlink) - os.rmdir(path2str(symlink_target)) - - request.addfinalizer(cleanup) +from spacy.ml._layers import PrecomputableAffine +from spacy.ml._layers import _backprop_precomputable_affine_padding @pytest.fixture @@ -70,28 +40,30 @@ def test_util_get_package_path(package): def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP) - assert model.W.shape == (nF, nO, nP, nI) - tensor = model.ops.allocate((10, nI)) + assert model.get_param("W").shape == (nF, nO, nP, nI) + tensor = model.ops.alloc((10, nI)) Y, get_dX = model.begin_update(tensor) assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP) - assert model.d_pad.shape == (1, nF, nO, nP) - dY = model.ops.allocate((15, nO, nP)) - ids = model.ops.allocate((15, nF)) + dY = model.ops.alloc((15, nO, nP)) + ids = model.ops.alloc((15, nF)) ids[1, 2] = -1 dY[1] = 1 - assert model.d_pad[0, 2, 0, 0] == 0.0 - model._backprop_padding(dY, ids) - assert model.d_pad[0, 2, 0, 0] == 1.0 - model.d_pad.fill(0.0) + assert not model.has_grad("pad") + d_pad = _backprop_precomputable_affine_padding(model, dY, ids) + assert d_pad[0, 2, 0, 0] == 1.0 ids.fill(0.0) dY.fill(0.0) - ids[1, 2] = -1 + dY[0] = 0 + ids[1, 2] = 0 ids[1, 1] = -1 ids[1, 0] = -1 dY[1] = 1 - assert model.d_pad[0, 2, 0, 0] == 0.0 - model._backprop_padding(dY, ids) - assert model.d_pad[0, 2, 0, 0] == 3.0 + ids[2, 0] = -1 + dY[2] = 5 + d_pad = _backprop_precomputable_affine_padding(model, dY, ids) + assert d_pad[0, 0, 0, 0] == 6 + assert d_pad[0, 1, 0, 0] == 1 + assert d_pad[0, 2, 0, 0] == 0 def test_prefer_gpu(): @@ -109,25 +81,6 @@ def test_require_gpu(): require_gpu() -def test_create_symlink_windows( - symlink_setup_target, symlink_target, symlink, is_admin -): - """Test the creation of symlinks on windows. 
If run as admin or not on windows it should succeed, otherwise a CalledProcessError should be raised.""" - assert symlink_target.exists() - - if is_admin or not is_windows: - try: - symlink_to(symlink, symlink_target) - assert symlink.exists() - except CalledProcessError as e: - pytest.fail(e) - else: - with pytest.raises(CalledProcessError): - symlink_to(symlink, symlink_target) - - assert not symlink.exists() - - def test_ascii_filenames(): """Test that all filenames in the project are ASCII. See: https://twitter.com/_inesmontani/status/1177941471632211968 diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 65288527a..e4c67b672 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy import srsly diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 2a4ef0f40..efaf80b4f 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,10 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx -from spacy.gold import GoldParse +from spacy.gold import Example, GoldParse from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc @@ -54,7 +51,7 @@ def test_las_per_type(en_vocab): deps=annot["deps"], ) gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) - scorer.score(doc, gold) + scorer.score((doc, gold)) results = scorer.scores assert results["uas"] == 100 @@ -77,7 +74,7 @@ def test_las_per_type(en_vocab): ) gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) doc[0].dep_ = "compound" - scorer.score(doc, gold) + scorer.score((doc, gold)) results = scorer.scores assert results["uas"] == 100 @@ -99,8 +96,9 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], ) - gold = GoldParse(doc, entities=annot["entities"]) - scorer.score(doc, gold) + ex = Example(doc=doc) + ex.set_token_annotation(entities=annot["entities"]) + scorer.score(ex) results = scorer.scores assert results["ents_p"] == 100 @@ -119,8 +117,9 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], ) - gold = GoldParse(doc, entities=annot["entities"]) - scorer.score(doc, gold) + ex = Example(doc=doc) + ex.set_token_annotation(entities=annot["entities"]) + scorer.score(ex) results = scorer.scores assert results["ents_p"] == approx(66.66666) diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index ddaa71059..2d10d79d4 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -1,12 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest -from spacy._ml import Tok2Vec +from spacy.ml.component_models import Tok2Vec from spacy.vocab import Vocab from spacy.tokens import Doc -from spacy.compat import unicode_ def get_batch(batch_size): @@ -14,9 +10,9 @@ def get_batch(batch_size): docs = [] start = 0 for size in range(1, batch_size + 1): - # Make the words numbers, so that they're distnct + # Make the words numbers, so that they're distinct # across the batch, and easy to track. 
- numbers = [unicode_(i) for i in range(start, start + size)] + numbers = [str(i) for i in range(start, start + size)] docs.append(Doc(vocab, words=numbers)) start += size return docs @@ -41,6 +37,7 @@ def test_empty_doc(): def test_tok2vec_batch_sizes(batch_size, width, embed_size): batch = get_batch(batch_size) tok2vec = Tok2Vec(width, embed_size) + tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) assert len(vectors) == len(batch) for doc_vec, doc in zip(vectors, batch): @@ -60,6 +57,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): def test_tok2vec_configs(tok2vec_config): docs = get_batch(3) tok2vec = Tok2Vec(**tok2vec_config) + tok2vec.initialize() vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"]) diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index a79363abb..9a98e049e 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -1,13 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals - import sys import pytest def test_tokenizer_handles_emoticons(tokenizer): # Tweebo challenge (CMU) - text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" + text = ( + """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| :> ....""" + ) tokens = tokenizer(text) assert tokens[0].text == ":o" assert tokens[1].text == ":/" @@ -28,12 +27,11 @@ def test_tokenizer_handles_emoticons(tokenizer): assert tokens[16].text == ">:(" assert tokens[17].text == ":D" assert tokens[18].text == "=|" - assert tokens[19].text == '")' - assert tokens[20].text == ":>" - assert tokens[21].text == "...." + assert tokens[19].text == ":>" + assert tokens[20].text == "...." 
-@pytest.mark.parametrize("text,length", [("example:)", 3), ("108)", 2), ("XDN", 1)]) +@pytest.mark.parametrize("text,length", [("108)", 2), ("XDN", 1)]) def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 2d71588cc..3e7681234 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class @@ -58,7 +55,7 @@ LANGUAGES = [ @pytest.mark.parametrize("lang", LANGUAGES) def test_tokenizer_explain(lang): tokenizer = get_lang_class(lang).Defaults.create_tokenizer() - examples = pytest.importorskip("spacy.lang.{}.examples".format(lang)) + examples = pytest.importorskip(f"spacy.lang.{lang}.examples") for sentence in examples.sentences: tokens = [t.text for t in tokenizer(sentence) if not t.is_space] debug_tokens = [t[1] for t in tokenizer.explain(sentence)] diff --git a/spacy/tests/tokenizer/test_naughty_strings.py b/spacy/tests/tokenizer/test_naughty_strings.py index 36c69611e..e93d5654f 100644 --- a/spacy/tests/tokenizer/test_naughty_strings.py +++ b/spacy/tests/tokenizer/test_naughty_strings.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest # Examples taken from the "Big List of Naughty Strings" diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 803c31abf..c035559b4 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.tokenizer import Tokenizer @@ -108,6 +105,12 @@ def test_tokenizer_add_special_case(tokenizer, text, tokens): assert doc[1].text == tokens[1]["orth"] +@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "re"}])]) +def test_tokenizer_validate_special_case(tokenizer, text, tokens): + with pytest.raises(ValueError): + tokenizer.add_special_case(text, tokens) + + @pytest.mark.parametrize( "text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])] ) @@ -120,3 +123,30 @@ def test_tokenizer_add_special_case_tag(text, tokens): assert doc[0].tag_ == tokens[0]["tag"] assert doc[0].pos_ == "NOUN" assert doc[1].text == tokens[1]["orth"] + + +def test_tokenizer_special_cases_with_affixes(tokenizer): + text = '(((_SPECIAL_ A/B, A/B-A/B")' + tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}]) + tokenizer.add_special_case("A/B", [{"orth": "A/B"}]) + doc = tokenizer(text) + assert [token.text for token in doc] == [ + "(", + "(", + "(", + "_SPECIAL_", + "A/B", + ",", + "A/B", + "-", + "A/B", + '"', + ")", + ] + + +def test_tokenizer_special_cases_with_period(tokenizer): + text = "_SPECIAL_." 
+ tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}]) + doc = tokenizer(text) + assert [token.text for token in doc] == ["_SPECIAL_", "."] diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 58e9d73f3..87211ab95 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -1,8 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest +from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS + URLS_BASIC = [ "http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", @@ -196,7 +195,12 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url): @pytest.mark.parametrize("url", URLS_FULL) def test_tokenizer_handles_two_suffix_url(tokenizer, suffix1, suffix2, url): tokens = tokenizer(url + suffix1 + suffix2) - assert len(tokens) == 3 - assert tokens[0].text == url - assert tokens[1].text == suffix1 - assert tokens[2].text == suffix2 + if suffix1 + suffix2 in BASE_EXCEPTIONS: + assert len(tokens) == 2 + assert tokens[0].text == url + assert tokens[1].text == suffix1 + suffix2 + else: + assert len(tokens) == 3 + assert tokens[0].text == url + assert tokens[1].text == suffix1 + assert tokens[2].text == suffix2 diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index 74c9b369b..c7b9d7c6d 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 175480fe7..96ee9a3de 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import numpy import tempfile import shutil @@ -9,7 +6,6 @@ import srsly from pathlib import Path from spacy.tokens import Doc, Span from spacy.attrs import POS, HEAD, DEP -from spacy.compat import path2str @contextlib.contextmanager @@ -23,7 +19,7 @@ def make_tempfile(mode="r"): def make_tempdir(): d = Path(tempfile.mkdtemp()) yield d - shutil.rmtree(path2str(d)) + shutil.rmtree(str(d)) def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None): @@ -95,7 +91,11 @@ def assert_docs_equal(doc1, doc2): assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2] assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2] - assert [ent for ent in doc1.ents] == [ent for ent in doc2.ents] + for ent1, ent2 in zip(doc1.ents, doc2.ents): + assert ent1.start == ent2.start + assert ent1.end == ent2.end + assert ent1.label == ent2.label + assert ent1.kb_id == ent2.kb_id def assert_packed_msg_equal(b1, b2): diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index d84a56981..e033aa7c6 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import IS_ALPHA, IS_DIGIT diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index f78dd33c4..fff3d24ef 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lookups 
import Lookups, Table from spacy.strings import get_string_id diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index f98f0e6e0..b5f7303b5 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy from spacy.tokens import Doc diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py index 75b1116dd..c71d5f3f2 100644 --- a/spacy/tests/vocab_vectors/test_stringstore.py +++ b/spacy/tests/vocab_vectors/test_stringstore.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.strings import StringStore diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index b688ab9dd..011cd16b1 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,17 +1,13 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy from numpy.testing import assert_allclose -from spacy._ml import cosine from spacy.vocab import Vocab from spacy.vectors import Vectors from spacy.tokenizer import Tokenizer from spacy.strings import hash_string from spacy.tokens import Doc -from ..util import add_vecs_to_vocab +from ..util import add_vecs_to_vocab, get_cosine @pytest.fixture @@ -314,4 +310,4 @@ def test_vocab_prune_vectors(): assert list(remap.keys()) == ["kitten"] neighbour, similarity = list(remap.values())[0] assert neighbour == "cat", remap - assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) + assert_allclose(similarity, get_cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index d22db2d8b..a687059be 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA from spacy.parts_of_speech import NOUN, VERB diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index dadbad7bd..ba22f7782 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -4,10 +4,11 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .structs cimport LexemeC, TokenC +from .structs cimport LexemeC, SpanC, TokenC from .strings cimport StringStore from .tokens.doc cimport Doc from .vocab cimport Vocab, LexemesOrTokens, _Cached +from .matcher.phrasematcher cimport PhraseMatcher cdef class Tokenizer: @@ -21,15 +22,32 @@ cdef class Tokenizer: cdef object _suffix_search cdef object _infix_finditer cdef object _rules + cdef PhraseMatcher _special_matcher + cdef int _property_init_count + cdef int _property_init_max cpdef Doc tokens_from_list(self, list strings) + cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases) + cdef int _apply_special_cases(self, Doc doc) except -1 + cdef void _filter_special_spans(self, vector[SpanC] &original, + vector[SpanC] &filtered, int doc_len) nogil + cdef object _prepare_special_spans(self, Doc doc, + vector[SpanC] &filtered) + cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens, + object span_data) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 - cdef int _tokenize(self, Doc tokens, unicode 
span, hash_t key) except -1 - cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, - vector[LexemeC*] *suffixes, int* has_special) + cdef int _try_specials(self, hash_t key, Doc tokens, + int* has_special) except -1 + cdef int _tokenize(self, Doc tokens, unicode span, hash_t key, + int* has_special, bint with_special_cases) except -1 + cdef unicode _split_affixes(self, Pool mem, unicode string, + vector[LexemeC*] *prefixes, + vector[LexemeC*] *suffixes, int* has_special, + bint with_special_cases) cdef int _attach_tokens(self, Doc tokens, unicode string, - vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1 - - cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special, - int n) except -1 + vector[LexemeC*] *prefixes, + vector[LexemeC*] *suffixes, int* has_special, + bint with_special_cases) except -1 + cdef int _save_cached(self, const TokenC* tokens, hash_t key, + int* has_special, int n) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 230f41921..25d9f239d 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,25 +1,27 @@ # cython: embedsignature=True # cython: profile=True -# coding: utf8 from __future__ import unicode_literals from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc +from libc.string cimport memcpy, memset +from libcpp.set cimport set as stdset from cymem.cymem cimport Pool from preshed.maps cimport PreshMap cimport cython -from collections import OrderedDict import re from .tokens.doc cimport Doc from .strings cimport hash_string -from .compat import unescape_unicode from .attrs import intify_attrs from .symbols import ORTH from .errors import Errors, Warnings, deprecation_warning from . 
import util +from .attrs import intify_attrs +from .lexeme cimport EMPTY_LEXEME +from .symbols import ORTH cdef class Tokenizer: @@ -59,7 +61,10 @@ cdef class Tokenizer: self.infix_finditer = infix_finditer self.vocab = vocab self._rules = {} - self._load_special_tokenization(rules) + self._special_matcher = PhraseMatcher(self.vocab) + self._load_special_cases(rules) + self._property_init_count = 0 + self._property_init_max = 4 property token_match: def __get__(self): @@ -67,7 +72,9 @@ cdef class Tokenizer: def __set__(self, token_match): self._token_match = token_match - self._flush_cache() + self._reload_special_cases() + if self._property_init_count <= self._property_init_max: + self._property_init_count += 1 property prefix_search: def __get__(self): @@ -75,7 +82,9 @@ cdef class Tokenizer: def __set__(self, prefix_search): self._prefix_search = prefix_search - self._flush_cache() + self._reload_special_cases() + if self._property_init_count <= self._property_init_max: + self._property_init_count += 1 property suffix_search: def __get__(self): @@ -83,7 +92,9 @@ cdef class Tokenizer: def __set__(self, suffix_search): self._suffix_search = suffix_search - self._flush_cache() + self._reload_special_cases() + if self._property_init_count <= self._property_init_max: + self._property_init_count += 1 property infix_finditer: def __get__(self): @@ -91,7 +102,9 @@ cdef class Tokenizer: def __set__(self, infix_finditer): self._infix_finditer = infix_finditer - self._flush_cache() + self._reload_special_cases() + if self._property_init_count <= self._property_init_max: + self._property_init_count += 1 property rules: def __get__(self): @@ -100,10 +113,10 @@ cdef class Tokenizer: def __set__(self, rules): self._rules = {} self._reset_cache([key for key in self._cache]) - self._reset_specials() + self._flush_specials() self._cache = PreshMap() self._specials = PreshMap() - self._load_special_tokenization(rules) + self._load_special_cases(rules) def __reduce__(self): args = (self.vocab, @@ -118,7 +131,6 @@ cdef class Tokenizer: deprecation_warning(Warnings.W002) return Doc(self.vocab, words=strings) - @cython.boundscheck(False) def __call__(self, unicode string): """Tokenize a string. @@ -127,6 +139,17 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#call """ + doc = self._tokenize_affixes(string, True) + self._apply_special_cases(doc) + return doc + + @cython.boundscheck(False) + cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases): + """Tokenize according to affix and token_match settings. + + string (unicode): The string to tokenize. + RETURNS (Doc): A container for linguistic annotations. + """ if len(string) >= (2 ** 30): raise ValueError(Errors.E025.format(length=len(string))) cdef int length = len(string) @@ -135,7 +158,9 @@ cdef class Tokenizer: return doc cdef int i = 0 cdef int start = 0 - cdef bint cache_hit + cdef int has_special = 0 + cdef bint specials_hit = 0 + cdef bint cache_hit = 0 cdef bint in_ws = string[0].isspace() cdef unicode span # The task here is much like string.split, but not quite @@ -151,9 +176,14 @@ cdef class Tokenizer: # we don't have to create the slice when we hit the cache. 
span = string[start:i] key = hash_string(span) - cache_hit = self._try_cache(key, doc) - if not cache_hit: - self._tokenize(doc, span, key) + specials_hit = 0 + cache_hit = 0 + if with_special_cases: + specials_hit = self._try_specials(key, doc, &has_special) + if not specials_hit: + cache_hit = self._try_cache(key, doc) + if not specials_hit and not cache_hit: + self._tokenize(doc, span, key, &has_special, with_special_cases) if uc == ' ': doc.c[doc.length - 1].spacy = True start = i + 1 @@ -164,13 +194,18 @@ cdef class Tokenizer: if start < i: span = string[start:] key = hash_string(span) - cache_hit = self._try_cache(key, doc) - if not cache_hit: - self._tokenize(doc, span, key) + specials_hit = 0 + cache_hit = 0 + if with_special_cases: + specials_hit = self._try_specials(key, doc, &has_special) + if not specials_hit: + cache_hit = self._try_cache(key, doc) + if not specials_hit and not cache_hit: + self._tokenize(doc, span, key, &has_special, with_special_cases) doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws return doc - def pipe(self, texts, batch_size=1000, n_threads=-1): + def pipe(self, texts, batch_size=1000, n_threads=-1, as_example=False): """Tokenize a stream of texts. texts: A sequence of unicode texts. @@ -186,23 +221,141 @@ cdef class Tokenizer: yield self(text) def _flush_cache(self): - self._reset_cache([key for key in self._cache if not key in self._specials]) + self._reset_cache([key for key in self._cache]) def _reset_cache(self, keys): for k in keys: + cached = <_Cached*>self._cache.get(k) del self._cache[k] - if not k in self._specials: - cached = <_Cached*>self._cache.get(k) - if cached is not NULL: - self.mem.free(cached) + if cached is not NULL: + self.mem.free(cached) - def _reset_specials(self): + def _flush_specials(self): for k in self._specials: cached = <_Cached*>self._specials.get(k) del self._specials[k] if cached is not NULL: self.mem.free(cached) + cdef int _apply_special_cases(self, Doc doc) except -1: + """Retokenize doc according to special cases. + + doc (Doc): Document. 
+        """
+        cdef int i
+        cdef int max_length = 0
+        cdef bint modify_in_place
+        cdef Pool mem = Pool()
+        cdef vector[SpanC] c_matches
+        cdef vector[SpanC] c_filtered
+        cdef int offset
+        cdef int modified_doc_length
+        # Find matches for special cases
+        self._special_matcher.find_matches(doc, &c_matches)
+        # Skip processing if no matches
+        if c_matches.size() == 0:
+            return True
+        self._filter_special_spans(c_matches, c_filtered, doc.length)
+        # Put span info in span.start-indexed dict and calculate maximum
+        # intermediate document size
+        (span_data, max_length, modify_in_place) = self._prepare_special_spans(doc, c_filtered)
+        # If modifications never increase doc length, can modify in place
+        if modify_in_place:
+            tokens = doc.c
+        # Otherwise create a separate array to store modified tokens
+        else:
+            tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
+        # Modify tokenization according to filtered special cases
+        offset = self._retokenize_special_spans(doc, tokens, span_data)
+        # Allocate more memory for doc if needed
+        modified_doc_length = doc.length + offset
+        while modified_doc_length >= doc.max_length:
+            doc._realloc(doc.max_length * 2)
+        # If not modified in place, copy tokens back to doc
+        if not modify_in_place:
+            memcpy(doc.c, tokens, max_length * sizeof(TokenC))
+        for i in range(doc.length + offset, doc.length):
+            memset(&doc.c[i], 0, sizeof(TokenC))
+            doc.c[i].lex = &EMPTY_LEXEME
+        doc.length = doc.length + offset
+        return True
+
+    cdef void _filter_special_spans(self, vector[SpanC] &original, vector[SpanC] &filtered, int doc_len) nogil:
+
+        cdef int seen_i
+        cdef SpanC span
+        cdef stdset[int] seen_tokens
+        stdsort(original.begin(), original.end(), len_start_cmp)
+        cdef int orig_i = original.size() - 1
+        while orig_i >= 0:
+            span = original[orig_i]
+            if not seen_tokens.count(span.start) and not seen_tokens.count(span.end - 1):
+                filtered.push_back(span)
+                for seen_i in range(span.start, span.end):
+                    seen_tokens.insert(seen_i)
+            orig_i -= 1
+        stdsort(filtered.begin(), filtered.end(), start_cmp)
+
+    cdef object _prepare_special_spans(self, Doc doc, vector[SpanC] &filtered):
+        spans = [doc[match.start:match.end] for match in filtered]
+        cdef bint modify_in_place = True
+        cdef int curr_length = doc.length
+        cdef int max_length = 0
+        cdef int span_length_diff = 0
+        span_data = {}
+        for span in spans:
+            rule = self._rules.get(span.text, None)
+            span_length_diff = 0
+            if rule:
+                span_length_diff = len(rule) - (span.end - span.start)
+            if span_length_diff > 0:
+                modify_in_place = False
+            curr_length += span_length_diff
+            if curr_length > max_length:
+                max_length = curr_length
+            span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
+        return (span_data, max_length, modify_in_place)
+
+    cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens, object span_data):
+        cdef int i = 0
+        cdef int j = 0
+        cdef int offset = 0
+        cdef _Cached* cached
+        cdef int idx_offset = 0
+        cdef int orig_final_spacy
+        cdef int orig_idx
+        cdef int span_start
+        cdef int span_end
+        while i < doc.length:
+            if not i in span_data:
+                tokens[i + offset] = doc.c[i]
+                i += 1
+            else:
+                span = span_data[i]
+                span_start = span[1]
+                span_end = span[2]
+                cached = <_Cached*>self._specials.get(hash_string(span[0]))
+                if cached == NULL:
+                    # Copy original tokens if no rule found
+                    for j in range(span_end - span_start):
+                        tokens[i + offset + j] = doc.c[i + j]
+                    i += span_end - span_start
+                else:
+                    # Copy special case tokens into doc and adjust token and
+                    # character offsets
+                    idx_offset = 0
+                    orig_final_spacy =
doc.c[span_end + offset - 1].spacy + orig_idx = doc.c[i].idx + for j in range(cached.length): + tokens[i + offset + j] = cached.data.tokens[j] + tokens[i + offset + j].idx = orig_idx + idx_offset + idx_offset += cached.data.tokens[j].lex.length + \ + 1 if cached.data.tokens[j].spacy else 0 + tokens[i + offset + cached.length - 1].spacy = orig_final_spacy + i += span_end - span_start + offset += span[3] + return offset + cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: @@ -216,22 +369,33 @@ cdef class Tokenizer: tokens.push_back(&cached.data.tokens[i], False) return True - cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key) except -1: + cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1: + cached = <_Cached*>self._specials.get(key) + if cached == NULL: + return False + cdef int i + for i in range(cached.length): + tokens.push_back(&cached.data.tokens[i], False) + has_special[0] = 1 + return True + + cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1: cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] suffixes cdef int orig_size - cdef int has_special = 0 orig_size = tokens.length span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes, - &has_special) - self._attach_tokens(tokens, span, &prefixes, &suffixes) + has_special, with_special_cases) + self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special, + with_special_cases) self._save_cached(&tokens.c[orig_size], orig_key, has_special, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, vector[const LexemeC*] *suffixes, - int* has_special): + int* has_special, + bint with_special_cases): cdef size_t i cdef unicode prefix cdef unicode suffix @@ -239,29 +403,28 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: - if self._specials.get(hash_string(string)) != NULL: - has_special[0] = 1 + if self.token_match and self.token_match(string) \ + and not self.find_prefix(string) \ + and not self.find_suffix(string): + break + if with_special_cases and self._specials.get(hash_string(string)) != NULL: break last_size = len(string) pre_len = self.find_prefix(string) if pre_len != 0: prefix = string[:pre_len] minus_pre = string[pre_len:] - # Check whether we've hit a special-case - if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL: + if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL: string = minus_pre prefixes.push_back(self.vocab.get(mem, prefix)) - has_special[0] = 1 break suf_len = self.find_suffix(string) if suf_len != 0: suffix = string[-suf_len:] minus_suf = string[:-suf_len] - # Check whether we've hit a special-case - if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL): + if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL: string = minus_suf suffixes.push_back(self.vocab.get(mem, suffix)) - has_special[0] = 1 break if pre_len and suf_len and (pre_len + suf_len) <= len(string): string = string[pre_len:-suf_len] @@ -273,15 +436,15 @@ cdef class Tokenizer: elif suf_len: string = minus_suf suffixes.push_back(self.vocab.get(mem, suffix)) - if string and (self._specials.get(hash_string(string)) != NULL): - has_special[0] = 1 - break return string cdef int _attach_tokens(self, Doc tokens, unicode 
string, vector[const LexemeC*] *prefixes, - vector[const LexemeC*] *suffixes) except -1: - cdef bint cache_hit + vector[const LexemeC*] *suffixes, + int* has_special, + bint with_special_cases) except -1: + cdef bint specials_hit = 0 + cdef bint cache_hit = 0 cdef int split, end cdef const LexemeC* const* lexemes cdef const LexemeC* lexeme @@ -291,8 +454,12 @@ cdef class Tokenizer: for i in range(prefixes.size()): tokens.push_back(prefixes[0][i], False) if string: - cache_hit = self._try_cache(hash_string(string), tokens) - if cache_hit: + if with_special_cases: + specials_hit = self._try_specials(hash_string(string), tokens, + has_special) + if not specials_hit: + cache_hit = self._try_cache(hash_string(string), tokens) + if specials_hit or cache_hit: pass elif self.token_match and self.token_match(string): # We're always saying 'no' to spaces here -- the caller will @@ -337,7 +504,7 @@ cdef class Tokenizer: tokens.push_back(lexeme, False) cdef int _save_cached(self, const TokenC* tokens, hash_t key, - int has_special, int n) except -1: + int* has_special, int n) except -1: cdef int i if n <= 0: # avoid mem alloc of zero length @@ -346,7 +513,7 @@ cdef class Tokenizer: if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: return 0 # See #1250 - if has_special: + if has_special[0]: return 0 cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = n @@ -399,12 +566,25 @@ cdef class Tokenizer: match = self.suffix_search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, special_cases): + def _load_special_cases(self, special_cases): """Add special-case tokenization rules.""" if special_cases is not None: for chunk, substrings in sorted(special_cases.items()): + self._validate_special_case(chunk, substrings) self.add_special_case(chunk, substrings) + def _validate_special_case(self, chunk, substrings): + """Check whether the `ORTH` fields match the string. + + string (unicode): The string to specially tokenize. + substrings (iterable): A sequence of dicts, where each dict describes + a token and its attributes. + """ + attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings] + orth = "".join([spec[ORTH] for spec in attrs]) + if chunk != orth: + raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings)) + def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. 
@@ -415,6 +595,7 @@ cdef class Tokenizer:
 
         DOCS: https://spacy.io/api/tokenizer#add_special_case
         """
+        self._validate_special_case(string, substrings)
         substrings = list(substrings)
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
         cached.length = len(substrings)
@@ -422,15 +603,25 @@ cdef class Tokenizer:
         cached.data.tokens = self.vocab.make_fused_token(substrings)
         key = hash_string(string)
         stale_special = <_Cached*>self._specials.get(key)
-        stale_cached = <_Cached*>self._cache.get(key)
-        self._flush_cache()
         self._specials.set(key, cached)
-        self._cache.set(key, cached)
         if stale_special is not NULL:
             self.mem.free(stale_special)
-        if stale_special != stale_cached and stale_cached is not NULL:
-            self.mem.free(stale_cached)
         self._rules[string] = substrings
+        self._flush_cache()
+        if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string):
+            self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
+
+    def _reload_special_cases(self):
+        try:
+            self._property_init_count
+        except AttributeError:
+            return
+        # only reload if all 4 of prefix, suffix, infix, token_match have
+        # been initialized
+        if self.vocab is not None and self._property_init_count >= self._property_init_max:
+            self._flush_cache()
+            self._flush_specials()
+            self._load_special_cases(self._rules)
 
     def explain(self, text):
         """A debugging tokenizer that provides information about which
@@ -534,14 +725,14 @@ cdef class Tokenizer:
 
         DOCS: https://spacy.io/api/tokenizer#to_bytes
         """
-        serializers = OrderedDict((
-            ("vocab", lambda: self.vocab.to_bytes()),
-            ("prefix_search", lambda: _get_regex_pattern(self.prefix_search)),
-            ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
-            ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
-            ("token_match", lambda: _get_regex_pattern(self.token_match)),
-            ("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
-        ))
+        serializers = {
+            "vocab": lambda: self.vocab.to_bytes(),
+            "prefix_search": lambda: _get_regex_pattern(self.prefix_search),
+            "suffix_search": lambda: _get_regex_pattern(self.suffix_search),
+            "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
+            "token_match": lambda: _get_regex_pattern(self.token_match),
+            "exceptions": lambda: dict(sorted(self._rules.items()))
+        }
         exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
         return util.to_bytes(serializers, exclude)
 
@@ -554,20 +745,17 @@ cdef class Tokenizer:
 
         DOCS: https://spacy.io/api/tokenizer#from_bytes
         """
-        data = OrderedDict()
-        deserializers = OrderedDict((
-            ("vocab", lambda b: self.vocab.from_bytes(b)),
-            ("prefix_search", lambda b: data.setdefault("prefix_search", b)),
-            ("suffix_search", lambda b: data.setdefault("suffix_search", b)),
-            ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
-            ("token_match", lambda b: data.setdefault("token_match", b)),
-            ("exceptions", lambda b: data.setdefault("rules", b))
-        ))
+        data = {}
+        deserializers = {
+            "vocab": lambda b: self.vocab.from_bytes(b),
+            "prefix_search": lambda b: data.setdefault("prefix_search", b),
+            "suffix_search": lambda b: data.setdefault("suffix_search", b),
+            "infix_finditer": lambda b: data.setdefault("infix_finditer", b),
+            "token_match": lambda b: data.setdefault("token_match", b),
+            "exceptions": lambda b: data.setdefault("rules", b)
+        }
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         msg = util.from_bytes(bytes_data, deserializers, exclude)
-        for key in ["prefix_search", "suffix_search", "infix_finditer"]:
-            if key in data:
-                data[key] = unescape_unicode(data[key])
         if data.get("prefix_search"):
             self.prefix_search = re.compile(data["prefix_search"]).search
         if data.get("suffix_search"):
@@ -579,11 +767,9 @@ cdef class Tokenizer:
         if data.get("rules"):
             # make sure to hard reset the cache to remove data from the default exceptions
             self._rules = {}
-            self._reset_cache([key for key in self._cache])
-            self._reset_specials()
-            self._cache = PreshMap()
-            self._specials = PreshMap()
-            self._load_special_tokenization(data.get("rules", {}))
+            self._flush_cache()
+            self._flush_specials()
+            self._load_special_cases(data.get("rules", {}))
 
         return self
 
@@ -591,3 +777,19 @@ cdef class Tokenizer:
 def _get_regex_pattern(regex):
     """Get a pattern string for a regex, or None if the pattern is None."""
     return None if regex is None else regex.__self__.pattern
+
+
+cdef extern from "<algorithm>" namespace "std" nogil:
+    void stdsort "sort"(vector[SpanC].iterator,
+                        vector[SpanC].iterator,
+                        bint (*)(SpanC, SpanC))
+
+
+cdef bint len_start_cmp(SpanC a, SpanC b) nogil:
+    if a.end - a.start == b.end - b.start:
+        return b.start < a.start
+    return a.end - a.start < b.end - b.start
+
+
+cdef bint start_cmp(SpanC a, SpanC b) nogil:
+    return a.start < b.start
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 536ec8349..1aefa2b7c 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -1,9 +1,7 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .doc import Doc
 from .token import Token
 from .span import Span
 from ._serialize import DocBin
+from .morphanalysis import MorphAnalysis
 
-__all__ = ["Doc", "Token", "Span", "DocBin"]
+__all__ = ["Doc", "Token", "Span", "DocBin", "MorphAnalysis"]
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index a5d06491a..337c154a2 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -1,14 +1,11 @@
-# coding: utf8
 # cython: infer_types=True
 # cython: bounds_check=False
 # cython: profile=True
-from __future__ import unicode_literals
-
 from libc.string cimport memcpy, memset
 from libc.stdlib cimport malloc, free
 from cymem.cymem cimport Pool
-from thinc.neural.util import get_array_module
+from thinc.api import get_array_module
 import numpy
 
 from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
@@ -16,7 +13,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport TAG
+from ..attrs cimport TAG, MORPH
 
 from .underscore import is_writable_attr
 from ..attrs import intify_attrs
@@ -68,6 +65,8 @@ cdef class Retokenizer:
             attrs["_"] = extensions
         else:
             attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+            if MORPH in attrs:
+                self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(attrs[MORPH]))
         self.merges.append((span, attrs))
 
     def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()):
@@ -99,6 +98,9 @@ cdef class Retokenizer:
         # NB: Since we support {"KEY": [value, value]} syntax here, this
         # will only "intify" the keys, not the values
         attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+        if MORPH in attrs:
+            for morph in attrs[MORPH]:
+                self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph))
         head_offsets = []
         for head in heads:
             if isinstance(head, Token):
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index b60a6d7b3..65b70d1b3 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -1,10 +1,7 @@ -# coding: utf8 -from __future__ import unicode_literals - import numpy import zlib import srsly -from thinc.neural.ops import NumpyOps +from thinc.api import NumpyOps from ..compat import copy_reg from ..tokens import Doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4aee21153..54d92f8b1 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,10 +1,6 @@ - -# coding: utf8 # cython: infer_types=True # cython: bounds_check=False # cython: profile=True -from __future__ import unicode_literals - cimport cython cimport numpy as np from libc.string cimport memcpy, memset @@ -15,7 +11,8 @@ import numpy import numpy.linalg import struct import srsly -from thinc.neural.util import get_array_module, copy_array +from thinc.api import get_array_module +from thinc.util import copy_array from .span cimport Span from .token cimport Token @@ -28,7 +25,7 @@ from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..attrs import intify_attrs, IDS from ..util import normalize_slice -from ..compat import is_config, copy_reg, pickle, basestring_ +from ..compat import copy_reg, pickle from ..errors import deprecation_warning, models_warning, user_warning from ..errors import Errors, Warnings from .. import util @@ -329,9 +326,7 @@ cdef class Doc: return "".join([t.text_with_ws for t in self]).encode("utf-8") def __str__(self): - if is_config(python3=True): - return self.__unicode__() - return self.__bytes__() + return self.__unicode__() def __repr__(self): return self.__str__() @@ -399,7 +394,9 @@ cdef class Doc: return 0.0 vector = self.vector xp = get_array_module(vector) - return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() @property def has_vector(self): @@ -507,7 +504,7 @@ cdef class Doc: token = &self.c[i] if token.ent_iob == 1: if start == -1: - seq = ["%s|%s" % (t.text, t.ent_iob_) for t in self[i-5:i+5]] + seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] raise ValueError(Errors.E093.format(seq=" ".join(seq))) elif token.ent_iob == 2 or token.ent_iob == 0: if start != -1: @@ -685,7 +682,7 @@ cdef class Doc: cdef np.ndarray[attr_t, ndim=2] output # Handle scalar/list inputs of strings/ints for py_attr_ids # See also #3064 - if isinstance(py_attr_ids, basestring_): + if isinstance(py_attr_ids, str): # Handle inputs like doc.to_array('ORTH') py_attr_ids = [py_attr_ids] elif not hasattr(py_attr_ids, "__iter__"): @@ -774,7 +771,7 @@ cdef class Doc: """ # Handle scalar/list inputs of strings/ints for py_attr_ids # See also #3064 - if isinstance(attrs, basestring_): + if isinstance(attrs, str): # Handle inputs like doc.to_array('ORTH') attrs = [attrs] elif not hasattr(attrs, "__iter__"): @@ -813,7 +810,7 @@ cdef class Doc: if attr_ids[j] != TAG: Token.set_struct_attr(token, attr_ids[j], array[i, j]) # Set flags - self.is_parsed = bool(self.is_parsed or HEAD in attrs or DEP in attrs) + self.is_parsed = bool(self.is_parsed or HEAD in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) # If document is parsed, set children if self.is_parsed: diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd index 22844454a..9510875c9 100644 --- a/spacy/tokens/morphanalysis.pxd +++ b/spacy/tokens/morphanalysis.pxd @@ -5,5 +5,5 @@ from ..structs cimport MorphAnalysisC cdef class MorphAnalysis: cdef readonly Vocab 
vocab - cdef hash_t key + cdef readonly hash_t key cdef MorphAnalysisC c diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index e09870741..ed987f4e4 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -1,15 +1,14 @@ from libc.string cimport memset +cimport numpy as np from ..vocab cimport Vocab from ..typedefs cimport hash_t, attr_t -from ..morphology cimport list_features, check_feature, get_field, tag_to_json - -from ..strings import get_string_id +from ..morphology cimport list_features, check_feature, get_by_field cdef class MorphAnalysis: """Control access to morphological features for a token.""" - def __init__(self, Vocab vocab, features=tuple()): + def __init__(self, Vocab vocab, features=dict()): self.vocab = vocab self.key = self.vocab.morphology.add(features) analysis = self.vocab.morphology.tags.get(self.key) @@ -33,7 +32,7 @@ cdef class MorphAnalysis: def __contains__(self, feature): """Test whether the morphological analysis contains some feature.""" - cdef attr_t feat_id = get_string_id(feature) + cdef attr_t feat_id = self.vocab.strings.as_int(feature) return check_feature(&self.c, feat_id) def __iter__(self): @@ -55,369 +54,28 @@ cdef class MorphAnalysis: def __hash__(self): return self.key - def get(self, unicode field): + def __eq__(self, other): + return self.key == other.key + + def __ne__(self, other): + return self.key != other.key + + def get(self, field): """Retrieve a feature by field.""" - cdef int field_id = self.vocab.morphology._feat_map.attr2field[field] - return self.vocab.strings[get_field(&self.c, field_id)] + cdef attr_t field_id = self.vocab.strings.as_int(field) + cdef np.ndarray results = get_by_field(&self.c, field_id) + return [self.vocab.strings[result] for result in results] def to_json(self): - """Produce a json serializable representation, which will be a list of - strings. + """Produce a json serializable representation as a UD FEATS-style + string. 
""" - return tag_to_json(&self.c) - - @property - def is_base_form(self): - raise NotImplementedError - - @property - def pos(self): - return self.c.pos - - @property - def pos_(self): - return self.vocab.strings[self.c.pos] - - property id: - def __get__(self): - return self.key - - property abbr: - def __get__(self): - return self.c.abbr - - property adp_type: - def __get__(self): - return self.c.adp_type - - property adv_type: - def __get__(self): - return self.c.adv_type - - property animacy: - def __get__(self): - return self.c.animacy - - property aspect: - def __get__(self): - return self.c.aspect - - property case: - def __get__(self): - return self.c.case - - property conj_type: - def __get__(self): - return self.c.conj_type - - property connegative: - def __get__(self): - return self.c.connegative - - property definite: - def __get__(self): - return self.c.definite - - property degree: - def __get__(self): - return self.c.degree - - property derivation: - def __get__(self): - return self.c.derivation - - property echo: - def __get__(self): - return self.c.echo - - property foreign: - def __get__(self): - return self.c.foreign - - property gender: - def __get__(self): - return self.c.gender - - property hyph: - def __get__(self): - return self.c.hyph - - property inf_form: - def __get__(self): - return self.c.inf_form - - property mood: - def __get__(self): - return self.c.mood - - property name_type: - def __get__(self): - return self.c.name_type - - property negative: - def __get__(self): - return self.c.negative - - property noun_type: - def __get__(self): - return self.c.noun_type - - property number: - def __get__(self): - return self.c.number - - property num_form: - def __get__(self): - return self.c.num_form - - property num_type: - def __get__(self): - return self.c.num_type - - property num_value: - def __get__(self): - return self.c.num_value - - property part_form: - def __get__(self): - return self.c.part_form - - property part_type: - def __get__(self): - return self.c.part_type - - property person: - def __get__(self): - return self.c.person - - property polite: - def __get__(self): - return self.c.polite - - property polarity: - def __get__(self): - return self.c.polarity - - property poss: - def __get__(self): - return self.c.poss - - property prefix: - def __get__(self): - return self.c.prefix - - property prep_case: - def __get__(self): - return self.c.prep_case - - property pron_type: - def __get__(self): - return self.c.pron_type - - property punct_side: - def __get__(self): - return self.c.punct_side - - property punct_type: - def __get__(self): - return self.c.punct_type - - property reflex: - def __get__(self): - return self.c.reflex - - property style: - def __get__(self): - return self.c.style - - property style_variant: - def __get__(self): - return self.c.style_variant - - property tense: - def __get__(self): - return self.c.tense - - property typo: - def __get__(self): - return self.c.typo - - property verb_form: - def __get__(self): - return self.c.verb_form - - property voice: - def __get__(self): - return self.c.voice - - property verb_type: - def __get__(self): - return self.c.verb_type - - property abbr_: - def __get__(self): - return self.vocab.strings[self.c.abbr] - - property adp_type_: - def __get__(self): - return self.vocab.strings[self.c.adp_type] - - property adv_type_: - def __get__(self): - return self.vocab.strings[self.c.adv_type] - - property animacy_: - def __get__(self): - return self.vocab.strings[self.c.animacy] - - property 
aspect_: - def __get__(self): - return self.vocab.strings[self.c.aspect] - - property case_: - def __get__(self): - return self.vocab.strings[self.c.case] - - property conj_type_: - def __get__(self): - return self.vocab.strings[self.c.conj_type] - - property connegative_: - def __get__(self): - return self.vocab.strings[self.c.connegative] - - property definite_: - def __get__(self): - return self.vocab.strings[self.c.definite] - - property degree_: - def __get__(self): - return self.vocab.strings[self.c.degree] - - property derivation_: - def __get__(self): - return self.vocab.strings[self.c.derivation] - - property echo_: - def __get__(self): - return self.vocab.strings[self.c.echo] - - property foreign_: - def __get__(self): - return self.vocab.strings[self.c.foreign] - - property gender_: - def __get__(self): - return self.vocab.strings[self.c.gender] - - property hyph_: - def __get__(self): - return self.vocab.strings[self.c.hyph] - - property inf_form_: - def __get__(self): - return self.vocab.strings[self.c.inf_form] - - property name_type_: - def __get__(self): - return self.vocab.strings[self.c.name_type] - - property negative_: - def __get__(self): - return self.vocab.strings[self.c.negative] - - property mood_: - def __get__(self): - return self.vocab.strings[self.c.mood] - - property number_: - def __get__(self): - return self.vocab.strings[self.c.number] - - property num_form_: - def __get__(self): - return self.vocab.strings[self.c.num_form] - - property num_type_: - def __get__(self): - return self.vocab.strings[self.c.num_type] - - property num_value_: - def __get__(self): - return self.vocab.strings[self.c.num_value] - - property part_form_: - def __get__(self): - return self.vocab.strings[self.c.part_form] - - property part_type_: - def __get__(self): - return self.vocab.strings[self.c.part_type] - - property person_: - def __get__(self): - return self.vocab.strings[self.c.person] - - property polite_: - def __get__(self): - return self.vocab.strings[self.c.polite] - - property polarity_: - def __get__(self): - return self.vocab.strings[self.c.polarity] - - property poss_: - def __get__(self): - return self.vocab.strings[self.c.poss] - - property prefix_: - def __get__(self): - return self.vocab.strings[self.c.prefix] - - property prep_case_: - def __get__(self): - return self.vocab.strings[self.c.prep_case] - - property pron_type_: - def __get__(self): - return self.vocab.strings[self.c.pron_type] - - property punct_side_: - def __get__(self): - return self.vocab.strings[self.c.punct_side] - - property punct_type_: - def __get__(self): - return self.vocab.strings[self.c.punct_type] - - property reflex_: - def __get__(self): - return self.vocab.strings[self.c.reflex] - - property style_: - def __get__(self): - return self.vocab.strings[self.c.style] - - property style_variant_: - def __get__(self): - return self.vocab.strings[self.c.style_variant] - - property tense_: - def __get__(self): - return self.vocab.strings[self.c.tense] - - property typo_: - def __get__(self): - return self.vocab.strings[self.c.typo] - - property verb_form_: - def __get__(self): - return self.vocab.strings[self.c.verb_form] - - property voice_: - def __get__(self): - return self.vocab.strings[self.c.voice] - - property verb_type_: - def __get__(self): - return self.vocab.strings[self.c.verb_type] + morph_string = self.vocab.strings[self.c.key] + if morph_string == self.vocab.morphology.EMPTY_MORPH: + return "" + return morph_string + + def to_dict(self): + """Produce a dict representation. 
+ """ + return self.vocab.morphology.feats_to_dict(self.to_json()) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 24857790b..d6b50b5f4 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,12 +1,10 @@ -# coding: utf8 from __future__ import unicode_literals - cimport numpy as np from libc.math cimport sqrt import numpy import numpy.linalg -from thinc.neural.util import get_array_module +from thinc.api import get_array_module from collections import defaultdict from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix @@ -20,7 +18,6 @@ from ..lexeme cimport Lexeme from ..symbols cimport dep from ..util import normalize_slice -from ..compat import is_config, basestring_ from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning from ..errors import deprecation_warning from .underscore import Underscore, get_ext_args @@ -110,9 +107,9 @@ cdef class Span: self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) else: self.end_char = 0 - if isinstance(label, basestring_): + if isinstance(label, str): label = doc.vocab.strings.add(label) - if isinstance(kb_id, basestring_): + if isinstance(kb_id, str): kb_id = doc.vocab.strings.add(kb_id) if label not in doc.vocab.strings: raise ValueError(Errors.E084.format(label=label)) @@ -127,22 +124,27 @@ cdef class Span: return False else: return True - # Eq + # < if op == 0: return self.start_char < other.start_char + # <= elif op == 1: return self.start_char <= other.start_char + # == elif op == 2: - return self.start_char == other.start_char and self.end_char == other.end_char + return (self.doc, self.start_char, self.end_char, self.label, self.kb_id) == (other.doc, other.start_char, other.end_char, other.label, other.kb_id) + # != elif op == 3: - return self.start_char != other.start_char or self.end_char != other.end_char + return (self.doc, self.start_char, self.end_char, self.label, self.kb_id) != (other.doc, other.start_char, other.end_char, other.label, other.kb_id) + # > elif op == 4: return self.start_char > other.start_char + # >= elif op == 5: return self.start_char >= other.start_char def __hash__(self): - return hash((self.doc, self.label, self.start_char, self.end_char)) + return hash((self.doc, self.start_char, self.end_char, self.label, self.kb_id)) def __len__(self): """Get the number of tokens in the span. @@ -157,9 +159,7 @@ cdef class Span: return self.end - self.start def __repr__(self): - if is_config(python3=True): - return self.text - return self.text.encode("utf-8") + return self.text def __getitem__(self, object i): """Get a `Token` or a `Span` object @@ -478,7 +478,7 @@ cdef class Span: @property def tensor(self): """The span's slice of the doc's tensor. - + RETURNS (ndarray[ndim=2, dtype='float32']): A 2D numpy or cupy array representing the span's semantics. 
""" diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index cbca55c40..0d25974f3 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -43,6 +43,8 @@ cdef class Token: return token.pos elif feat_name == TAG: return token.tag + elif feat_name == MORPH: + return token.morph elif feat_name == DEP: return token.dep elif feat_name == HEAD: @@ -73,6 +75,8 @@ cdef class Token: token.pos = value elif feat_name == TAG: token.tag = value + elif feat_name == MORPH: + token.morph = value elif feat_name == DEP: token.dep = value elif feat_name == HEAD: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8b15a4223..379da6c77 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,7 +1,4 @@ # cython: infer_types=True -# coding: utf8 -from __future__ import unicode_literals - from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free # Compiler crashes on memory view coercion without this. Should report bug. @@ -10,7 +7,7 @@ cimport numpy as np np.import_array() import numpy -from thinc.neural.util import get_array_module +from thinc.api import get_array_module from ..typedefs cimport hash_t from ..lexeme cimport Lexeme @@ -23,7 +20,6 @@ from ..symbols cimport conj from .. import parts_of_speech from .. import util -from ..compat import is_config from ..errors import Errors, Warnings, user_warning, models_warning from .underscore import Underscore, get_ext_args from .morphanalysis cimport MorphAnalysis @@ -122,9 +118,7 @@ cdef class Token: return self.text.encode('utf8') def __str__(self): - if is_config(python3=True): - return self.__unicode__() - return self.__bytes__() + return self.__unicode__() def __repr__(self): return self.__str__() @@ -223,6 +217,14 @@ cdef class Token: def morph(self): return MorphAnalysis.from_id(self.vocab, self.c.morph) + property morph_: + def __get__(self): + return str(MorphAnalysis.from_id(self.vocab, self.c.morph)) + + def __set__(self, features): + cdef hash_t key = self.vocab.morphology.add(features) + self.c.morph = key + @property def lex_id(self): """RETURNS (int): Sequential ID of the token's lexical type.""" diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index b36fe9294..328851945 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import functools import copy diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index bd5b38958..b43814268 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -2,7 +2,9 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t from libc.stdint cimport uint8_t +ctypedef float weight_t ctypedef uint64_t hash_t +ctypedef uint64_t class_t ctypedef char* utf8_t ctypedef uint64_t attr_t ctypedef uint64_t flags_t diff --git a/spacy/util.py b/spacy/util.py index 9b96b2f5e..465b9645e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,14 +1,13 @@ -# coding: utf8 -from __future__ import unicode_literals, print_function - import os import importlib +import importlib.util import re from pathlib import Path import random -from collections import OrderedDict -from thinc.neural._classes.model import Model -from thinc.neural.ops import NumpyOps +from typing import List +import thinc +import thinc.config +from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu import functools import itertools import numpy.random @@ -16,10 +15,6 @@ import srsly import catalogue import sys -try: - import jsonschema -except ImportError: - 
jsonschema = None try: import cupy.random @@ -27,16 +22,13 @@ except ImportError: cupy = None from .symbols import ORTH -from .compat import cupy, CudaStream, path2str, basestring_, unicode_ -from .compat import import_file -from .errors import Errors, Warnings, deprecation_warning +from .compat import cupy, CudaStream +from .errors import Errors, Warnings, deprecation_warning, user_warning - -_data_path = Path(__file__).parent / "data" _PRINT_ENV = False -class registry(object): +class registry(thinc.registry): languages = catalogue.create("spacy", "languages", entry_points=True) architectures = catalogue.create("spacy", "architectures", entry_points=True) lookups = catalogue.create("spacy", "lookups", entry_points=True) @@ -71,7 +63,7 @@ def get_lang_class(lang): return registry.languages.get(lang) else: try: - module = importlib.import_module(".lang.%s" % lang, "spacy") + module = importlib.import_module(f".lang.{lang}", "spacy") except ImportError as err: raise ImportError(Errors.E048.format(lang=lang, err=err)) set_lang_class(lang, getattr(module, module.__all__[0])) @@ -87,27 +79,6 @@ def set_lang_class(name, cls): registry.languages.register(name, func=cls) -def get_data_path(require_exists=True): - """Get path to spaCy data directory. - - require_exists (bool): Only return path if it exists, otherwise None. - RETURNS (Path or None): Data path or None. - """ - if not require_exists: - return _data_path - else: - return _data_path if _data_path.exists() else None - - -def set_data_path(path): - """Set path to spaCy data directory. - - path (unicode or Path): Path to new data directory. - """ - global _data_path - _data_path = ensure_path(path) - - def make_layer(arch_config): arch_func = registry.architectures.get(arch_config["arch"]) return arch_func(arch_config["config"]) @@ -119,7 +90,7 @@ def ensure_path(path): path: Anything. If string, it's converted to Path. RETURNS: Path or original argument. """ - if isinstance(path, basestring_): + if isinstance(path, str): return Path(path) else: return path @@ -138,7 +109,7 @@ def load_language_data(path): path = path.with_suffix(path.suffix + ".gz") if path.exists(): return srsly.read_gzip_json(path) - raise ValueError(Errors.E160.format(path=path2str(path))) + raise ValueError(Errors.E160.format(path=path)) def get_module_path(module): @@ -148,18 +119,13 @@ def get_module_path(module): def load_model(name, **overrides): - """Load a model from a shortcut link, package or data path. + """Load a model from a package or data path. - name (unicode): Package name, shortcut link or model path. + name (unicode): Package name or model path. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with the loaded model. 
""" - data_path = get_data_path() - if not data_path or not data_path.exists(): - raise IOError(Errors.E049.format(path=path2str(data_path))) - if isinstance(name, basestring_): # in data dir / shortcut - if name in set([d.name for d in data_path.iterdir()]): - return load_model_from_link(name, **overrides) + if isinstance(name, str): # name or string path if is_package(name): # installed as package return load_model_from_package(name, **overrides) if Path(name).exists(): # path to model data directory @@ -169,16 +135,6 @@ def load_model(name, **overrides): raise IOError(Errors.E050.format(name=name)) -def load_model_from_link(name, **overrides): - """Load a model from a shortcut link, or directory in spaCy data path.""" - path = get_data_path() / name / "__init__.py" - try: - cls = import_file(name, path) - except AttributeError: - raise IOError(Errors.E051.format(name=name)) - return cls.load(**overrides) - - def load_model_from_package(name, **overrides): """Load a model from an installed package.""" cls = importlib.import_module(name) @@ -221,13 +177,30 @@ def load_model_from_init_py(init_file, **overrides): """ model_path = Path(init_file).parent meta = get_model_meta(model_path) - data_dir = "%s_%s-%s" % (meta["lang"], meta["name"], meta["version"]) + data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}" data_path = model_path / data_dir if not model_path.exists(): - raise IOError(Errors.E052.format(path=path2str(data_path))) + raise IOError(Errors.E052.format(path=data_path)) return load_model_from_path(data_path, meta, **overrides) +def load_from_config(path, create_objects=False): + """Load a Thinc-formatted config file, optionally filling in objects where + the config references registry entries. See "Thinc config files" for details. + + path (unicode or Path): Path to the config file + create_objects (bool): Whether to automatically create objects when the config + references registry entries. Defaults to False. + + RETURNS (dict): The objects from the config file. + """ + config = thinc.config.Config().from_disk(path) + if create_objects: + return registry.make_from_config(config, validate=True) + else: + return config + + def get_model_meta(path): """Get model meta.json from a directory path and validate its contents. 
def get_model_meta(path):
    """Get model meta.json from a directory path and validate its contents.
@@ -236,7 +209,7 @@
    """
    model_path = ensure_path(path)
    if not model_path.exists():
-        raise IOError(Errors.E052.format(path=path2str(model_path)))
+        raise IOError(Errors.E052.format(path=model_path))
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        raise IOError(Errors.E053.format(path=meta_path))
@@ -302,9 +275,10 @@ def get_component_name(component):
 def get_cuda_stream(require=False, non_blocking=True):
+    ops = get_current_ops()
     if CudaStream is None:
         return None
-    elif isinstance(Model.ops, NumpyOps):
+    elif isinstance(ops, NumpyOps):
         return None
     else:
         return CudaStream(non_blocking=non_blocking)
@@ -319,6 +293,14 @@ def get_async(stream, numpy_array):
     return array
+def eg2doc(example):
+    """Get a Doc object from an Example (or if it's a Doc, use it directly)"""
+    # Put the import here to avoid circular import problems
+    from .tokens.doc import Doc
+
+    return example if isinstance(example, Doc) else example.doc
+
+
 def env_opt(name, default=None):
     if type(default) is float:
         type_convert = float
@@ -417,7 +399,7 @@ def update_exc(base_exceptions, *addition_dicts):
     exc = dict(base_exceptions)
     for additions in addition_dicts:
         for orth, token_attrs in additions.items():
-            if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
+            if not all(isinstance(attr[ORTH], str) for attr in token_attrs):
                 raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
             described_orth = "".join(attr[ORTH] for attr in token_attrs)
             if orth != described_orth:
@@ -537,31 +519,27 @@ def decaying(start, stop, decay):
         curr -= decay
-def minibatch_by_words(items, size, tuples=True, count_words=len):
+def minibatch_by_words(examples, size, tuples=True, count_words=len):
     """Create minibatches of a given number of words."""
     if isinstance(size, int):
         size_ = itertools.repeat(size)
+    elif isinstance(size, List):
+        size_ = iter(size)
     else:
         size_ = size
-    items = iter(items)
+    examples = iter(examples)
     while True:
         batch_size = next(size_)
         batch = []
         while batch_size >= 0:
             try:
-                if tuples:
-                    doc, gold = next(items)
-                else:
-                    doc = next(items)
+                example = next(examples)
             except StopIteration:
                 if batch:
                     yield batch
                 return
-            batch_size -= count_words(doc)
-            if tuples:
-                batch.append((doc, gold))
-            else:
-                batch.append(doc)
+            batch_size -= count_words(example.doc)
+            batch.append(example)
         if batch:
             yield batch
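To make the refactored `minibatch_by_words` concrete: it now expects `Example`-like objects and counts words via `count_words(example.doc)`. A rough sketch of the batching behavior, using a made-up stand-in class rather than the real `Example`, and assuming the integer `size` branch takes effect (the `elif` above). Note a batch can overshoot its word budget by one example, because the budget check happens before the subtraction:

```python
from spacy.util import minibatch_by_words

class FakeExample:
    """Stand-in for spaCy's Example: anything with a len()-able .doc works here."""
    def __init__(self, n_words):
        self.doc = ["w"] * n_words  # pretends to be a Doc of n_words tokens

examples = [FakeExample(n) for n in (5, 9, 4, 12, 3)]
for batch in minibatch_by_words(examples, size=20):
    print([len(eg.doc) for eg in batch])
# -> [5, 9, 4, 12], then [3]: the 12-word example still lands in the first
#    batch because the budget was not yet negative when it was drawn.
```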
@@ -618,7 +596,7 @@ def filter_spans(spans):
 def to_bytes(getters, exclude):
-    serialized = OrderedDict()
+    serialized = {}
     for key, getter in getters.items():
         # Split to support file names like meta.json
         if key.split(".")[0] not in exclude:
@@ -655,6 +633,20 @@ def from_disk(path, readers, exclude):
     return path
+def import_file(name, loc):
+    """Import module from a file. Used to load models from a directory.
+
+    name (unicode): Name of module to load.
+    loc (unicode / Path): Path to the file.
+    RETURNS: The loaded module.
+    """
+    loc = str(loc)
+    spec = importlib.util.spec_from_file_location(name, str(loc))
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
 def minify_html(html):
     """Perform a template-specific, rudimentary HTML minification for displaCy.
     Disclaimer: NOT a general-purpose solution, only removes indentation and
@@ -681,17 +673,7 @@ def escape_html(text):
 def use_gpu(gpu_id):
-    try:
-        import cupy.cuda.device
-    except ImportError:
-        return None
-    from thinc.neural.ops import CupyOps
-
-    device = cupy.cuda.device.Device(gpu_id)
-    device.use()
-    Model.ops = CupyOps()
-    Model.Ops = CupyOps
-    return device
+    return require_gpu(gpu_id)
 def fix_random_seed(seed=0):
@@ -701,43 +683,6 @@
     cupy.random.seed(seed)
-def get_json_validator(schema):
-    # We're using a helper function here to make it easier to change the
-    # validator that's used (e.g. different draft implementation), without
-    # having to change it all across the codebase.
-    # TODO: replace with (stable) Draft6Validator, if available
-    if jsonschema is None:
-        raise ValueError(Errors.E136)
-    return jsonschema.Draft4Validator(schema)
-
-
-def validate_schema(schema):
-    """Validate a given schema. This just checks if the schema itself is valid."""
-    validator = get_json_validator(schema)
-    validator.check_schema(schema)
-
-
-def validate_json(data, validator):
-    """Validate data against a given JSON schema (see https://json-schema.org).
-
-    data: JSON-serializable data to validate.
-    validator (jsonschema.DraftXValidator): The validator.
-    RETURNS (list): A list of error messages, if available.
-    """
-    errors = []
-    for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
-        if err.path:
-            err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
-        else:
-            err_path = ""
-        msg = err.message + " " + err_path
-        if err.context:  # Error has suberrors, e.g. if schema uses anyOf
-            suberrs = ["  - {}".format(suberr.message) for suberr in err.context]
-            msg += ":\n{}".format("".join(suberrs))
-        errors.append(msg)
-    return errors
-
-
 def get_serialization_exclude(serializers, exclude, kwargs):
     """Helper function to validate serialization args and manage transition from
     keyword arguments (pre v2.1) to exclude argument.
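A caller-side note on the `use_gpu` change above: the old helper returned a `cupy` `Device` object (or `None` when CuPy was missing), while Thinc's `require_gpu`, as far as I can tell from Thinc 8, returns a bool and raises if no GPU is available. A minimal sketch, assuming a GPU-enabled install (model name illustrative):

```python
import spacy
from spacy.util import fix_random_seed, use_gpu

fix_random_seed(0)  # seeds random, numpy.random and, if installed, cupy.random
use_gpu(0)  # now just thinc.api.require_gpu(0); no cupy Device comes back
# Activate the GPU before loading models so their parameters land on the device.
nlp = spacy.load("en_core_web_sm")
```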
@@ -785,3 +730,39 @@ class DummyTokenizer(object): def from_disk(self, _path, **kwargs): return self + + +def link_vectors_to_models(vocab): + vectors = vocab.vectors + if vectors.name is None: + vectors.name = VECTORS_KEY + if vectors.data.size != 0: + user_warning(Warnings.W020.format(shape=vectors.data.shape)) + for word in vocab: + if word.orth in vectors.key2row: + word.rank = vectors.key2row[word.orth] + else: + word.rank = 0 + + +VECTORS_KEY = "spacy_pretrained_vectors" + + +def create_default_optimizer(): + ops = get_current_ops() + learn_rate = env_opt("learn_rate", 0.001) + beta1 = env_opt("optimizer_B1", 0.9) + beta2 = env_opt("optimizer_B2", 0.999) + eps = env_opt("optimizer_eps", 1e-8) + L2 = env_opt("L2_penalty", 1e-6) + grad_clip = env_opt("grad_norm_clip", 1.0) + optimizer = Adam( + learn_rate, + L2=L2, + beta1=beta1, + beta2=beta2, + eps=eps, + ops=ops, + grad_clip=grad_clip, + ) + return optimizer diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 6b26bf123..0ade8b280 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,21 +1,15 @@ -# coding: utf8 -from __future__ import unicode_literals - cimport numpy as np from cython.operator cimport dereference as deref from libcpp.set cimport set as cppset import functools import numpy -from collections import OrderedDict import srsly -from thinc.neural.util import get_array_module -from thinc.neural._classes.model import Model +from thinc.api import get_array_module, get_current_ops from .strings cimport StringStore from .strings import get_string_id -from .compat import basestring_, path2str from .errors import Errors from . import util @@ -74,7 +68,7 @@ cdef class Vectors: shape = (0,0) data = numpy.zeros(shape, dtype="f") self.data = data - self.key2row = OrderedDict() + self.key2row = {} if self.data is not None: self._unset = cppset[int]({i for i in range(self.data.shape[0])}) else: @@ -283,7 +277,11 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#add """ - key = get_string_id(key) + # use int for all keys and rows in key2row for more efficient access + # and serialization + key = int(get_string_id(key)) + if row is not None: + row = int(row) if row is None and key in self.key2row: row = self.key2row[key] elif row is None: @@ -339,7 +337,7 @@ cdef class Vectors: sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1] scores[i:i+batch_size] = scores[sorted_index] best_rows[i:i+batch_size] = best_rows[sorted_index] - + xp = get_array_module(self.data) # Round values really close to 1 or -1 scores = xp.around(scores, decimals=4, out=scores) @@ -347,7 +345,7 @@ cdef class Vectors: scores = xp.clip(scores, a_min=-1, a_max=1, out=scores) row2key = {row: key for key, row in self.key2row.items()} keys = xp.asarray( - [[row2key[row] for row in best_rows[i] if row in row2key] + [[row2key[row] for row in best_rows[i] if row in row2key] for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) @@ -372,7 +370,7 @@ cdef class Vectors: break else: raise IOError(Errors.E061.format(filename=path)) - bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype) + bin_loc = path / f"vectors.{dims}.{dtype}.bin" xp = get_array_module(self.data) self.data = None with bin_loc.open("rb") as file_: @@ -402,10 +400,10 @@ cdef class Vectors: save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) else: save_array = lambda arr, file_: xp.save(file_, arr) - serializers = OrderedDict(( - ("vectors", lambda p: 
save_array(self.data, p.open("wb"))), - ("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) - )) + serializers = { + "vectors": lambda p: save_array(self.data, p.open("wb")), + "key2row": lambda p: srsly.write_msgpack(p, self.key2row) + } return util.to_disk(path, serializers, []) def from_disk(self, path, **kwargs): @@ -431,15 +429,15 @@ cdef class Vectors: self.add(key, row=i) def load_vectors(path): - xp = Model.ops.xp + ops = get_current_ops() if path.exists(): - self.data = xp.load(str(path)) + self.data = ops.xp.load(str(path)) - serializers = OrderedDict(( - ("key2row", load_key2row), - ("keys", load_keys), - ("vectors", load_vectors), - )) + serializers = { + "key2row": load_key2row, + "keys": load_keys, + "vectors": load_vectors, + } util.from_disk(path, serializers, []) return self @@ -457,10 +455,10 @@ cdef class Vectors: else: return srsly.msgpack_dumps(self.data) - serializers = OrderedDict(( - ("key2row", lambda: srsly.msgpack_dumps(self.key2row)), - ("vectors", serialize_weights) - )) + serializers = { + "key2row": lambda: srsly.msgpack_dumps(self.key2row), + "vectors": serialize_weights + } return util.to_bytes(serializers, []) def from_bytes(self, data, **kwargs): @@ -478,9 +476,9 @@ cdef class Vectors: else: self.data = srsly.msgpack_loads(b) - deserializers = OrderedDict(( - ("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))), - ("vectors", deserialize_weights) - )) + deserializers = { + "key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)), + "vectors": deserialize_weights + } util.from_bytes(data, deserializers, []) return self diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3cf0095ee..a1929559f 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,11 +1,8 @@ -# coding: utf8 # cython: profile=True -from __future__ import unicode_literals from libc.string cimport memcpy import srsly -from collections import OrderedDict -from thinc.neural.util import get_array_module +from thinc.api import get_array_module from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme @@ -14,12 +11,12 @@ from .tokens.token cimport Token from .attrs cimport PROB, LANG, ORTH, TAG, POS from .structs cimport SerializedLexemeC -from .compat import copy_reg, basestring_ +from .compat import copy_reg from .errors import Errors from .lemmatizer import Lemmatizer from .attrs import intify_attrs, NORM from .vectors import Vectors -from ._ml import link_vectors_to_models +from .util import link_vectors_to_models from .lookups import Lookups from . import util @@ -335,14 +332,14 @@ cdef class Vocab: """Retrieve a vector for a word in the vocabulary. Words can be looked up by string or int ID. If no vectors data is loaded, ValueError is raised. - - If `minn` is defined, then the resulting vector uses Fasttext's + + If `minn` is defined, then the resulting vector uses Fasttext's subword features by average over ngrams of `orth`. orth (int / unicode): The hash value of a word, or its unicode string. - minn (int): Minimum n-gram length used for Fasttext's ngram computation. + minn (int): Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. - maxn (int): Maximum n-gram length used for Fasttext's ngram computation. + maxn (int): Maximum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. RETURNS (numpy.ndarray): A word vector. Size and shape determined by the `vocab.vectors` instance. 
Usually, a @@ -350,7 +347,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#get_vector """ - if isinstance(orth, basestring_): + if isinstance(orth, str): orth = self.strings.add(orth) word = self[orth].orth_ if orth in self.vectors.key2row: @@ -397,7 +394,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#set_vector """ - if isinstance(orth, basestring_): + if isinstance(orth, str): orth = self.strings.add(orth) if self.vectors.is_full and orth not in self.vectors: new_rows = max(100, int(self.vectors.shape[0]*1.3)) @@ -419,7 +416,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#has_vector """ - if isinstance(orth, basestring_): + if isinstance(orth, str): orth = self.strings.add(orth) return orth in self.vectors @@ -488,12 +485,12 @@ cdef class Vocab: else: return self.vectors.to_bytes() - getters = OrderedDict(( - ("strings", lambda: self.strings.to_bytes()), - ("lexemes", lambda: self.lexemes_to_bytes()), - ("vectors", deserialize_vectors), - ("lookups", lambda: self.lookups.to_bytes()) - )) + getters = { + "strings": lambda: self.strings.to_bytes(), + "lexemes": lambda: self.lexemes_to_bytes(), + "vectors": deserialize_vectors, + "lookups": lambda: self.lookups.to_bytes() + } exclude = util.get_serialization_exclude(getters, exclude, kwargs) return util.to_bytes(getters, exclude) @@ -512,12 +509,12 @@ cdef class Vocab: else: return self.vectors.from_bytes(b) - setters = OrderedDict(( - ("strings", lambda b: self.strings.from_bytes(b)), - ("lexemes", lambda b: self.lexemes_from_bytes(b)), - ("vectors", lambda b: serialize_vectors(b)), - ("lookups", lambda b: self.lookups.from_bytes(b)) - )) + setters = { + "strings": lambda b: self.strings.from_bytes(b), + "lexemes": lambda b: self.lexemes_from_bytes(b), + "vectors": lambda b: serialize_vectors(b), + "lookups": lambda b: self.lookups.from_bytes(b) + } exclude = util.get_serialization_exclude(setters, exclude, kwargs) util.from_bytes(bytes_data, setters, exclude) if self.vectors.name is not None: diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 50ba0e3d9..c9c7a010c 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -677,50 +677,3 @@ of one entity) or when merging spans with | ----------- | -------- | -------------------- | | `spans` | iterable | The spans to filter. | | **RETURNS** | list | The filtered spans. | - -## Compatibility functions {#compat source="spacy/compaty.py"} - -All Python code is written in an **intersection of Python 2 and Python 3**. This -is easy in Cython, but somewhat ugly in Python. Logic that deals with Python or -platform compatibility only lives in `spacy.compat`. To distinguish them from -the builtin functions, replacement functions are suffixed with an underscore, -e.g. `unicode_`. - -> #### Example -> -> ```python -> from spacy.compat import unicode_ -> -> compatible_unicode = unicode_("hello world") -> ``` - -| Name | Python 2 | Python 3 | -| -------------------- | ---------------------------------- | ----------- | -| `compat.bytes_` | `str` | `bytes` | -| `compat.unicode_` | `unicode` | `str` | -| `compat.basestring_` | `basestring` | `str` | -| `compat.input_` | `raw_input` | `input` | -| `compat.path2str` | `str(path)` with `.decode('utf8')` | `str(path)` | - -### compat.is_config {#compat.is_config tag="function"} - -Check if a specific configuration of Python version and operating system matches -the user's setup. Mostly used to display targeted error messages. 
-
-> #### Example
->
-> ```python
-> from spacy.compat import is_config
->
-> if is_config(python2=True, windows=True):
->     print("You are using Python 2 on Windows.")
-> ```
-
-| Name        | Type | Description                                                       |
-| ----------- | ---- | ----------------------------------------------------------------- |
-| `python2`   | bool | spaCy is executed with Python 2.x.                                |
-| `python3`   | bool | spaCy is executed with Python 3.x.                                |
-| `windows`   | bool | spaCy is executed on Windows.                                     |
-| `linux`     | bool | spaCy is executed on Linux.                                       |
-| `osx`       | bool | spaCy is executed on OS X or macOS.                               |
-| **RETURNS** | bool | Whether the specified configuration matches the user's platform. |
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 17fd8fa7b..6ea2b0721 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -8,9 +8,9 @@ menu:
   - ['Changelog', 'changelog']
 ---
-spaCy is compatible with **64-bit CPython 2.7 / 3.5+** and runs on
-**Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are
-available over [pip](https://pypi.python.org/pypi/spacy) and
+spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
+**macOS/OS X** and **Windows**. The latest spaCy releases are available over
+[pip](https://pypi.python.org/pypi/spacy) and
 [conda](https://anaconda.org/conda-forge/spacy).
 > #### 📖 Looking for the old docs?
@@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and
 > possible, the new docs also include notes on features that have changed in
 > v2.0, and features that were introduced in the new version.
+<Infobox variant="warning" title="Python 3.8 support">
+
+We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8,
+as we're still waiting for our CI providers and other tooling to support it.
+This means that in order to run spaCy on Python 3.8, you'll need
+[a compiler installed](#source) and compile the library and its Cython
+dependencies locally. If this is causing problems for you, the easiest solution
+is to **use Python 3.7** in the meantime.
+
+</Infobox>
+
 ## Quickstart {hidden="true"}
 import QuickstartInstall from 'widgets/quickstart-install.js'
@@ -195,14 +206,7 @@
 Install a version of the
 [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or
 [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
-that matches the version that was used to compile your Python interpreter. For
-official distributions these are:
-
-| Distribution | Version            |
-| ------------ | ------------------ |
-| Python 2.7   | Visual Studio 2008 |
-| Python 3.4   | Visual Studio 2010 |
-| Python 3.5+  | Visual Studio 2015 |
+that matches the version that was used to compile your Python interpreter.
 ### Run tests {#run-tests}
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index b7b840999..7382f2b8c 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -367,7 +367,7 @@ tokens and a conditional message based on the document length.
 import spacy
 def my_component(doc):
-    print("After tokenization, this doc has {} tokens.".format(len(doc)))
+    print(f"After tokenization, this doc has {len(doc)} tokens.")
     print("The part-of-speech tags are:", [token.pos_ for token in doc])
     if len(doc) < 10:
         print("This is a pretty short document.")
@@ -602,7 +602,7 @@ There are three main types of extensions, which can be defined using the
 [these examples](/usage/examples#custom-components-attr-methods).
   ```python
-  Doc.set_extension("hello", method=lambda doc, name: "Hi {}!".format(name))
+  Doc.set_extension("hello", method=lambda doc, name: f"Hi {name}!")
   assert doc._.hello("Bob") == "Hi Bob!"
   ```
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index cae4f074a..f8866aec1 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -1096,6 +1096,33 @@ with the patterns. When you load the model back in, all pipeline components
 will be restored and deserialized – including the entity ruler. This lets you
 ship powerful model packages with binary weights _and_ rules included!
+### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
+
+When using a large number of **phrase patterns** (roughly > 10000) it's useful to
+understand how the `add_patterns` function of the EntityRuler works. For each
+**phrase pattern**, the EntityRuler calls the nlp object to construct a doc object.
+This matters if you add the EntityRuler at the end of an existing pipeline with, for
+example, a POS tagger and want to extract matches based on the pattern's POS signature.
+
+In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
+
+Running the full language pipeline across every pattern in a large list scales
+linearly and can therefore take a long time on a large number of phrase patterns.
+
+As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on
+all phrase patterns, resulting in about a 10x-20x speed up with 5,000-100,000 phrase
+patterns respectively.
+
+Even with this speedup (but especially if you're using an older version) the
+`add_patterns` function can still take a long time.
+
+An easy workaround to make this function run faster is disabling the other language
+pipes while adding the phrase patterns.
+
+```python
+entityruler = EntityRuler(nlp)
+patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)]
+
+other_pipes = [p for p in nlp.pipe_names if p != "tagger"]
+with nlp.disable_pipes(*other_pipes):
+    entityruler.add_patterns(patterns)
+```
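The `phrase_matcher_attr="POS"` setup mentioned above is not shown; a sketch of roughly what it could look like follows (model name and patterns are illustrative). With `POS` matching, the pattern text only contributes its part-of-speech sequence, so the single pattern below would match any ADJ+NOUN pair:

```python
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")  # needs a tagger so patterns get POS tags
ruler = EntityRuler(nlp, phrase_matcher_attr="POS")
# The phrase pattern is run through the pipeline; only its POS signature is kept.
ruler.add_patterns([{"label": "ADJ_NOUN", "pattern": "fresh apples"}])
nlp.add_pipe(ruler)

doc = nlp("They sell cheap oranges at the old market.")
print([(ent.text, ent.label_) for ent in doc.ents])  # e.g. ("cheap oranges", ...)
```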
+
 ## Combining models and rules {#models-rules}
 You can combine statistical and rule-based components in a variety of ways.
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 5a3a95a53..479bdd264 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -304,12 +304,6 @@
 print(doc.vocab.strings["coffee"])  # 3197928453018144401
 print(doc.vocab.strings[3197928453018144401])  # 'coffee'
 ```
-> #### What does 'L' at the end of a hash mean?
->
-> If you return a hash value in the **Python 2 interpreter**, it'll show up as
-> `3197928453018144401L`. The `L` just means "long integer" – it's **not**
-> actually a part of the hash value.
-
 Now that all strings are encoded, the entries in the vocabulary **don't need
 to include the word text** themselves. Instead, they can look it up in the
 `StringStore` via its hash value. Each entry in the vocabulary, also called
@@ -857,17 +851,16 @@ def put_spans_around_tokens(doc):
     and you can calculate what you need, e.g. <br />, <p> etc.)
     """
     output = []
-    html = '<span class="{classes}">{word}</span>{space}'
     for token in doc:
         if token.is_space:
             output.append(token.text)
         else:
-            classes = "pos-{} dep-{}".format(token.pos_, token.dep_)
-            output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
+            classes = f"pos-{token.pos_} dep-{token.dep_}"
+            output.append(f'<span class="{classes}">{token.text}</span>{token.whitespace_}')
     string = "".join(output)
     string = string.replace("\n", "")
     string = string.replace("\t", "    ")
-    return "<pre>{}</pre>".format(string)
+    return f"<pre>{string}</pre>"
 nlp = spacy.load("en_core_web_sm")
diff --git a/website/meta/universe.json b/website/meta/universe.json
index cf5978edc..e0e48a916 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -999,6 +999,17 @@
       "author": "Graphbrain",
       "category": ["standalone"]
     },
+    {
+      "type": "education",
+      "id": "nostarch-nlp-python",
+      "title": "Natural Language Processing Using Python",
+      "slogan": "No Starch Press, 2020",
+      "description": "Natural Language Processing Using Python is an introduction to natural language processing (NLP), the task of converting human language into data that a computer can process. The book uses spaCy, a leading Python library for NLP, to guide readers through common NLP tasks related to generating and understanding human language with code. It addresses problems like understanding a user's intent, continuing a conversation with a human, and maintaining the state of a conversation.",
+      "cover": "https://nostarch.com/sites/default/files/styles/uc_product_full/public/NaturalLanguageProcessing_final_v01.jpg",
+      "url": "https://nostarch.com/NLPPython",
+      "author": "Yuli Vasiliev",
+      "category": ["books"]
+    },
     {
       "type": "education",
       "id": "oreilly-python-ds",