From e48a09df4eb580bd4ffb2270d9034dc69e081742 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 11 Nov 2019 17:35:27 +0100 Subject: [PATCH 001/187] Example class for training data (#4543) * OrigAnnot class instead of gold.orig_annot list of zipped tuples * from_orig to replace from_annot_tuples * rename to RawAnnot * some unit tests for GoldParse creation and internal format * removing orig_annot and switching to lists instead of tuple * rewriting tuples to use RawAnnot (+ debug statements, WIP) * fix pop() changing the data * small fixes * pop-append fixes * return RawAnnot for existing GoldParse to have uniform interface * clean up imports * fix merge_sents * add unit test for 4402 with new structure (not working yet) * introduce DocAnnot * typo fixes * add unit test for merge_sents * rename from_orig to from_raw * fixing unit tests * fix nn parser * read_annots to produce text, doc_annot pairs * _make_golds fix * rename golds_to_gold_annots * small fixes * fix encoding * have golds_to_gold_annots use DocAnnot * missed a spot * merge_sents as function in DocAnnot * allow specifying only part of the token-level annotations * refactor with Example class + underlying dicts * pipeline components to work with Example objects (wip) * input checking * fix yielding * fix calls to update * small fixes * fix scorer unit test with new format * fix kwargs order * fixes for ud and conllu scripts * fix reading data for conllu script * add in proper errors (not fixed numbering yet to avoid merge conflicts) * fixing few more small bugs * fix EL script --- bin/ud/ud_run_test.py | 13 +- bin/ud/ud_train.py | 68 +-- bin/wiki_entity_linking/kb_creator.py | 2 +- .../wikidata_train_entity_linker.py | 4 +- examples/training/conllu.py | 78 +-- examples/training/ner_multitask_objective.py | 29 +- examples/training/pretrain_textcat.py | 5 +- examples/training/rehearsal.py | 3 +- examples/training/train_entity_linker.py | 4 +- examples/training/train_intent_parser.py | 3 +- 
examples/training/train_ner.py | 4 +- examples/training/train_new_entity_type.py | 3 +- examples/training/train_parser.py | 3 +- examples/training/train_tagger.py | 3 +- examples/training/train_textcat.py | 3 +- spacy/cli/converters/conllu2json.py | 62 +- spacy/cli/debug_data.py | 40 +- spacy/cli/evaluate.py | 8 +- spacy/cli/pretrain.py | 3 +- spacy/cli/train.py | 39 +- spacy/errors.py | 6 + spacy/gold.pxd | 32 +- spacy/gold.pyx | 575 +++++++++++++----- spacy/language.py | 119 ++-- spacy/pipeline/morphologizer.pyx | 12 +- spacy/pipeline/pipes.pyx | 337 ++++++---- spacy/scorer.py | 28 +- spacy/syntax/arc_eager.pyx | 12 +- spacy/syntax/ner.pyx | 6 +- spacy/syntax/nn_parser.pyx | 71 +-- spacy/syntax/nonproj.pyx | 53 +- spacy/tests/parser/test_add_label.py | 4 +- spacy/tests/parser/test_arc_eager_oracle.py | 11 +- spacy/tests/parser/test_neural_parser.py | 4 +- spacy/tests/parser/test_preset_sbd.py | 2 +- spacy/tests/pipeline/test_textcat.py | 2 +- spacy/tests/regression/test_issue1-1000.py | 2 +- spacy/tests/regression/test_issue1501-2000.py | 8 +- spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tests/regression/test_issue3611.py | 4 +- spacy/tests/regression/test_issue4030.py | 4 +- spacy/tests/regression/test_issue4348.py | 3 +- spacy/tests/regression/test_issue4402.py | 5 +- spacy/tests/test_gold.py | 163 ++++- spacy/tests/test_language.py | 18 +- spacy/tests/test_scorer.py | 16 +- spacy/tokenizer.pyx | 2 +- spacy/util.py | 16 +- 48 files changed, 1178 insertions(+), 716 deletions(-) diff --git a/bin/ud/ud_run_test.py b/bin/ud/ud_run_test.py index 7cb270d84..70c6be0d0 100644 --- a/bin/ud/ud_run_test.py +++ b/bin/ud/ud_run_test.py @@ -13,23 +13,12 @@ import srsly import spacy import spacy.util from spacy.tokens import Token, Doc -from spacy.gold import GoldParse -from spacy.util import compounding, minibatch_by_words -from spacy.syntax.nonproj import projectivize from spacy.matcher import Matcher -# from spacy.morphology import Fused_begin, Fused_inside 
-from spacy import displacy -from collections import defaultdict, Counter -from timeit import default_timer as timer Fused_begin = None Fused_inside = None -import itertools -import random -import numpy.random - from . import conll17_ud_eval from spacy import lang @@ -268,7 +257,7 @@ def load_nlp(experiments_dir, corpus): return nlp -def initialize_pipeline(nlp, docs, golds, config, device): +def initialize_pipeline(nlp, examples, config, device): nlp.add_pipe(nlp.create_pipe("parser")) return nlp diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index 945bf57eb..b6a44b861 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -7,24 +7,20 @@ from __future__ import unicode_literals import plac from pathlib import Path import re -import sys import json import spacy import spacy.util from bin.ud import conll17_ud_eval from spacy.tokens import Token, Doc -from spacy.gold import GoldParse +from spacy.gold import GoldParse, Example from spacy.util import compounding, minibatch, minibatch_by_words from spacy.syntax.nonproj import projectivize from spacy.matcher import Matcher from spacy import displacy -from collections import defaultdict, Counter -from timeit import default_timer as timer +from collections import defaultdict -import itertools import random -import numpy.random from spacy import lang from spacy.lang import zh @@ -56,7 +52,7 @@ def read_data( max_doc_length=None, limit=None, ): - """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, + """Read the CONLLU format into Example objects. If raw_text=True, include Doc objects created using nlp.make_doc and then aligned against the gold-standard sequences. If oracle_segments=True, include Doc objects created from the gold-standard segments. 
At least one must be True.""" @@ -101,15 +97,16 @@ def read_data( docs.append(doc) golds.append(gold) if limit and len(docs) >= limit: - return docs, golds + return golds_to_gold_data(docs, golds) if raw_text and sent_annots: doc, gold = _make_gold(nlp, None, sent_annots) docs.append(doc) golds.append(gold) if limit and len(docs) >= limit: - return docs, golds - return docs, golds + return golds_to_gold_data(docs, golds) + return golds_to_gold_data(docs, golds) + def _parse_morph_string(morph_string): if morph_string == '_': @@ -123,6 +120,7 @@ def _parse_morph_string(morph_string): output.append('%s_%s' % (key, value.lower())) return set(output) + def read_conllu(file_): docs = [] sent = [] @@ -183,16 +181,18 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0): ############################# -def golds_to_gold_tuples(docs, golds): - """Get out the annoying 'tuples' format used by begin_training, given the +def golds_to_gold_data(docs, golds): + """Get out the training data format used by begin_training, given the GoldParse objects.""" - tuples = [] + data = [] for doc, gold in zip(docs, golds): - text = doc.text - ids, words, tags, heads, labels, iob = zip(*gold.orig_annot) - sents = [((ids, words, tags, heads, labels, iob), [])] - tuples.append((text, sents)) - return tuples + example = Example(doc=doc) + example.add_doc_annotation(cats=gold.cats) + token_annotation_dict = gold.orig.to_dict() + example.add_token_annotation(**token_annotation_dict) + example.goldparse = gold + data.append(example) + return data ############## @@ -348,7 +348,7 @@ def load_nlp(corpus, config, vectors=None): return nlp -def initialize_pipeline(nlp, docs, golds, config, device): +def initialize_pipeline(nlp, examples, config, device): nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False})) nlp.add_pipe(nlp.create_pipe("morphologizer")) nlp.add_pipe(nlp.create_pipe("parser")) @@ -356,14 +356,15 @@ def initialize_pipeline(nlp, docs, golds, config, device): 
nlp.parser.add_multitask_objective("tag") if config.multitask_sent: nlp.parser.add_multitask_objective("sent_start") - for gold in golds: + for ex in examples: + gold = ex.gold for tag in gold.tags: if tag is not None: nlp.tagger.add_label(tag) if torch is not None and device != -1: torch.set_default_tensor_type("torch.cuda.FloatTensor") optimizer = nlp.begin_training( - lambda: golds_to_gold_tuples(docs, golds), + lambda: examples, device=device, subword_features=config.subword_features, conv_depth=config.conv_depth, @@ -504,20 +505,20 @@ def main( print("Train and evaluate", corpus, "using lang", paths.lang) nlp = load_nlp(paths.lang, config, vectors=vectors_dir) - docs, golds = read_data( + examples = read_data( nlp, - paths.train.conllu.open(), - paths.train.text.open(), + paths.train.conllu.open(encoding="utf8"), + paths.train.text.open(encoding="utf8"), max_doc_length=config.max_doc_length, limit=limit, ) - optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device) + optimizer = initialize_pipeline(nlp, examples, config, gpu_device) batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001) beam_prob = compounding(0.2, 0.8, 1.001) for i in range(config.nr_epoch): - docs, golds = read_data( + examples = read_data( nlp, paths.train.conllu.open(encoding="utf8"), paths.train.text.open(encoding="utf8"), @@ -526,22 +527,19 @@ def main( oracle_segments=use_oracle_segments, raw_text=not use_oracle_segments, ) - Xs = list(zip(docs, golds)) - random.shuffle(Xs) + random.shuffle(examples) if config.batch_by_words: - batches = minibatch_by_words(Xs, size=batch_sizes) + batches = minibatch_by_words(examples, size=batch_sizes) else: - batches = minibatch(Xs, size=batch_sizes) + batches = minibatch(examples, size=batch_sizes) losses = {} - n_train_words = sum(len(doc) for doc in docs) + n_train_words = sum(len(ex.doc) for ex in examples) with tqdm.tqdm(total=n_train_words, leave=False) as pbar: for batch in batches: - batch_docs, batch_gold = 
zip(*batch) - pbar.update(sum(len(doc) for doc in batch_docs)) + pbar.update(sum(len(ex.doc) for ex in batch)) nlp.parser.cfg["beam_update_prob"] = next(beam_prob) nlp.update( - batch_docs, - batch_gold, + batch, sgd=optimizer, drop=config.dropout, losses=losses, diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index 7778fc701..8691308e0 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -46,7 +46,7 @@ def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_fre " cf. https://spacy.io/usage/models#languages." ) - logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq)) + logger.info("Filtering entities with fewer than {} mentions or no description".format(min_entity_freq)) entity_frequencies = io.read_entity_to_count(entity_freq_path) # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities( diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py index 8635ae547..6b5f4c30d 100644 --- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py +++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py @@ -131,10 +131,8 @@ def main( with nlp.disable_pipes(*other_pipes): for batch in batches: try: - docs, golds = zip(*batch) nlp.update( - docs=docs, - golds=golds, + examples=batch, sgd=optimizer, drop=dropout, losses=losses, diff --git a/examples/training/conllu.py b/examples/training/conllu.py index dfc790456..ba3cf450c 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -11,10 +11,9 @@ import json import spacy import spacy.util from spacy.tokens import Token, Doc -from spacy.gold import GoldParse +from spacy.gold import GoldParse, Example from spacy.syntax.nonproj import projectivize -from 
collections import defaultdict, Counter -from timeit import default_timer as timer +from collections import defaultdict from spacy.matcher import Matcher import itertools @@ -33,25 +32,25 @@ random.seed(0) numpy.random.seed(0) -def minibatch_by_words(items, size=5000): - random.shuffle(items) +def minibatch_by_words(examples, size=5000): + random.shuffle(examples) if isinstance(size, int): size_ = itertools.repeat(size) else: size_ = size - items = iter(items) + examples = iter(examples) while True: batch_size = next(size_) batch = [] while batch_size >= 0: try: - doc, gold = next(items) + example = next(examples) except StopIteration: if batch: yield batch return - batch_size -= len(doc) - batch.append((doc, gold)) + batch_size -= len(example.doc) + batch.append(example) if batch: yield batch else: @@ -78,7 +77,7 @@ def read_data( max_doc_length=None, limit=None, ): - """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, + """Read the CONLLU format into Example objects. If raw_text=True, include Doc objects created using nlp.make_doc and then aligned against the gold-standard sequences. If oracle_segments=True, include Doc objects created from the gold-standard segments. 
At least one must be True.""" @@ -119,15 +118,15 @@ def read_data( docs.append(doc) golds.append(gold) if limit and len(docs) >= limit: - return docs, golds + return golds_to_gold_data(docs, golds) if raw_text and sent_annots: doc, gold = _make_gold(nlp, None, sent_annots) docs.append(doc) golds.append(gold) if limit and len(docs) >= limit: - return docs, golds - return docs, golds + return golds_to_gold_data(docs, golds) + return golds_to_gold_data(docs, golds) def read_conllu(file_): @@ -181,16 +180,18 @@ def _make_gold(nlp, text, sent_annots): ############################# -def golds_to_gold_tuples(docs, golds): - """Get out the annoying 'tuples' format used by begin_training, given the +def golds_to_gold_data(docs, golds): + """Get out the training data format used by begin_training, given the GoldParse objects.""" - tuples = [] + data = [] for doc, gold in zip(docs, golds): - text = doc.text - ids, words, tags, heads, labels, iob = zip(*gold.orig_annot) - sents = [((ids, words, tags, heads, labels, iob), [])] - tuples.append((text, sents)) - return tuples + example = Example(doc=doc) + example.add_doc_annotation(cats=gold.cats) + token_annotation_dict = gold.orig.to_dict() + example.add_token_annotation(**token_annotation_dict) + example.goldparse = gold + data.append(example) + return data ############## @@ -290,9 +291,9 @@ def get_token_conllu(token, i): return "\n".join(lines) -Token.set_extension("get_conllu_lines", method=get_token_conllu) -Token.set_extension("begins_fused", default=False) -Token.set_extension("inside_fused", default=False) +Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True) +Token.set_extension("begins_fused", default=False, force=True) +Token.set_extension("inside_fused", default=False, force=True) ################## @@ -308,7 +309,7 @@ def load_nlp(corpus, config): return nlp -def initialize_pipeline(nlp, docs, golds, config): +def initialize_pipeline(nlp, examples, config): 
nlp.add_pipe(nlp.create_pipe("parser")) if config.multitask_tag: nlp.parser.add_multitask_objective("tag") @@ -316,18 +317,19 @@ def initialize_pipeline(nlp, docs, golds, config): nlp.parser.add_multitask_objective("sent_start") nlp.parser.moves.add_action(2, "subtok") nlp.add_pipe(nlp.create_pipe("tagger")) - for gold in golds: - for tag in gold.tags: + for ex in examples: + for tag in ex.gold.tags: if tag is not None: nlp.tagger.add_label(tag) # Replace labels that didn't make the frequency cutoff actions = set(nlp.parser.labels) label_set = set([act.split("-")[1] for act in actions if "-" in act]) - for gold in golds: + for ex in examples: + gold = ex.gold for i, label in enumerate(gold.labels): if label is not None and label not in label_set: gold.labels[i] = label.split("||")[0] - return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds)) + return nlp.begin_training(lambda: examples) ######################## @@ -401,28 +403,26 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): print("Train and evaluate", corpus, "using lang", paths.lang) nlp = load_nlp(paths.lang, config) - docs, golds = read_data( + examples = read_data( nlp, - paths.train.conllu.open(), - paths.train.text.open(), + paths.train.conllu.open(encoding="utf8"), + paths.train.text.open(encoding="utf8"), max_doc_length=config.max_doc_length, limit=limit, ) - optimizer = initialize_pipeline(nlp, docs, golds, config) + optimizer = initialize_pipeline(nlp, examples, config) for i in range(config.nr_epoch): - docs = [nlp.make_doc(doc.text) for doc in docs] - batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size) + docs = [nlp.make_doc(example.doc.text) for example in examples] + batches = minibatch_by_words(examples, size=config.batch_size) losses = {} n_train_words = sum(len(doc) for doc in docs) with tqdm.tqdm(total=n_train_words, leave=False) as pbar: for batch in batches: - batch_docs, batch_gold = zip(*batch) - pbar.update(sum(len(doc) for doc in batch_docs)) 
+ pbar.update(sum(len(ex.doc) for ex in batch)) nlp.update( - batch_docs, - batch_gold, + examples=batch, sgd=optimizer, drop=config.dropout, losses=losses, diff --git a/examples/training/ner_multitask_objective.py b/examples/training/ner_multitask_objective.py index 4bf7a008f..7561d4877 100644 --- a/examples/training/ner_multitask_objective.py +++ b/examples/training/ner_multitask_objective.py @@ -31,14 +31,13 @@ random.seed(0) PWD = os.path.dirname(__file__) -TRAIN_DATA = list(read_json_file( - os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json"))) +TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json"))) -def get_position_label(i, words, tags, heads, labels, ents): +def get_position_label(i, token_annotation): """Return labels indicating the position of the word in the document. """ - if len(words) < 20: + if len(token_annotation.words) < 20: return "short-doc" elif i == 0: return "first-word" @@ -46,7 +45,7 @@ def get_position_label(i, words, tags, heads, labels, ents): return "early-word" elif i < 20: return "mid-word" - elif i == len(words) - 1: + elif i == len(token_annotation.words) - 1: return "last-word" else: return "late-word" @@ -60,17 +59,17 @@ def main(n_iter=10): print(nlp.pipeline) print("Create data", len(TRAIN_DATA)) - optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA) + optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA) for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} - for text, annot_brackets in TRAIN_DATA: - for annotations, _ in annot_brackets: - doc = Doc(nlp.vocab, words=annotations[1]) - gold = GoldParse.from_annot_tuples(doc, annotations) + for example in TRAIN_DATA: + for token_annotation in example.token_annotations: + doc = Doc(nlp.vocab, words=token_annotation.words) + gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation) + nlp.update( - [doc], # batch of texts - [gold], # batch of annotations + examples=[(doc, gold)], # 1 example 
drop=0.2, # dropout - make it harder to memorise data sgd=optimizer, # callable to update weights losses=losses, @@ -78,9 +77,9 @@ def main(n_iter=10): print(losses.get("nn_labeller", 0.0), losses["ner"]) # test the trained model - for text, _ in TRAIN_DATA: - if text is not None: - doc = nlp(text) + for example in TRAIN_DATA: + if example.text is not None: + doc = nlp(example.text) print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index e45f3345e..828479881 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -116,7 +116,7 @@ def train_tensorizer(nlp, texts, dropout, n_iter): losses = {} for i, batch in enumerate(minibatch(tqdm.tqdm(texts))): docs = [nlp.make_doc(text) for text in batch] - tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout) + tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout) print(losses) return optimizer @@ -147,8 +147,7 @@ def train_textcat(nlp, n_texts, n_iter=10): # batch up the examples using spaCy's minibatch batches = minibatch(tqdm.tqdm(train_data), size=2) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses) + nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index 123f5049d..b08ba9f9a 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -74,8 +74,7 @@ def main(model_name, unlabelled_loc): # batch up the examples using spaCy's minibatch raw_batches = minibatch(raw_docs, size=4) for batch in minibatch(TRAIN_DATA, 
size=sizes): - docs, golds = zip(*batch) - nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses) + nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses) raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses) print("Losses", losses) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index d2b2c2417..9d7357b8c 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -108,10 +108,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) nlp.update( - texts, # batch of texts - annotations, # batch of annotations + batch, drop=0.2, # dropout - make it harder to memorise data losses=losses, sgd=optimizer, diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index 08d06bd4c..beb39fa1d 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -133,8 +133,7 @@ def main(model=None, output_dir=None, n_iter=15): # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, losses=losses) + nlp.update(batch, sgd=optimizer, losses=losses) print("Losses", losses) # test the trained model diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 49c25654c..e83d5cd0d 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -67,10 +67,8 @@ def main(model=None, output_dir=None, n_iter=100): # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = 
zip(*batch) nlp.update( - texts, # batch of texts - annotations, # batch of annotations + batch, drop=0.5, # dropout - make it harder to memorise data losses=losses, ) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index e3a76f0c0..7fe443fc2 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -104,8 +104,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): batches = minibatch(TRAIN_DATA, size=sizes) losses = {} for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses) + nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses) print("Losses", losses) # test the trained model diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index aa60af00b..6db8af854 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -74,8 +74,7 @@ def main(model=None, output_dir=None, n_iter=15): # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, losses=losses) + nlp.update(batch, sgd=optimizer, losses=losses) print("Losses", losses) # test the trained model diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 7136273b3..06e05f6cd 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -65,8 +65,7 @@ def main(lang="en", output_dir=None, n_iter=25): # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, losses=losses) + nlp.update(batch, sgd=optimizer, losses=losses) print("Losses", losses) # test the trained model diff 
--git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 4d4ebf396..128773c0a 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -82,8 +82,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None random.shuffle(train_data) batches = minibatch(train_data, size=batch_sizes) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses) + nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 8f2900a9b..43216c943 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re +from spacy.gold import Example from ...gold import iob_to_biluo @@ -19,21 +20,21 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): # by @katarkor docs = [] sentences = [] - conll_tuples = read_conllx(input_data, use_morphology=use_morphology) + conll_data = read_conllx(input_data, use_morphology=use_morphology) checked_for_ner = False has_ner_tags = False - for i, (raw_text, tokens) in enumerate(conll_tuples): - sentence, brackets = tokens[0] - if not checked_for_ner: - has_ner_tags = is_ner(sentence[5][0]) - checked_for_ner = True - sentences.append(generate_sentence(sentence, has_ner_tags)) - # Real-sized documents could be extracted using the comments on the - # conluu document - if len(sentences) % n_sents == 0: - doc = create_doc(sentences, i) - docs.append(doc) - sentences = [] + for i, example in enumerate(conll_data): + for token_annotation in example.token_annotations: + if not checked_for_ner: + has_ner_tags = 
is_ner(token_annotation.entities[0]) + checked_for_ner = True + sentences.append(generate_sentence(token_annotation, has_ner_tags)) + # Real-sized documents could be extracted using the comments on the + # conluu document + if len(sentences) % n_sents == 0: + doc = create_doc(sentences, i) + docs.append(doc) + sentences = [] return docs @@ -52,15 +53,15 @@ def is_ner(tag): def read_conllx(input_data, use_morphology=False, n=0): + """ Yield example data points, one for each sentence """ i = 0 for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: while lines[0].startswith("#"): lines.pop(0) - tokens = [] + ids, words, tags, heads, deps, ents = [], [], [], [], [], [] for line in lines: - parts = line.split("\t") id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts if "-" in id_ or "." in id_: @@ -72,14 +73,22 @@ def read_conllx(input_data, use_morphology=False, n=0): tag = pos if tag == "_" else tag tag = tag + "__" + morph if use_morphology else tag iob = iob if iob else "O" - tokens.append((id_, word, tag, head, dep, iob)) + + ids.append(id_) + words.append(word) + tags.append(tag) + heads.append(head) + deps.append(dep) + ents.append(iob) except: # noqa: E722 print(line) raise - tuples = [list(t) for t in zip(*tokens)] - yield (None, [[tuples, []]]) + example = Example(doc=None) + example.add_token_annotation(ids=ids, words=words, tags=tags, + heads=heads, deps=deps, entities=ents) + yield example i += 1 - if n >= 1 and i >= n: + if 1 <= n <= i: break @@ -107,20 +116,19 @@ def simplify_tags(iob): return new_iob -def generate_sentence(sent, has_ner_tags): - (id_, word, tag, head, dep, iob) = sent +def generate_sentence(token_annotation, has_ner_tags): sentence = {} tokens = [] if has_ner_tags: - iob = simplify_tags(iob) + iob = simplify_tags(token_annotation.entities) biluo = iob_to_biluo(iob) - for i, id in enumerate(id_): + for i, id in enumerate(token_annotation.ids): token = {} token["id"] = id - token["orth"] = 
word[i] - token["tag"] = tag[i] - token["head"] = head[i] - id - token["dep"] = dep[i] + token["orth"] = token_annotation.words[i] + token["tag"] = token_annotation.tags[i] + token["head"] = token_annotation.heads[i] - id + token["dep"] = token_annotation.deps[i] if has_ner_tags: token["ner"] = biluo[i] tokens.append(token) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 8161ddf45..76276ee56 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -80,16 +80,16 @@ def debug_data( with msg.loading("Loading corpus..."): corpus = GoldCorpus(train_path, dev_path) try: - train_docs = list(corpus.train_docs(nlp)) - train_docs_unpreprocessed = list( - corpus.train_docs_without_preprocessing(nlp) + train_dataset = list(corpus.train_dataset(nlp)) + train_dataset_unpreprocessed = list( + corpus.train_dataset_without_preprocessing(nlp) ) except ValueError as e: loading_train_error_message = "Training data cannot be loaded: {}".format( str(e) ) try: - dev_docs = list(corpus.dev_docs(nlp)) + dev_dataset = list(corpus.dev_dataset(nlp)) except ValueError as e: loading_dev_error_message = "Development data cannot be loaded: {}".format( str(e) @@ -102,10 +102,10 @@ def debug_data( sys.exit(1) msg.good("Corpus is loadable") - # Create all gold data here to avoid iterating over the train_docs constantly - gold_train_data = _compile_gold(train_docs, pipeline) - gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline) - gold_dev_data = _compile_gold(dev_docs, pipeline) + # Create all gold data here to avoid iterating over the train_dataset constantly + gold_train_data = _compile_gold(train_dataset, pipeline) + gold_train_unpreprocessed_data = _compile_gold(train_dataset_unpreprocessed, pipeline) + gold_dev_data = _compile_gold(dev_dataset, pipeline) train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] @@ -118,19 +118,19 @@ def debug_data( msg.text("Starting with base model '{}'".format(base_model)) else: 
msg.text("Starting with blank model '{}'".format(lang)) - msg.text("{} training docs".format(len(train_docs))) - msg.text("{} evaluation docs".format(len(dev_docs))) + msg.text("{} training docs".format(len(train_dataset))) + msg.text("{} evaluation docs".format(len(gold_dev_data))) overlap = len(train_texts.intersection(dev_texts)) if overlap: msg.warn("{} training examples also in evaluation data".format(overlap)) else: msg.good("No overlap between training and evaluation data") - if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD: + if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD: text = "Low number of examples to train from a blank model ({})".format( - len(train_docs) + len(train_dataset) ) - if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD: + if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD: msg.fail(text) else: msg.warn(text) @@ -238,7 +238,7 @@ def debug_data( has_low_data_warning = True with msg.loading("Analyzing label distribution..."): - neg_docs = _get_examples_without_label(train_docs, label) + neg_docs = _get_examples_without_label(train_dataset, label) if neg_docs == 0: msg.warn( "No examples for texts WITHOUT new label '{}'".format(label) @@ -358,7 +358,7 @@ def debug_data( msg.info( "Found {} sentence{} with an average length of {:.1f} words.".format( gold_train_data["n_sents"], - "s" if len(train_docs) > 1 else "", + "s" if len(train_dataset) > 1 else "", gold_train_data["n_words"] / gold_train_data["n_sents"], ) ) @@ -536,7 +536,7 @@ def _load_file(file_path, msg): ) -def _compile_gold(train_docs, pipeline): +def _compile_gold(examples, pipeline): data = { "ner": Counter(), "cats": Counter(), @@ -553,7 +553,9 @@ def _compile_gold(train_docs, pipeline): "n_cats_multilabel": 0, "texts": set(), } - for doc, gold in train_docs: + for example in examples: + gold = example.gold + doc = example.doc valid_words = [x for x in gold.words if x is not None] data["words"].update(valid_words) data["n_words"] += len(valid_words) @@ 
-598,8 +600,8 @@ def _format_labels(labels, counts=False): def _get_examples_without_label(data, label): count = 0 - for doc, gold in data: - labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")] + for ex in data: + labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")] if label not in labels: count += 1 return count diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 1114ada08..e5b2d0f02 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -45,11 +45,11 @@ def evaluate( msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = GoldCorpus(data_path, data_path) nlp = util.load_model(model) - dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) + dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) begin = timer() - scorer = nlp.evaluate(dev_docs, verbose=False) + scorer = nlp.evaluate(dev_dataset, verbose=False) end = timer() - nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) + nwords = sum(len(ex.doc) for ex in dev_dataset) results = { "Time": "%.2f s" % (end - begin), "Words": nwords, @@ -66,7 +66,7 @@ def evaluate( msg.table(results, title="Results") if displacy_path: - docs, golds = zip(*dev_docs) + docs = [ex.doc for ex in dev_dataset] render_deps = "parser" in nlp.meta.get("pipeline", []) render_ents = "ner" in nlp.meta.get("pipeline", []) render_parses( diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index f7236f7de..59269cb85 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -14,6 +14,7 @@ from thinc.neural.util import prefer_gpu from wasabi import Printer import srsly +from spacy.gold import Example from ..errors import Errors from ..tokens import Doc from ..attrs import ID, HEAD @@ -221,7 +222,7 @@ def pretrain( skip_counter = 0 for epoch in range(epoch_start, n_iter + epoch_start): for batch_id, batch in enumerate( - util.minibatch_by_words(((text, None) for text in texts), 
size=batch_size) + util.minibatch_by_words((Example(doc=text) for text in texts), size=batch_size) ): docs, count = make_docs( nlp, diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 13fcae37f..24255437c 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -236,7 +236,7 @@ def train( optimizer = create_default_optimizer(Model.ops) else: # Start with a blank model, call begin_training - optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) + optimizer = nlp.begin_training(lambda: corpus.train_examples, device=use_gpu) nlp._optimizer = None @@ -261,7 +261,7 @@ def train( "problem with two labels.".format(textcat_positive_label), exits=1, ) - train_docs = corpus.train_docs( + train_data = corpus.train_data( nlp, noise_level=noise_level, gold_preproc=gold_preproc, @@ -271,9 +271,9 @@ def train( train_labels = set() if textcat_multilabel: multilabel_found = False - for text, gold in train_docs: - train_labels.update(gold.cats.keys()) - if list(gold.cats.values()).count(1.0) != 1: + for ex in train_data: + train_labels.update(ex.gold.cats.keys()) + if list(ex.gold.cats.values()).count(1.0) != 1: multilabel_found = True if not multilabel_found and not base_model: msg.warn( @@ -283,9 +283,9 @@ def train( "mutually-exclusive classes." ) if not textcat_multilabel: - for text, gold in train_docs: - train_labels.update(gold.cats.keys()) - if list(gold.cats.values()).count(1.0) != 1 and not base_model: + for ex in train_data: + train_labels.update(ex.gold.cats.keys()) + if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model: msg.warn( "Some textcat training instances do not have exactly " "one positive label. 
Modifying training options to " @@ -341,7 +341,7 @@ def train( iter_since_best = 0 best_score = 0.0 for i in range(n_iter): - train_docs = corpus.train_docs( + train_data = corpus.train_data( nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, @@ -357,13 +357,11 @@ def train( words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} - for batch in util.minibatch_by_words(train_docs, size=batch_sizes): + for batch in util.minibatch_by_words(train_data, size=batch_sizes): if not batch: continue - docs, golds = zip(*batch) nlp.update( - docs, - golds, + batch, sgd=optimizer, drop=next(dropout_rates), losses=losses, @@ -373,6 +371,7 @@ def train( # which use unlabelled data to reduce overfitting. raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) + docs = [ex.doc for ex in batch] if not int(os.environ.get("LOG_FRIENDLY", 0)): pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) @@ -385,16 +384,16 @@ def train( for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width - dev_docs = list( - corpus.dev_docs( + dev_dataset = list( + corpus.dev_dataset( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, ) ) - nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) + nwords = sum(len(ex.doc) for ex in dev_dataset) start_time = timer() - scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) + scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) end_time = timer() if use_gpu < 0: gpu_wps = None @@ -406,15 +405,15 @@ def train( for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width - dev_docs = list( - corpus.dev_docs( + dev_dataset = list( + corpus.dev_dataset( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, ) ) start_time = timer() - scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) + scorer = 
nlp_loaded.evaluate(dev_dataset, verbose=verbose) end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" diff --git a/spacy/errors.py b/spacy/errors.py index c708f0a5b..d2898cf53 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -530,6 +530,12 @@ class Errors(object): "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.") E186 = ("'{tok_a}' and '{tok_b}' are different texts.") + # TODO: fix numbering after merging develop into master + E998 = ("Can only create GoldParse's from Example's without a Doc, " + "if get_gold_parses() is called with a Vocab object.") + E999 = ("Encountered an unexpected format for the dictionary holding " + "gold annotations: {gold_dict}") + @add_codes class TempErrors(object): diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 20a25a939..6027d85b6 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -1,6 +1,6 @@ from cymem.cymem cimport Pool -from .structs cimport TokenC +from spacy.tokens import Doc from .typedefs cimport attr_t from .syntax.transition_system cimport Transition @@ -19,6 +19,7 @@ cdef class GoldParse: cdef Pool mem cdef GoldParseC c + cdef readonly TokenAnnotation orig cdef int length cdef public int loss @@ -29,13 +30,36 @@ cdef class GoldParse: cdef public list labels cdef public dict orths cdef public list ner - cdef public list ents cdef public dict brackets - cdef public object cats + cdef public dict cats cdef public dict links cdef readonly list cand_to_gold cdef readonly list gold_to_cand - cdef readonly list orig_annot + + +cdef class TokenAnnotation: + cdef public list ids + cdef public list words + cdef public list tags + cdef public list heads + cdef public list deps + cdef public list entities + cdef public list morphology + cdef public list brackets + + +cdef class DocAnnotation: + cdef public object cats + cdef public object links + + +cdef class Example: + cdef public object doc + cdef public list token_annotations + cdef public 
DocAnnotation doc_annotation + cdef public object make_projective + cdef public object ignore_misaligned + cdef public object goldparse diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 5aecc2584..ea3589ea5 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -14,11 +14,8 @@ import srsly from .syntax import nonproj from .tokens import Doc, Span from .errors import Errors, AlignmentError -from .compat import path2str +from .compat import path2str, basestring_ from . import util -from .util import minibatch, itershuffle - -from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek USE_NEW_ALIGN = False @@ -54,25 +51,6 @@ def tags_to_entities(tags): return entities -def merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_cats = {} - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) - for b in brackets) - m_cats.update(cats) - i += len(ids) - return [(m_deps, (m_cats, m_brackets))] - - _ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")] @@ -211,14 +189,14 @@ class GoldCorpus(object): def __init__(self, train, dev, gold_preproc=False, limit=None): """Create a GoldCorpus. - train_path (unicode or Path): File or directory of training data. - dev_path (unicode or Path): File or directory of development data. + train (unicode or Path): File or directory of training data. + dev (unicode or Path): File or directory of development data. RETURNS (GoldCorpus): The newly created object. 
""" self.limit = limit if isinstance(train, str) or isinstance(train, Path): - train = self.read_tuples(self.walk_corpus(train)) - dev = self.read_tuples(self.walk_corpus(dev)) + train = self.read_examples(self.walk_corpus(train)) + dev = self.read_examples(self.walk_corpus(dev)) # Write temp directory with one doc per file, so we can shuffle and stream self.tmp_dir = Path(tempfile.mkdtemp()) self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) @@ -228,13 +206,15 @@ class GoldCorpus(object): shutil.rmtree(path2str(self.tmp_dir)) @staticmethod - def write_msgpack(directory, doc_tuples, limit=0): + def write_msgpack(directory, examples, limit=0): if not directory.exists(): directory.mkdir() n = 0 - for i, doc_tuple in enumerate(doc_tuples): - srsly.write_msgpack(directory / "{}.msg".format(i), [doc_tuple]) - n += len(doc_tuple[1]) + for i, example in enumerate(examples): + ex_dict = example.to_dict() + text = example.text + srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict)) + n += len(example.token_annotations) if limit and n >= limit: break @@ -259,128 +239,144 @@ class GoldCorpus(object): return locs @staticmethod - def read_tuples(locs, limit=0): + def read_examples(locs, limit=0): + """ Yield training examples """ i = 0 for loc in locs: loc = util.ensure_path(loc) if loc.parts[-1].endswith("json"): - gold_tuples = read_json_file(loc) + examples = read_json_file(loc) elif loc.parts[-1].endswith("jsonl"): gold_tuples = srsly.read_jsonl(loc) first_gold_tuple = next(gold_tuples) gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) # TODO: proper format checks with schemas if isinstance(first_gold_tuple, dict): - gold_tuples = read_json_object(gold_tuples) + if first_gold_tuple.get("paragraphs", None): + examples = read_json_object(gold_tuples) + elif first_gold_tuple.get("doc_annotation", None): + examples = [] + for ex_dict in gold_tuples: + doc = ex_dict.get("doc", None) + if doc is None: + doc = ex_dict.get("text", None) 
+ examples.append(Example.from_dict(ex_dict, doc=doc)) + elif loc.parts[-1].endswith("msg"): - gold_tuples = srsly.read_msgpack(loc) + text, ex_dict = srsly.read_msgpack(loc) + examples = [Example.from_dict(ex_dict, doc=text)] else: supported = ("json", "jsonl", "msg") raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported)) - for item in gold_tuples: - yield item - i += len(item[1]) + for example in examples: + yield example + i += len(example.token_annotations) if limit and i >= limit: return @property - def dev_tuples(self): + def dev_examples(self): locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_tuples(locs, limit=self.limit) + yield from self.read_examples(locs, limit=self.limit) @property - def train_tuples(self): + def train_examples(self): locs = (self.tmp_dir / "train").iterdir() - yield from self.read_tuples(locs, limit=self.limit) + yield from self.read_examples(locs, limit=self.limit) def count_train(self): + # TODO: should this count words or sentences ? 
n = 0 i = 0 - for raw_text, paragraph_tuples in self.train_tuples: - for sent_tuples, brackets in paragraph_tuples: - n += len(sent_tuples[1]) + for example in self.train_examples: + for token_annotation in example.token_annotations: + n += len(token_annotation.words) if self.limit and i >= self.limit: break i += 1 return n - def train_docs(self, nlp, gold_preproc=False, max_length=None, + def train_dataset(self, nlp, gold_preproc=False, max_length=None, noise_level=0.0, orth_variant_level=0.0, ignore_misaligned=False): locs = list((self.tmp_dir / 'train').iterdir()) random.shuffle(locs) - train_tuples = self.read_tuples(locs, limit=self.limit) - gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, + train_examples = self.read_examples(locs, limit=self.limit) + gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc, max_length=max_length, noise_level=noise_level, orth_variant_level=orth_variant_level, make_projective=True, ignore_misaligned=ignore_misaligned) - yield from gold_docs + yield from gold_examples - def train_docs_without_preprocessing(self, nlp, gold_preproc=False): - gold_docs = self.iter_gold_docs(nlp, self.train_tuples, gold_preproc=gold_preproc) - yield from gold_docs + def train_dataset_without_preprocessing(self, nlp, gold_preproc=False): + examples = self.iter_gold_docs(nlp, self.train_examples, gold_preproc=gold_preproc) + yield from examples - def dev_docs(self, nlp, gold_preproc=False, ignore_misaligned=False): - gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc, + def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): + examples = self.iter_gold_docs(nlp, self.dev_examples, gold_preproc=gold_preproc, ignore_misaligned=ignore_misaligned) - yield from gold_docs + yield from examples @classmethod - def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None, + def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None, noise_level=0.0, 
orth_variant_level=0.0, make_projective=False, ignore_misaligned=False): - for raw_text, paragraph_tuples in tuples: + """ Setting gold_preproc will result in creating a doc per 'sentence' """ + for example in examples: if gold_preproc: - raw_text = None + example.doc = None else: - paragraph_tuples = merge_sents(paragraph_tuples) - docs, paragraph_tuples = cls._make_docs(nlp, raw_text, - paragraph_tuples, gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - golds = cls._make_golds(docs, paragraph_tuples, make_projective, - ignore_misaligned=ignore_misaligned) - for doc, gold in zip(docs, golds): - if gold is not None: - if (not max_length) or len(doc) < max_length: - yield doc, gold + example = example.merge_sents() + example.make_projective = make_projective + example.ignore_misaligned = ignore_misaligned + examples = cls._make_docs(nlp, example, + gold_preproc, noise_level=noise_level, + orth_variant_level=orth_variant_level) + examples = cls._make_golds(examples, vocab=nlp.vocab) + for ex in examples: + if ex.gold is not None: + if (not max_length) or len(ex.doc) < max_length: + yield ex @classmethod - def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0): - if raw_text is not None: - raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level) - raw_text = add_noise(raw_text, noise_level) - return [nlp.make_doc(raw_text)], paragraph_tuples + def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0): + # gold_preproc is not used ?! 
+ if example.text is not None: + var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level) + var_text = add_noise(var_example.text, noise_level) + var_doc = nlp.make_doc(var_text) + var_example.doc = var_doc + return [var_example] else: - docs = [] - raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level) - return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) - for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples - + var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level) + doc_examples = [] + for token_annotation in var_example.token_annotations: + t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level)) + doc_example = Example(doc_annotation=example.doc_annotation, + token_annotations=[token_annotation], + doc=t_doc) + doc_examples.append(doc_example) + return doc_examples @classmethod - def _make_golds(cls, docs, paragraph_tuples, make_projective, ignore_misaligned=False): - if len(docs) != len(paragraph_tuples): - n_annots = len(paragraph_tuples) - raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots)) - golds = [] - for doc, (sent_tuples, (cats, brackets)) in zip(docs, paragraph_tuples): - try: - gold = GoldParse.from_annot_tuples(doc, sent_tuples, cats=cats, - make_projective=make_projective) - except AlignmentError: - if ignore_misaligned: - gold = None - else: - raise - golds.append(gold) - return golds + def _make_golds(cls, examples, vocab=None): + gold_examples = [] + for example in examples: + gold_parses = example.get_gold_parses(vocab=vocab) + for (doc, gold) in gold_parses: + ex = Example(doc=doc) + ex.goldparse = gold + gold_examples.append(ex) + return gold_examples - -def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): +def make_orth_variants(nlp, example, orth_variant_level=0.0): if random.random() >= orth_variant_level: - return raw, 
paragraph_tuples + return example + if not example.token_annotations: + return example + raw = example.text if random.random() >= 0.5: lower = True if raw is not None: @@ -388,38 +384,47 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): ndsv = nlp.Defaults.single_orth_variants ndpv = nlp.Defaults.paired_orth_variants # modify words in paragraph_tuples - variant_paragraph_tuples = [] - for sent_tuples, brackets in paragraph_tuples: - ids, words, tags, heads, labels, ner = sent_tuples - if lower: - words = [w.lower() for w in words] - # single variants - punct_choices = [random.choice(x["variants"]) for x in ndsv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndsv)): - if tags[word_idx] in ndsv[punct_idx]["tags"] \ - and words[word_idx] in ndsv[punct_idx]["variants"]: - words[word_idx] = punct_choices[punct_idx] - # paired variants - punct_choices = [random.choice(x["variants"]) for x in ndpv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] \ - and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): - # backup option: random left vs. 
right from pair - pair_idx = random.choice([0, 1]) - # best option: rely on paired POS tags like `` / '' - if len(ndpv[punct_idx]["tags"]) == 2: - pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) - # next best option: rely on position in variants - # (may not be unambiguous, so order of variants matters) - else: - for pair in ndpv[punct_idx]["variants"]: - if words[word_idx] in pair: - pair_idx = pair.index(words[word_idx]) - words[word_idx] = punct_choices[punct_idx][pair_idx] + variant_example = Example(doc=raw) + for token_annotation in example.token_annotations: + words = token_annotation.words + tags = token_annotation.tags + if not words or not tags: + # add the unmodified annotation + token_dict = token_annotation.to_dict() + variant_example.add_token_annotation(**token_dict) + else: + if lower: + words = [w.lower() for w in words] + # single variants + punct_choices = [random.choice(x["variants"]) for x in ndsv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndsv)): + if tags[word_idx] in ndsv[punct_idx]["tags"] \ + and words[word_idx] in ndsv[punct_idx]["variants"]: + words[word_idx] = punct_choices[punct_idx] + # paired variants + punct_choices = [random.choice(x["variants"]) for x in ndpv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndpv)): + if tags[word_idx] in ndpv[punct_idx]["tags"] \ + and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + # backup option: random left vs. 
right from pair + pair_idx = random.choice([0, 1]) + # best option: rely on paired POS tags like `` / '' + if len(ndpv[punct_idx]["tags"]) == 2: + pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) + # next best option: rely on position in variants + # (may not be unambiguous, so order of variants matters) + else: + for pair in ndpv[punct_idx]["variants"]: + if words[word_idx] in pair: + pair_idx = pair.index(words[word_idx]) + words[word_idx] = punct_choices[punct_idx][pair_idx] - variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets)) + token_dict = token_annotation.to_dict() + token_dict["words"] = words + token_dict["tags"] = tags + variant_example.add_token_annotation(**token_dict) # modify raw to match variant_paragraph_tuples if raw is not None: variants = [] @@ -437,9 +442,8 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): variant_raw += raw[raw_idx] raw_idx += 1 - for sent_tuples, brackets in variant_paragraph_tuples: - ids, words, tags, heads, labels, ner = sent_tuples - for word in words: + for token_annotation in variant_example.token_annotations: + for word in token_annotation.words: match_found = False # add identical word if word not in variants and raw[raw_idx:].startswith(word): @@ -457,13 +461,14 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): # something went wrong, abort # (add a warning message?) 
if not match_found: - return raw, paragraph_tuples + return example # add following whitespace while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): variant_raw += raw[raw_idx] raw_idx += 1 - return variant_raw, variant_paragraph_tuples - return raw, variant_paragraph_tuples + variant_example.doc = variant_raw + return variant_example + return variant_example def add_noise(orig, noise_level): @@ -488,30 +493,27 @@ def _corrupt(c, noise_level): def read_json_object(json_corpus_section): """Take a list of JSON-formatted documents (e.g. from an already loaded - training data file) and yield tuples in the GoldParse format. + training data file) and yield annotations in the GoldParse format. json_corpus_section (list): The data. - YIELDS (tuple): The reformatted data. + YIELDS (Example): The reformatted data - one training example per paragraph """ for json_doc in json_corpus_section: - tuple_doc = json_to_tuple(json_doc) - for tuple_paragraph in tuple_doc: - yield tuple_paragraph + examples = json_to_examples(json_doc) + for ex in examples: + yield ex -def json_to_tuple(doc): - """Convert an item in the JSON-formatted training data to the tuple format +def json_to_examples(doc): + """Convert an item in the JSON-formatted training data to the format used by GoldParse. doc (dict): One entry in the training data. - YIELDS (tuple): The reformatted data. 
+ YIELDS (Example): The reformatted data - one training example per paragraph """ paragraphs = [] for paragraph in doc["paragraphs"]: - sents = [] - cats = {} - for cat in paragraph.get("cats", {}): - cats[cat["label"]] = cat["value"] + example = Example(doc=paragraph.get("raw", None)) for sent in paragraph["sentences"]: words = [] ids = [] @@ -529,11 +531,14 @@ def json_to_tuple(doc): if labels[-1].lower() == "root": labels[-1] = "ROOT" ner.append(token.get("ner", "-")) - sents.append([ - [ids, words, tags, heads, labels, ner], - [cats, sent.get("brackets", [])]]) - if sents: - yield [paragraph.get("raw", None), sents] + example.add_token_annotation(ids=ids, words=words, tags=tags, + heads=heads, deps=labels, entities=ner, + brackets=sent.get("brackets", [])) + cats = {} + for cat in paragraph.get("cats", {}): + cats[cat["label"]] = cat["value"] + example.add_doc_annotation(cats=cats) + yield example def read_json_file(loc, docs_filter=None, limit=None): @@ -545,8 +550,8 @@ def read_json_file(loc, docs_filter=None, limit=None): for doc in _json_iterate(loc): if docs_filter is not None and not docs_filter(doc): continue - for json_tuple in json_to_tuple(doc): - yield json_tuple + for json_data in json_to_examples(doc): + yield json_data def _json_iterate(loc): @@ -639,21 +644,254 @@ def _consume_ent(tags): return [start] + middle + [end] +cdef class TokenAnnotation: + def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None): + self.ids = ids if ids else [] + self.words = words if words else [] + self.tags = tags if tags else [] + self.heads = heads if heads else [] + self.deps = deps if deps else [] + self.entities = entities if entities else [] + self.brackets = brackets if brackets else [] + self.morphology = morphology if morphology else [] + + @classmethod + def from_dict(cls, token_dict): + return cls(ids=token_dict.get("ids", None), + words=token_dict.get("words", None), + 
tags=token_dict.get("tags", None), + heads=token_dict.get("heads", None), + deps=token_dict.get("deps", None), + entities=token_dict.get("entities", None), + morphology=token_dict.get("morphology", None), + brackets=token_dict.get("brackets", None)) + + def to_dict(self): + return {"ids": self.ids, + "words": self.words, + "tags": self.tags, + "heads": self.heads, + "deps": self.deps, + "entities": self.entities, + "morphology": self.morphology, + "brackets": self.brackets} + + +cdef class DocAnnotation: + def __init__(self, cats=None, links=None): + self.cats = cats if cats else {} + self.links = links if links else {} + + @classmethod + def from_dict(cls, doc_dict): + return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None)) + + def to_dict(self): + return {"cats": self.cats, "links": self.links} + + +cdef class Example: + def __init__(self, doc_annotation=None, token_annotations=None, doc=None, + make_projective=False, ignore_misaligned=False, goldparse=None): + """ Doc can either be text, or an actual Doc """ + self.doc = doc + self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() + self.token_annotations = token_annotations if token_annotations else [] + self.make_projective = make_projective + self.ignore_misaligned = ignore_misaligned + self.goldparse = goldparse + + @classmethod + def from_gold(cls, goldparse, doc=None): + doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links) + token_annotation = goldparse.get_token_annotation() + return cls(doc_annotation, [token_annotation], doc) + + @classmethod + def from_dict(cls, example_dict, doc=None): + token_dicts = example_dict["token_annotations"] + token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts] + doc_dict = example_dict["doc_annotation"] + doc_annotation = DocAnnotation.from_dict(doc_dict) + return cls(doc_annotation, token_annotations, doc) + + def to_dict(self): + """ Note that this method does NOT export the doc, only 
the annotations ! """ + token_dicts = [t.to_dict() for t in self.token_annotations] + doc_dict = self.doc_annotation.to_dict() + return {"token_annotations": token_dicts, "doc_annotation": doc_dict} + + @property + def text(self): + if self.doc is None: + return None + if isinstance(self.doc, Doc): + return self.doc.text + return self.doc + + @property + def gold(self): + if self.goldparse is None: + doc, gold = self.get_gold_parses(merge=True)[0] + self.goldparse = gold + return self.goldparse + + def add_token_annotation(self, ids=None, words=None, tags=None, heads=None, + deps=None, entities=None, morphology=None, brackets=None): + t = TokenAnnotation(ids=ids, words=words, tags=tags, + heads=heads, deps=deps, entities=entities, + morphology=morphology, brackets=brackets) + self.token_annotations.append(t) + + def add_doc_annotation(self, cats=None, links=None): + if cats: + self.doc_annotation.cats.update(cats) + if links: + self.doc_annotation.links.update(links) + + def merge_sents(self): + """ Merge the list of token annotations into one object and return this new object """ + m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation) + m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], [] + m_brackets = [] + i = 0 + for t in self.token_annotations: + m_ids.extend(id_ + i for id_ in t.ids) + m_words.extend(t.words) + m_tags.extend(t.tags) + m_heads.extend(head + i if head else None for head in t.heads) + m_deps.extend(t.deps) + m_ents.extend(t.entities) + m_morph.extend(t.morphology) + m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) + for b in t.brackets) + i += len(t.ids) + m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags, + heads=m_heads, deps=m_deps, entities=m_ents, + morphology=m_morph, brackets=m_brackets) + return m_example + + + def get_gold_parses(self, merge=False, vocab=None): + """Return a list of (doc, GoldParse) objects. 
+ If merge is set to True, add all Token annotations to one big list.""" + d = self.doc_annotation + # merging different sentences + if merge: + merged_example = self.merge_sents() + assert(len(merged_example.token_annotations)) == 1 + t = merged_example.token_annotations[0] + m_doc = merged_example.doc + if not m_doc: + if not vocab: + raise ValueError(Errors.E998) + m_doc = Doc(vocab, words=t.words) + try: + gp = GoldParse.from_annotation(m_doc, d, t, make_projective=self.make_projective) + except AlignmentError: + if self.ignore_misaligned: + gp = None + else: + raise + return [(self.doc, gp)] + # we only have one sentence and an appropriate doc + elif len(self.token_annotations) == 1 and self.doc is not None: + t = self.token_annotations[0] + try: + gp = GoldParse.from_annotation(self.doc, d, t, make_projective=self.make_projective) + except AlignmentError: + if self.ignore_misaligned: + gp = None + else: + raise + return [(self.doc, gp)] + # not merging: one GoldParse per 'sentence', defining docs with the words from each sentence + else: + parses = [] + for t in self.token_annotations: + if not vocab: + raise ValueError(Errors.E998) + t_doc = Doc(vocab, words=t.words) + try: + gp = GoldParse.from_annotation(t_doc, d, t, make_projective=self.make_projective) + except AlignmentError: + if self.ignore_misaligned: + gp = None + else: + raise + if gp is not None: + parses.append((t_doc, gp)) + return parses + + @classmethod + def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False): + """ + Return a list of Example objects, from a variety of input formats. 
+ make_doc needs to be provided when the examples contain text strings and keep_raw_text=False + """ + if isinstance(examples, Example): + return [examples] + if isinstance(examples, tuple): + examples = [examples] + converted_examples = [] + for ex in examples: + # convert string to Doc to Example + if isinstance(ex, basestring_): + if keep_raw_text: + converted_examples.append(Example(doc=ex)) + else: + doc = make_doc(ex) + converted_examples.append(Example(doc=doc)) + # convert Doc to Example + elif isinstance(ex, Doc): + converted_examples.append(Example(doc=ex)) + # convert tuples to Example + elif isinstance(ex, tuple) and len(ex) == 2: + doc, gold = ex + gold_dict = {} + # convert string to Doc + if isinstance(doc, basestring_) and not keep_raw_text: + doc = make_doc(doc) + # convert dict to GoldParse + if isinstance(gold, dict): + gold_dict = gold + if doc is not None or gold.get("words", None) is not None: + gold = GoldParse(doc, **gold) + else: + gold = None + if gold is not None: + converted_examples.append(Example.from_gold(goldparse=gold, doc=doc)) + else: + raise ValueError(Errors.E999.format(gold_dict=gold_dict)) + else: + converted_examples.append(ex) + return converted_examples + + cdef class GoldParse: """Collection for training annotations. 
DOCS: https://spacy.io/api/goldparse """ @classmethod - def from_annot_tuples(cls, doc, annot_tuples, cats=None, make_projective=False): - _, words, tags, heads, deps, entities = annot_tuples - return cls(doc, words=words, tags=tags, heads=heads, deps=deps, - entities=entities, cats=cats, + def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): + return cls(doc, words=token_annotation.words, tags=token_annotation.tags, + heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities, + morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links, make_projective=make_projective) - def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None, + def get_token_annotation(self): + ids = None + if self.words: + ids = list(range(len(self.words))) + + return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, + heads=self.heads, deps=self.labels, entities=self.ner, + morphology=self.morphology) + + def __init__(self, doc, words=None, tags=None, morphology=None, heads=None, deps=None, entities=None, make_projective=False, - cats=None, links=None, **_): + cats=None, links=None): """Create a GoldParse. The fields will not be initialized if len(doc) is zero. doc (Doc): The document the annotations refer to. 
@@ -688,19 +926,19 @@ cdef class GoldParse: self.length = len(doc) self.cats = {} if cats is None else dict(cats) - self.links = links + self.links = {} if links is None else dict(links) # avoid allocating memory if the doc does not contain any tokens if self.length > 0: - if words is None: + if not words: words = [token.text for token in doc] - if tags is None: + if not tags: tags = [None for _ in words] - if heads is None: + if not heads: heads = [None for _ in words] - if deps is None: + if not deps: deps = [None for _ in words] - if morphology is None: + if not morphology: morphology = [None for _ in words] if entities is None: entities = ["-" for _ in words] @@ -710,7 +948,7 @@ cdef class GoldParse: # Translate the None values to '-', to make processing easier. # See Issue #2603 entities = [(ent if ent is not None else "-") for ent in entities] - if not isinstance(entities[0], basestring): + if not isinstance(entities[0], basestring_): # Assume we have entities specified by character offset. 
entities = biluo_tags_from_offsets(doc, entities) @@ -745,8 +983,9 @@ cdef class GoldParse: self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] - annot_tuples = (range(len(words)), words, tags, heads, deps, entities) - self.orig_annot = list(zip(*annot_tuples)) + self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags, + heads=heads, deps=deps, entities=entities, morphology=morphology, + brackets=[]) for i, gold_i in enumerate(self.cand_to_gold): if doc[i].text.isspace(): diff --git a/spacy/language.py b/spacy/language.py index d53710f58..3106c6afe 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -3,6 +3,8 @@ from __future__ import absolute_import, unicode_literals import random import itertools + +from spacy.gold import Example from spacy.util import minibatch import weakref import functools @@ -409,7 +411,7 @@ class Language(object): def __call__(self, text, disable=[], component_cfg=None): """Apply the pipeline to some text. The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string + and can contain arbitrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. 
@@ -452,30 +454,10 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def _format_docs_and_golds(self, docs, golds): - """Format golds and docs before update models.""" - expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links") - gold_objs = [] - doc_objs = [] - for doc, gold in zip(docs, golds): - if isinstance(doc, basestring_): - doc = self.make_doc(doc) - if not isinstance(gold, GoldParse): - unexpected = [k for k in gold if k not in expected_keys] - if unexpected: - err = Errors.E151.format(unexp=unexpected, exp=expected_keys) - raise ValueError(err) - gold = GoldParse(doc, **gold) - doc_objs.append(doc) - gold_objs.append(gold) - - return doc_objs, gold_objs - - def update(self, docs, golds, drop=0.0, sgd=None, losses=None, component_cfg=None): + def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None): """Update the models in the pipeline. - docs (iterable): A batch of `Doc` objects. - golds (iterable): A batch of `GoldParse` objects. + examples (iterable): A batch of `Example` or `Doc` objects. drop (float): The dropout rate. sgd (callable): An optimizer. losses (dict): Dictionary to update with the loss, keyed by component. @@ -484,18 +466,16 @@ class Language(object): DOCS: https://spacy.io/api/language#update """ - if len(docs) != len(golds): - raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds))) - if len(docs) == 0: + if len(examples) == 0: return + examples = Example.to_example_objects(examples, make_doc=self.make_doc) + if sgd is None: if self._optimizer is None: self._optimizer = create_default_optimizer(Model.ops) sgd = self._optimizer - # Allow dict of args to GoldParse, instead of GoldParse objects. 
- docs, golds = self._format_docs_and_golds(docs, golds) - grads = {} + grads = {} def get_grads(W, dW, key=None): grads[key] = (W, dW) @@ -512,18 +492,18 @@ class Language(object): grads = {} kwargs = component_cfg.get(name, {}) kwargs.setdefault("drop", drop) - proc.update(docs, golds, sgd=get_grads, losses=losses, **kwargs) + proc.update(examples, sgd=get_grads, losses=losses, **kwargs) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) - def rehearse(self, docs, sgd=None, losses=None, config=None): + def rehearse(self, examples, sgd=None, losses=None, config=None): """Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some data, and update the model so its current predictions are more like the initial ones. This is useful for keeping a pretrained model on-track, even if you're updating it with a smaller set of examples. - docs (iterable): A batch of `Doc` objects. + examples (iterable): A batch of `Doc` objects. drop (float): The dropout rate. sgd (callable): An optimizer. RETURNS (dict): Results from the update. 
@@ -531,22 +511,18 @@ class Language(object): EXAMPLE: >>> raw_text_batches = minibatch(raw_texts) >>> for labelled_batch in minibatch(zip(train_docs, train_golds)): - >>> docs, golds = zip(*train_docs) - >>> nlp.update(docs, golds) + >>> nlp.update(labelled_batch) >>> raw_batch = [nlp.make_doc(text) for text in next(raw_text_batches)] >>> nlp.rehearse(raw_batch) """ # TODO: document - if len(docs) == 0: + if len(examples) == 0: return + examples = Example.to_example_objects(examples, make_doc=self.make_doc) if sgd is None: if self._optimizer is None: self._optimizer = create_default_optimizer(Model.ops) sgd = self._optimizer - docs = list(docs) - for i, doc in enumerate(docs): - if isinstance(doc, basestring_): - docs[i] = self.make_doc(doc) pipes = list(self.pipeline) random.shuffle(pipes) if config is None: @@ -563,44 +539,45 @@ class Language(object): if not hasattr(proc, "rehearse"): continue grads = {} - proc.rehearse(docs, sgd=get_grads, losses=losses, **config.get(name, {})) + proc.rehearse(examples, sgd=get_grads, losses=losses, **config.get(name, {})) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) return losses - def preprocess_gold(self, docs_golds): + def preprocess_gold(self, examples): """Can be called before training to pre-process gold data. By default, it handles nonprojectivity and adds missing tags to the tag map. - docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. - YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. + examples (iterable): `Example` objects. + YIELDS (tuple): `Example` objects. 
""" for name, proc in self.pipeline: if hasattr(proc, "preprocess_gold"): - docs_golds = proc.preprocess_gold(docs_golds) - for doc, gold in docs_golds: - yield doc, gold + examples = proc.preprocess_gold(examples) + for ex in examples: + yield ex - def begin_training(self, get_gold_tuples=None, sgd=None, component_cfg=None, **cfg): + def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg): """Allocate models, pre-process training data and acquire a trainer and optimizer. Used as a contextmanager. - get_gold_tuples (function): Function returning gold data + get_examples (function): Function returning example training data (TODO: document format change since 3.0) component_cfg (dict): Config parameters for specific components. **cfg: Config parameters. RETURNS: An optimizer. DOCS: https://spacy.io/api/language#begin_training """ - if get_gold_tuples is None: - get_gold_tuples = lambda: [] + # TODO: throw warning when get_gold_tuples is provided instead of get_examples + if get_examples is None: + get_examples = lambda: [] # Populate vocab else: - for _, annots_brackets in get_gold_tuples(): - _ = annots_brackets.pop() - for annots, _ in annots_brackets: - for word in annots[1]: + for example in get_examples(): + for token_annotation in example.token_annotations: + for word in token_annotation.words: _ = self.vocab[word] # noqa: F841 + if cfg.get("device", -1) >= 0: util.use_gpu(cfg["device"]) if self.vocab.vectors.data.shape[1] >= 1: @@ -618,7 +595,7 @@ class Language(object): kwargs = component_cfg.get(name, {}) kwargs.update(cfg) proc.begin_training( - get_gold_tuples, + get_examples, pipeline=self.pipeline, sgd=self._optimizer, **kwargs @@ -650,11 +627,11 @@ class Language(object): return self._optimizer def evaluate( - self, docs_golds, verbose=False, batch_size=256, scorer=None, component_cfg=None + self, examples, verbose=False, batch_size=256, scorer=None, component_cfg=None ): """Evaluate a model's pipeline components. 
- docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. + examples (iterable): `Example` objects. verbose (bool): Print debugging information. batch_size (int): Batch size to use. scorer (Scorer): Optional `Scorer` to use. If not passed in, a new one @@ -665,30 +642,24 @@ class Language(object): DOCS: https://spacy.io/api/language#evaluate """ + examples = Example.to_example_objects(examples, make_doc=self.make_doc) if scorer is None: scorer = Scorer(pipeline=self.pipeline) if component_cfg is None: component_cfg = {} - docs, golds = zip(*docs_golds) - docs = [ - self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs - ] - golds = list(golds) for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) kwargs.setdefault("batch_size", batch_size) if not hasattr(pipe, "pipe"): - docs = _pipe(pipe, docs, kwargs) + examples = _pipe(pipe, examples, kwargs) else: - docs = pipe.pipe(docs, **kwargs) - for doc, gold in zip(docs, golds): - if not isinstance(gold, GoldParse): - gold = GoldParse(doc, **gold) + examples = pipe.pipe(examples, as_example=True, **kwargs) + for ex in examples: if verbose: - print(doc) + print(ex.doc) kwargs = component_cfg.get("scorer", {}) kwargs.setdefault("verbose", verbose) - scorer.score(doc, gold, **kwargs) + scorer.score(ex, **kwargs) return scorer @contextmanager @@ -733,6 +704,7 @@ class Language(object): cleanup=False, component_cfg=None, n_process=1, + as_example=False ): """Process texts as a stream, and yield `Doc` objects in order. @@ -770,6 +742,7 @@ class Language(object): batch_size=batch_size, disable=disable, component_cfg=component_cfg, + as_example=False ) for doc, context in izip(docs, contexts): yield (doc, context) @@ -1095,15 +1068,15 @@ class DisabledPipes(list): self[:] = [] -def _pipe(docs, proc, kwargs): +def _pipe(examples, proc, kwargs): # We added some args for pipe that __call__ doesn't expect. 
kwargs = dict(kwargs) for arg in ["n_threads", "batch_size"]: if arg in kwargs: kwargs.pop(arg) - for doc in docs: - doc = proc(doc, **kwargs) - yield doc + for ex in examples: + ex = proc(ex, **kwargs) + yield ex def _apply_pipes(make_doc, pipes, reciever, sender): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 72e31f120..adcff9280 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -97,18 +97,19 @@ class Morphologizer(Pipe): if doc[j].morph.pos != 0: doc.c[j].pos = doc[j].morph.pos - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., sgd=None, losses=None): if losses is not None and self.name not in losses: losses[self.name] = 0. + docs = [self._get_doc(ex) for ex in examples] tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) - loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) + loss, d_tag_scores = self.get_loss(examples, tag_scores) bp_tag_scores(d_tag_scores, sgd=sgd) if losses is not None: losses[self.name] += loss - def get_loss(self, docs, golds, scores): + def get_loss(self, examples, scores): guesses = [] for doc_scores in scores: guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes)) @@ -122,7 +123,9 @@ class Morphologizer(Pipe): # Do this on CPU, as we can't vectorize easily. 
target = numpy.zeros(scores.shape, dtype='f') field_sizes = self.model.softmax.out_sizes - for doc, gold in zip(docs, golds): + for example in examples: + doc = example.doc + gold = example.gold for t, features in enumerate(gold.morphology): if features is None: target[idx] = scores[idx] @@ -146,6 +149,7 @@ class Morphologizer(Pipe): scores = self.model.ops.asarray(scores, dtype='f') d_scores = scores - target loss = (d_scores**2).sum() + docs = [self._get_doc(ex) for ex in examples] d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d29cf9ce9..1d67d8e16 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -13,6 +13,7 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module +from spacy.gold import Example from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -59,11 +60,17 @@ class Pipe(object): def from_nlp(cls, nlp, **cfg): return cls(nlp.vocab, **cfg) + def _get_doc(self, example): + """ Use this method if the `example` method can be both a Doc or an Example """ + if isinstance(example, Doc): + return example + return example.doc + def __init__(self, vocab, model=True, **cfg): """Create a new pipe instance.""" raise NotImplementedError - def __call__(self, doc): + def __call__(self, example): """Apply the pipe to one document. The document is modified in-place, and returned. @@ -71,12 +78,16 @@ class Pipe(object): and `set_annotations()` methods. 
""" self.require_model() + doc = self._get_doc(example) predictions = self.predict([doc]) if isinstance(predictions, tuple) and len(predictions) == 2: scores, tensors = predictions self.set_annotations([doc], scores, tensors=tensors) else: self.set_annotations([doc], predictions) + if isinstance(example, Example): + example.doc = doc + return example return doc def require_model(self): @@ -84,21 +95,30 @@ class Pipe(object): if getattr(self, "model", None) in (None, True, False): raise ValueError(Errors.E109.format(name=self.name)) - def pipe(self, stream, batch_size=128, n_threads=-1): + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): """Apply the pipe to a stream of documents. Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) + for examples in util.minibatch(stream, size=batch_size): + examples = list(examples) + docs = [self._get_doc(ex) for ex in examples] predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) - yield from docs + + if as_example: + examples = [] + for ex, doc in zip(examples, docs): + ex.doc = doc + examples.append(ex) + yield from examples + else: + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without @@ -111,7 +131,7 @@ class Pipe(object): """Modify a batch of documents, using pre-computed scores.""" raise NotImplementedError - def update(self, docs, golds, drop=0.0, sgd=None, losses=None): + def update(self, examples, drop=0.0, sgd=None, losses=None): """Learn from a batch of documents and gold-standard information, updating the pipe's model. 
@@ -119,12 +139,12 @@ class Pipe(object): """ pass - def rehearse(self, docs, sgd=None, losses=None, **config): + def rehearse(self, examples, sgd=None, losses=None, **config): pass - def get_loss(self, docs, golds, scores): + def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of - documents and their predicted scores.""" + examples (with embedded docs) and their predicted scores.""" raise NotImplementedError def add_label(self, label): @@ -140,7 +160,7 @@ class Pipe(object): return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {})) def begin_training( - self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs + self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs ): """Initialize the pipe for training, using data exampes if available. If no model has been initialized yet, the model is added.""" @@ -264,29 +284,41 @@ class Tensorizer(Pipe): self.cfg = dict(cfg) self.cfg.setdefault("cnn_maxout_pieces", 3) - def __call__(self, doc): + def __call__(self, example): """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM model. Vectors are set to the `Doc.tensor` attribute. docs (Doc or iterable): One or more documents to add vectors to. RETURNS (dict or None): Intermediate computations. """ + doc = self._get_doc(example) tokvecses = self.predict([doc]) self.set_annotations([doc], tokvecses) + if isinstance(example, Example): + example.doc = doc + return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1): + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): """Process `Doc` objects as a stream. - stream (iterator): A sequence of `Doc` objects to process. - batch_size (int): Number of `Doc` objects to group. - YIELDS (iterator): A sequence of `Doc` objects, in order of input. + stream (iterator): A sequence of `Doc` or `Example` objects to process. + batch_size (int): Number of `Doc` or `Example` objects to group. 
+ YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input. """ - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) + for examples in util.minibatch(stream, size=batch_size): + docs = [self._get_doc(ex) for ex in examples] tensors = self.predict(docs) self.set_annotations(docs, tensors) - yield from docs + + if as_example: + examples = [] + for ex, doc in zip(examples, docs): + ex.doc = doc + examples.append(ex) + yield from examples + else: + yield from docs def predict(self, docs): """Return a single tensor for a batch of documents. @@ -310,7 +342,7 @@ class Tensorizer(Pipe): raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) doc.tensor = tensor - def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): + def update(self, examples, state=None, drop=0.0, sgd=None, losses=None): """Update the model. docs (iterable): A batch of `Doc` objects. @@ -320,17 +352,16 @@ class Tensorizer(Pipe): RETURNS (dict): Results from the update. 
""" self.require_model() - if isinstance(docs, Doc): - docs = [docs] + examples = Example.to_example_objects(examples) inputs = [] bp_inputs = [] for tok2vec in self.input_models: - tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop) + tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples], drop=drop) inputs.append(tensor) bp_inputs.append(bp_tensor) inputs = self.model.ops.xp.hstack(inputs) scores, bp_scores = self.model.begin_update(inputs, drop=drop) - loss, d_scores = self.get_loss(docs, golds, scores) + loss, d_scores = self.get_loss(examples, scores) d_inputs = bp_scores(d_scores, sgd=sgd) d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1) for d_input, bp_input in zip(d_inputs, bp_inputs): @@ -340,18 +371,19 @@ class Tensorizer(Pipe): losses[self.name] += loss return loss - def get_loss(self, docs, golds, prediction): - ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + def get_loss(self, examples, prediction): + examples = Example.to_example_objects(examples) + ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) target = self.vocab.vectors.data[ids] d_scores = (prediction - target) / prediction.shape[0] loss = (d_scores ** 2).sum() return loss, d_scores - def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): """Allocate models, pre-process training data and acquire an optimizer. - gold_tuples (iterable): Gold-standard training data. + get_examples (iterable): Gold-standard training data. pipeline (list): The pipeline the model is part of. 
""" if pipeline is not None: @@ -391,17 +423,30 @@ class Tagger(Pipe): else: return chain(self.model.tok2vec, flatten) - def __call__(self, doc): + def __call__(self, example): + doc = self._get_doc(example) tags, tokvecs = self.predict([doc]) self.set_annotations([doc], tags, tensors=tokvecs) + if isinstance(example, Example): + example.doc = doc + return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + for examples in util.minibatch(stream, size=batch_size): + examples = list(examples) + docs = [self._get_doc(ex) for ex in examples] tag_ids, tokvecs = self.predict(docs) self.set_annotations(docs, tag_ids, tensors=tokvecs) - yield from docs + + if as_example: + examples = [] + for ex, doc in zip(examples, docs): + ex.doc = doc + examples.append(ex) + yield from examples + else: + yield from docs def predict(self, docs): self.require_model() @@ -452,47 +497,51 @@ class Tagger(Pipe): doc.extend_tensor(tensors[i]) doc.is_tagged = True - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., sgd=None, losses=None): self.require_model() + examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. - if not any(len(doc) for doc in docs): + if not any(len(ex.doc) if ex.doc else 0 for ex in examples): # Handle cases where there are no tokens in any docs. 
return - tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) - loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) + tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop) + loss, d_tag_scores = self.get_loss(examples, tag_scores) bp_tag_scores(d_tag_scores, sgd=sgd) if losses is not None: losses[self.name] += loss - def rehearse(self, docs, drop=0., sgd=None, losses=None): + def rehearse(self, examples, drop=0., sgd=None, losses=None): """Perform a 'rehearsal' update, where we try to match the output of an initial model. """ if self._rehearsal_model is None: return + examples = Example.to_example_objects(examples) + docs = [ex.doc for ex in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return guesses, backprop = self.model.begin_update(docs, drop=drop) - target = self._rehearsal_model(docs) + target = self._rehearsal_model(examples) gradient = guesses - target backprop(gradient, sgd=sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += (gradient**2).sum() - def get_loss(self, docs, golds, scores): + def get_loss(self, examples, scores): scores = self.model.ops.flatten(scores) tag_index = {tag: i for i, tag in enumerate(self.labels)} cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) known_labels = numpy.ones((scores.shape[0], 1), dtype="f") - for gold in golds: + for ex in examples: + gold = ex.gold for tag in gold.tags: if tag is None: correct[idx] = guesses[idx] @@ -506,20 +555,20 @@ class Tagger(Pipe): d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() + docs = [ex.doc for ex in examples] d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, + def 
begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): user_warning(Warnings.W022) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() - for raw_text, annots_brackets in get_gold_tuples(): - for annots, brackets in annots_brackets: - ids, words, tags, heads, deps, ents = annots - for tag in tags: + for example in get_examples(): + for token_annotation in example.token_annotations: + for tag in token_annotation.tags: if tag in orig_tag_map: new_tag_map[tag] = orig_tag_map[tag] else: @@ -698,14 +747,14 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids, tensors=None): pass - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None, + def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, sgd=None, **kwargs): - gold_tuples = nonproj.preprocess_training_data(get_gold_tuples()) - for raw_text, annots_brackets in gold_tuples: - for annots, brackets in annots_brackets: - ids, words, tags, heads, deps, ents = annots - for i in range(len(ids)): - label = self.make_label(i, words, tags, heads, deps, ents) + gold_examples = nonproj.preprocess_training_data(get_examples()) + # for raw_text, doc_annot in gold_tuples: + for example in gold_examples: + for token_annotation in example.token_annotations: + for i in range(len(token_annotation.ids)): + label = self.make_label(i, token_annotation) if label is not None and label not in self.labels: self.labels[label] = len(self.labels) if self.model is True: @@ -735,18 +784,17 @@ class MultitaskObjective(Tagger): scores = self.model.softmax(tokvecs) return tokvecs, scores - def get_loss(self, docs, golds, scores): - if len(docs) != len(golds): - raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs), - n_golds=len(golds))) + def get_loss(self, examples, 
scores): cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) + golds = [ex.gold for ex in examples] + docs = [ex.doc for ex in examples] for i, gold in enumerate(golds): for j in range(len(docs[i])): - # Handes alignment for tokenization differences - label = self.make_label(j, gold.words, gold.tags, - gold.heads, gold.labels, gold.ents) + # Handels alignment for tokenization differences + token_annotation = gold.get_token_annotation() + label = self.make_label(j, token_annotation) if label is None or label not in self.labels: correct[idx] = guesses[idx] else: @@ -758,39 +806,39 @@ class MultitaskObjective(Tagger): return float(loss), d_scores @staticmethod - def make_dep(i, words, tags, heads, deps, ents): - if deps[i] is None or heads[i] is None: + def make_dep(i, token_annotation): + if token_annotation.deps[i] is None or token_annotation.heads[i] is None: return None - return deps[i] + return token_annotation.deps[i] @staticmethod - def make_tag(i, words, tags, heads, deps, ents): - return tags[i] + def make_tag(i, token_annotation): + return token_annotation.tags[i] @staticmethod - def make_ent(i, words, tags, heads, deps, ents): - if ents is None: + def make_ent(i, token_annotation): + if token_annotation.entities is None: return None - return ents[i] + return token_annotation.entities[i] @staticmethod - def make_dep_tag_offset(i, words, tags, heads, deps, ents): - if deps[i] is None or heads[i] is None: + def make_dep_tag_offset(i, token_annotation): + if token_annotation.deps[i] is None or token_annotation.heads[i] is None: return None - offset = heads[i] - i + offset = token_annotation.heads[i] - i offset = min(offset, 2) offset = max(offset, -2) - return "%s-%s:%d" % (deps[i], tags[i], offset) + return "%s-%s:%d" % (token_annotation.deps[i], token_annotation.tags[i], offset) @staticmethod - def make_ent_tag(i, words, tags, heads, deps, ents): - if ents is None or ents[i] is None: + def make_ent_tag(i, 
token_annotation): + if token_annotation.entities is None or token_annotation.entities[i] is None: return None else: - return "%s-%s" % (tags[i], ents[i]) + return "%s-%s" % (token_annotation.tags[i], token_annotation.entities[i]) @staticmethod - def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}): + def make_sent_start(target, token_annotation, cache=True, _cache={}): """A multi-task objective for representing sentence boundaries, using BILU scheme. (O is impossible) @@ -799,6 +847,8 @@ class MultitaskObjective(Tagger): of gold data. You can pass cache=False if you know the cache will do the wrong thing. """ + words = token_annotation.words + heads = token_annotation.heads assert len(words) == len(heads) assert target < len(words), (target, len(words)) if cache: @@ -857,7 +907,7 @@ class ClozeMultitask(Pipe): def set_annotations(self, docs, dep_ids, tensors=None): pass - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, + def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, sgd=None, **kwargs): link_vectors_to_models(self.vocab) if self.model is True: @@ -874,25 +924,26 @@ class ClozeMultitask(Pipe): vectors = self.model.output_layer(tokvecs) return tokvecs, vectors - def get_loss(self, docs, vectors, prediction): + def get_loss(self, examples, vectors, prediction): # The simplest way to implement this would be to vstack the # token.vector values, but that's a bit inefficient, especially on GPU. # Instead we fetch the index into the vectors table for each of our tokens, # and look them up all at once. This prevents data copying. 
- ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) target = vectors[ids] loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True) return float(loss), gradient - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., sgd=None, losses=None): pass - def rehearse(self, docs, drop=0., sgd=None, losses=None): + def rehearse(self, examples, drop=0., sgd=None, losses=None): self.require_model() + examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. - predictions, bp_predictions = self.model.begin_update(docs, drop=drop) - loss, d_predictions = self.get_loss(docs, self.vocab.vectors.data, predictions) + predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples], drop=drop) + loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions, sgd=sgd) if losses is not None: @@ -947,12 +998,21 @@ class TextCategorizer(Pipe): def labels(self, value): self.cfg["labels"] = tuple(value) - def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + for examples in util.minibatch(stream, size=batch_size): + examples = list(examples) + docs = [self._get_doc(ex) for ex in examples] scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) - yield from docs + + if as_example: + examples = [] + for ex, doc in zip(examples, docs): + ex.doc = doc + examples.append(ex) + yield from examples + else: + yield from docs def predict(self, docs): self.require_model() @@ -973,33 +1033,37 @@ class TextCategorizer(Pipe): for j, label in enumerate(self.labels): doc.cats[label] = float(scores[i, j]) - def update(self, docs, 
golds, state=None, drop=0., sgd=None, losses=None): + def update(self, examples, state=None, drop=0., sgd=None, losses=None): self.require_model() - if not any(len(doc) for doc in docs): + examples = Example.to_example_objects(examples) + if not any(len(ex.doc) if ex.doc else 0 for ex in examples): # Handle cases where there are no tokens in any docs. return - scores, bp_scores = self.model.begin_update(docs, drop=drop) - loss, d_scores = self.get_loss(docs, golds, scores) + scores, bp_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop) + loss, d_scores = self.get_loss(examples, scores) bp_scores(d_scores, sgd=sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += loss - def rehearse(self, docs, drop=0., sgd=None, losses=None): + def rehearse(self, examples, drop=0., sgd=None, losses=None): if self._rehearsal_model is None: return + examples = Example.to_example_objects(examples) + docs=[ex.doc for ex in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
return scores, bp_scores = self.model.begin_update(docs, drop=drop) - target = self._rehearsal_model(docs) + target = self._rehearsal_model(examples) gradient = scores - target bp_scores(gradient, sgd=sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += (gradient**2).sum() - def get_loss(self, docs, golds, scores): + def get_loss(self, examples, scores): + golds = [ex.gold for ex in examples] truths = numpy.zeros((len(golds), len(self.labels)), dtype="f") not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f") for i, gold in enumerate(golds): @@ -1032,11 +1096,10 @@ class TextCategorizer(Pipe): self.labels = tuple(list(self.labels) + [label]) return 1 - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): - for raw_text, annot_brackets in get_gold_tuples(): - for _, (cats, _2) in annot_brackets: - for cat in cats: - self.add_label(cat) + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): + for example in get_examples(): + for cat in example.doc_annotation.cats: + self.add_label(cat) if self.model is True: self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") self.require_labels() @@ -1074,10 +1137,10 @@ cdef class DependencyParser(Parser): labeller = MultitaskObjective(self.vocab, target=target) self._multitasks.append(labeller) - def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg): + def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: tok2vec = self.model.tok2vec - labeller.begin_training(get_gold_tuples, pipeline=pipeline, + labeller.begin_training(get_examples, pipeline=pipeline, tok2vec=tok2vec, sgd=sgd) def __reduce__(self): @@ -1116,10 +1179,10 @@ cdef class EntityRecognizer(Parser): labeller = MultitaskObjective(self.vocab, target=target) self._multitasks.append(labeller) - def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, 
**cfg): + def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: tok2vec = self.model.tok2vec - labeller.begin_training(get_gold_tuples, pipeline=pipeline, + labeller.begin_training(get_examples, pipeline=pipeline, tok2vec=tok2vec) def __reduce__(self): @@ -1175,7 +1238,7 @@ class EntityLinker(Pipe): if getattr(self, "kb", None) in (None, True, False): raise ValueError(Errors.E139.format(name=self.name)) - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): self.require_kb() self.cfg["entity_width"] = self.kb.entity_vector_length @@ -1187,25 +1250,18 @@ class EntityLinker(Pipe): return sgd - def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): + def update(self, examples, state=None, drop=0.0, sgd=None, losses=None): self.require_model() self.require_kb() - if losses is not None: losses.setdefault(self.name, 0.0) - - if not docs or not golds: + if not examples: return 0 - - if len(docs) != len(golds): - raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs), - n_golds=len(golds))) - - if isinstance(docs, Doc): - docs = [docs] - golds = [golds] + examples = Example.to_example_objects(examples) sentence_docs = [] + docs = [ex.doc for ex in examples] + golds = [ex.gold for ex in examples] for doc, gold in zip(docs, golds): ents_by_offset = dict() @@ -1219,19 +1275,19 @@ class EntityLinker(Pipe): ent = ents_by_offset[(start, end)] for kb_id, value in kb_dict.items(): - # Currently only training on the positive instances + # Currently only training on the positive instances - we assume there is at least 1 per doc/gold if value: sentence_docs.append(ent.sent.as_doc()) sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) - loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) + loss, 
d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds) bp_context(d_scores, sgd=sgd) if losses is not None: losses[self.name] += loss return loss - def get_similarity_loss(self, docs, golds, scores): + def get_similarity_loss(self, golds, scores): entity_encodings = [] for gold in golds: for entity, kb_dict in gold.links.items(): @@ -1244,16 +1300,16 @@ class EntityLinker(Pipe): entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") if scores.shape != entity_encodings.shape: - raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up")) + raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up")) loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings) loss = loss / len(entity_encodings) return loss, gradients - def get_loss(self, docs, golds, scores): + def get_loss(self, examples, scores): cats = [] - for gold in golds: - for entity, kb_dict in gold.links.items(): + for ex in examples: + for entity, kb_dict in ex.gold.links.items(): for kb_id, value in kb_dict.items(): cats.append([value]) @@ -1266,17 +1322,30 @@ class EntityLinker(Pipe): loss = loss / len(cats) return loss, d_scores - def __call__(self, doc): + def __call__(self, example): + doc = self._get_doc(example) kb_ids, tensors = self.predict([doc]) self.set_annotations([doc], kb_ids, tensors=tensors) + if isinstance(example, Example): + example.doc = doc + return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + for examples in util.minibatch(stream, size=batch_size): + examples = list(examples) + docs = [self._get_doc(ex) for ex in examples] kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) - yield from docs + + if as_example: + examples = [] + for ex, doc in 
zip(examples, docs): + ex.doc = doc + examples.append(ex) + yield from examples + else: + yield from docs def predict(self, docs): """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ @@ -1408,7 +1477,7 @@ class EntityLinker(Pipe): util.from_disk(path, deserialize, exclude) return self - def rehearse(self, docs, sgd=None, losses=None, **config): + def rehearse(self, examples, sgd=None, losses=None, **config): raise NotImplementedError def add_label(self, label): @@ -1416,7 +1485,7 @@ class EntityLinker(Pipe): @component("sentencizer", assigns=["token.is_sent_start", "doc.sents"]) -class Sentencizer(object): +class Sentencizer(Pipe): """Segment the Doc into sentences using a rule-based strategy. DOCS: https://spacy.io/api/sentencizer @@ -1451,14 +1520,15 @@ class Sentencizer(object): def from_nlp(cls, nlp, **cfg): return cls(**cfg) - def __call__(self, doc): + def __call__(self, example): """Apply the sentencizer to a Doc and set Token.is_sent_start. - doc (Doc): The document to process. - RETURNS (Doc): The processed Doc. + example (Doc or Example): The document to process. + RETURNS (Doc or Example): The processed Doc or Example. 
DOCS: https://spacy.io/api/sentencizer#call """ + doc = self._get_doc(example) start = 0 seen_period = False for i, token in enumerate(doc): @@ -1472,6 +1542,9 @@ class Sentencizer(object): seen_period = True if start < len(doc): doc[start].is_sent_start = True + if isinstance(example, Example): + example.doc = doc + return example return doc def to_bytes(self, **kwargs): diff --git a/spacy/scorer.py b/spacy/scorer.py index 0b4843f41..25c6935f3 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -3,7 +3,7 @@ from __future__ import division, print_function, unicode_literals import numpy as np -from .gold import tags_to_entities, GoldParse +from .gold import tags_to_entities, GoldParse, DocAnnotation from .errors import Errors @@ -217,11 +217,10 @@ class Scorer(object): "textcats_per_cat": self.textcats_per_cat, } - def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")): + def score(self, example, verbose=False, punct_labels=("p", "punct")): """Update the evaluation scores from a single Doc / GoldParse pair. - doc (Doc): The predicted annotations. - gold (GoldParse): The correct annotations. + example (Example): The predicted annotations + correct annotations. verbose (bool): Print debugging information. punct_labels (tuple): Dependency labels for punctuation. 
Used to evaluate dependency attachments to punctuation if `eval_punct` is @@ -229,15 +228,22 @@ class Scorer(object): DOCS: https://spacy.io/api/scorer#score """ + if isinstance(example, tuple) and len(example) == 2: + doc, gold = example + else: + gold = example.gold + doc = example.doc + if len(doc) != len(gold): - gold = GoldParse.from_annot_tuples( - doc, tuple(zip(*gold.orig_annot)) + (gold.cats,) - ) + doc_annotation = DocAnnotation(cats=gold.cats) + token_annotation = gold.orig + gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation]) + orig = gold.orig gold_deps = set() gold_deps_per_dep = {} gold_tags = set() - gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot])) - for id_, word, tag, head, dep, ner in gold.orig_annot: + gold_ents = set(tags_to_entities(orig.entities)) + for id_, tag, head, dep in zip(orig.ids, orig.tags, orig.heads, orig.deps): gold_tags.add((id_, tag)) if dep not in (None, "") and dep.lower() not in punct_labels: gold_deps.add((id_, head, dep.lower())) @@ -272,7 +278,7 @@ class Scorer(object): if token.dep_.lower() not in cand_deps_per_dep: cand_deps_per_dep[token.dep_.lower()] = set() cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower())) - if "-" not in [token[-1] for token in gold.orig_annot]: + if "-" not in orig.entities: # Find all NER labels in gold and doc ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents]) # Set up all labels for per type scoring and prepare gold per type @@ -336,7 +342,7 @@ class Scorer(object): Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) ) if verbose: - gold_words = [item[1] for item in gold.orig_annot] + gold_words = orig.words for w_id, h_id, dep in cand_deps - gold_deps: print("F", gold_words[w_id], dep, gold_words[h_id]) for w_id, h_id, dep in gold_deps - cand_deps: diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index eb39124ce..0a99609a8 100644 --- 
a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -341,10 +341,10 @@ cdef class ArcEager(TransitionSystem): for label in kwargs.get('right_labels', []): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 - for raw_text, sents in kwargs.get('gold_parses', []): - for (ids, words, tags, heads, labels, iob), ctnts in sents: - heads, labels = nonproj.projectivize(heads, labels) - for child, head, label in zip(ids, heads, labels): + for example in kwargs.get('gold_parses', []): + for token_annotation in example.token_annotations: + heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps) + for child, head, label in zip(token_annotation.ids, heads, labels): if label.upper() == 'ROOT' : label = 'ROOT' if head == child: @@ -397,7 +397,9 @@ cdef class ArcEager(TransitionSystem): self.strings[state.safe_get(i).dep])) else: predicted.add((i, state.H(i), 'ROOT')) - id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] + id_ = gold.orig.ids[gold.cand_to_gold[i]] + head = gold.orig.heads[gold.cand_to_gold[i]] + dep = gold.orig.deps[gold.cand_to_gold[i]] truth.add((id_, head, dep)) return truth == predicted diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 9f8ad418c..d791534ee 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -72,9 +72,9 @@ cdef class BiluoPushDown(TransitionSystem): for action in (BEGIN, IN, LAST, UNIT): actions[action][entity_type] = 1 moves = ('M', 'B', 'I', 'L', 'U') - for raw_text, sents in kwargs.get('gold_parses', []): - for (ids, words, tags, heads, labels, biluo), _ in sents: - for i, ner_tag in enumerate(biluo): + for example in kwargs.get('gold_parses', []): + for token_annotation in example.token_annotations: + for i, ner_tag in enumerate(token_annotation.entities): if ner_tag != 'O' and ner_tag != '-': _, label = ner_tag.split('-', 1) for action in (BEGIN, IN, LAST, UNIT): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 
0ed7e6952..8fec87c50 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -27,6 +27,7 @@ from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec import srsly +from spacy.gold import Example from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss @@ -193,7 +194,7 @@ cdef class Parser: # Defined in subclasses, to avoid circular import raise NotImplementedError - def init_multitask_objectives(self, get_gold_tuples, pipeline, **cfg): + def init_multitask_objectives(self, get_examples, pipeline, **cfg): '''Setup models for secondary objectives, to benefit from multi-task learning. This method is intended to be overridden by subclasses. @@ -203,9 +204,9 @@ cdef class Parser: ''' pass - def preprocess_gold(self, docs_golds): - for doc, gold in docs_golds: - yield doc, gold + def preprocess_gold(self, examples): + for ex in examples: + yield ex def use_params(self, params): # Can't decorate cdef class :(. Workaround. @@ -411,35 +412,31 @@ cdef class Parser: beam.check_done(_beam_utils.check_final_state, NULL) return [b for b in beams if not b.is_done] - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., sgd=None, losses=None): self.require_model() - if isinstance(docs, Doc) and isinstance(golds, GoldParse): - docs = [docs] - golds = [golds] - if len(docs) != len(golds): - raise ValueError(Errors.E077.format(value='update', n_docs=len(docs), - n_golds=len(golds))) + examples = Example.to_example_objects(examples) + if losses is None: losses = {} losses.setdefault(self.name, 0.) 
for multitask in self._multitasks: - multitask.update(docs, golds, drop=drop, sgd=sgd) + multitask.update(examples, drop=drop, sgd=sgd) # The probability we use beam update, instead of falling back to # a greedy update beam_update_prob = self.cfg.get('beam_update_prob', 0.5) if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob: - return self.update_beam(docs, golds, self.cfg.get('beam_width', 1), + return self.update_beam(examples, self.cfg.get('beam_width', 1), drop=drop, sgd=sgd, losses=losses, beam_density=self.cfg.get('beam_density', 0.001)) # Chop sequences into lengths of this many transitions, to make the # batch uniform length. cut_gold = numpy.random.choice(range(20, 100)) - states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold) + states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold) states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final() and g is not None] # Prepare the stepwise model, and get the callback for finishing the batch - model, finish_update = self.model.begin_update(docs, drop=drop) + model, finish_update = self.model.begin_update([ex.doc for ex in examples], drop=drop) for _ in range(max_steps): if not states_golds: break @@ -454,19 +451,19 @@ cdef class Parser: finish_update(golds, sgd=sgd) return losses - def rehearse(self, docs, sgd=None, losses=None, **cfg): + def rehearse(self, examples, sgd=None, losses=None, **cfg): """Perform a "rehearsal" update, to prevent catastrophic forgetting.""" - if isinstance(docs, Doc): - docs = [docs] + examples = Example.to_example_objects(examples) if losses is None: losses = {} for multitask in self._multitasks: if hasattr(multitask, 'rehearse'): - multitask.rehearse(docs, losses=losses, sgd=sgd) + multitask.rehearse(examples, losses=losses, sgd=sgd) if self._rehearsal_model is None: return None losses.setdefault(self.name, 0.) 
+ docs = [ex.doc for ex in examples] states = self.moves.init_batch(docs) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to @@ -494,15 +491,20 @@ cdef class Parser: losses[self.name] += loss / n_scores return losses - def update_beam(self, docs, golds, width, drop=0., sgd=None, losses=None, + def update_beam(self, examples, width, drop=0., sgd=None, losses=None, beam_density=0.0): + examples = Example.to_example_objects(examples) + docs = [ex.doc for ex in examples] + golds = [ex.gold for ex in examples] + new_golds = [] lengths = [len(d) for d in docs] states = self.moves.init_batch(docs) for gold in golds: self.moves.preprocess_gold(gold) + new_golds.append(gold) model, finish_update = self.model.begin_update(docs, drop=drop) states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, self.nr_feature, 10000, states, golds, model.state2vec, + self.moves, self.nr_feature, 10000, states, new_golds, model.state2vec, model.vec2scores, width, drop=drop, losses=losses, beam_density=beam_density) for i, d_scores in enumerate(states_d_scores): @@ -522,7 +524,7 @@ cdef class Parser: for beam in beams: _beam_utils.cleanup_beam(beam) - def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500): + def _init_gold_batch(self, whole_examples, min_length=5, max_length=500): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. Let's say we have a doc of length 2*N, where N is the shortest doc. 
We'll make two states, one representing @@ -530,6 +532,8 @@ cdef class Parser: cdef: StateClass state Transition action + whole_docs = [ex.doc for ex in whole_examples] + whole_golds = [ex.gold for ex in whole_examples] whole_states = self.moves.init_batch(whole_docs) max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs]))) max_moves = 0 @@ -592,14 +596,14 @@ cdef class Parser: return create_default_optimizer(self.model.ops, **self.cfg.get('optimizer', {})) - def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): + def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg): if 'model' in cfg: self.model = cfg['model'] - if not hasattr(get_gold_tuples, '__call__'): - gold_tuples = get_gold_tuples - get_gold_tuples = lambda: gold_tuples + if not hasattr(get_examples, '__call__'): + gold_tuples = get_examples + get_examples = lambda: gold_tuples cfg.setdefault('min_action_freq', 30) - actions = self.moves.get_actions(gold_parses=get_gold_tuples(), + actions = self.moves.get_actions(gold_parses=get_examples(), min_freq=cfg.get('min_action_freq', 30), learn_tokens=self.cfg.get("learn_tokens", False)) for action, labels in self.moves.labels.items(): @@ -615,15 +619,14 @@ cdef class Parser: sgd = self.create_optimizer() doc_sample = [] gold_sample = [] - for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): - for annots, brackets in annots_brackets: - ids, words, tags, heads, deps, ents = annots - doc_sample.append(Doc(self.vocab, words=words)) - gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags, - heads=heads, deps=deps, entities=ents)) + for example in islice(get_examples(), 1000): + parses = example.get_gold_parses(merge=False, vocab=self.vocab) + for doc, gold in parses: + doc_sample.append(doc) + gold_sample.append(gold) self.model.begin_training(doc_sample, gold_sample) if pipeline is not None: - self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) + 
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) else: if sgd is None: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 53e8a9cfe..c7ed25948 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -9,6 +9,7 @@ from __future__ import unicode_literals from copy import copy +from spacy.gold import Example from ..tokens.doc cimport Doc, set_children_from_heads from ..errors import Errors @@ -77,39 +78,42 @@ def decompose(label): def is_decorated(label): return DELIMITER in label -def count_decorated_labels(gold_tuples): +def count_decorated_labels(gold_data): freqs = {} - for raw_text, sents in gold_tuples: - for (ids, words, tags, heads, labels, iob), ctnts in sents: - proj_heads, deco_labels = projectivize(heads, labels) + for example in gold_data: + for token_annotation in example.token_annotations: + proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps) # set the label to ROOT for each root dependent - deco_labels = ['ROOT' if head == i else deco_labels[i] + deco_deps = ['ROOT' if head == i else deco_deps[i] for i, head in enumerate(proj_heads)] # count label frequencies - for label in deco_labels: + for label in deco_deps: if is_decorated(label): freqs[label] = freqs.get(label, 0) + 1 return freqs -def preprocess_training_data(gold_tuples, label_freq_cutoff=30): +def preprocess_training_data(gold_data, label_freq_cutoff=30): preprocessed = [] freqs = {} - for raw_text, sents in gold_tuples: - prepro_sents = [] - for (ids, words, tags, heads, labels, iob), ctnts in sents: - proj_heads, deco_labels = projectivize(heads, labels) + for example in gold_data: + new_example = Example(doc=example.doc) + for token_annotation in example.token_annotations: + proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps) # set the label to ROOT for each root dependent - deco_labels = ['ROOT' if head == i else deco_labels[i] + 
deco_deps = ['ROOT' if head == i else deco_deps[i] for i, head in enumerate(proj_heads)] # count label frequencies if label_freq_cutoff > 0: - for label in deco_labels: + for label in deco_deps: if is_decorated(label): freqs[label] = freqs.get(label, 0) + 1 - prepro_sents.append( - ((ids, words, tags, proj_heads, deco_labels, iob), ctnts)) - preprocessed.append((raw_text, prepro_sents)) + # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ? + proj_token_dict = token_annotation.to_dict() + proj_token_dict["heads"] = proj_heads + proj_token_dict["deps"] = deco_deps + new_example.add_token_annotation(**proj_token_dict) + preprocessed.append(new_example) if label_freq_cutoff > 0: return _filter_labels(preprocessed, label_freq_cutoff, freqs) return preprocessed @@ -203,20 +207,21 @@ def _find_new_head(token, headlabel): return token.head -def _filter_labels(gold_tuples, cutoff, freqs): +def _filter_labels(examples, cutoff, freqs): # throw away infrequent decorated labels # can't learn them reliably anyway and keeps label set smaller filtered = [] - for raw_text, sents in gold_tuples: - filtered_sents = [] - for (ids, words, tags, heads, labels, iob), ctnts in sents: + for example in examples: + new_example = Example(doc=example.doc) + for token_annotation in example.token_annotations: filtered_labels = [] - for label in labels: + for label in token_annotation.deps: if is_decorated(label) and freqs.get(label, 0) < cutoff: filtered_labels.append(decompose(label)[0]) else: filtered_labels.append(label) - filtered_sents.append( - ((ids, words, tags, heads, filtered_labels, iob), ctnts)) - filtered.append((raw_text, filtered_sents)) + filtered_token_dict = token_annotation.to_dict() + filtered_token_dict["deps"] = filtered_labels + new_example.add_token_annotation(**filtered_token_dict) + filtered.append(new_example) return filtered diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 
4ab9c1e70..bee9db82e 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -37,7 +37,7 @@ def _train_parser(parser): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) - parser.update([doc], [gold], sgd=sgd, losses=losses) + parser.update((doc, gold), sgd=sgd, losses=losses) return parser @@ -51,7 +51,7 @@ def test_add_label(parser): gold = GoldParse( doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"] ) - parser.update([doc], [gold], sgd=sgd, losses=losses) + parser.update((doc, gold), sgd=sgd, losses=losses) doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert doc[0].dep_ == "right" diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 41b7a4861..0d9bd1ad0 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -130,18 +130,25 @@ annot_tuples = [ def test_get_oracle_actions(): + ids, words, tags, heads, deps, ents = [], [], [], [], [], [] + for id_, word, tag, head, dep, ent in annot_tuples: + ids.append(id_) + words.append(word) + tags.append(tag) + heads.append(head) + deps.append(dep) + ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) parser = DependencyParser(doc.vocab) parser.moves.add_action(0, "") parser.moves.add_action(1, "") parser.moves.add_action(1, "") parser.moves.add_action(4, "ROOT") - for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples): + for i, (head, dep) in enumerate(zip(heads, deps)): if head > i: parser.moves.add_action(2, dep) elif head < i: parser.moves.add_action(3, dep) - ids, words, tags, heads, deps, ents = zip(*annot_tuples) heads, deps = projectivize(heads, deps) gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps) parser.moves.preprocess_gold(gold) diff --git a/spacy/tests/parser/test_neural_parser.py 
b/spacy/tests/parser/test_neural_parser.py index 062c76ae3..468b3ff40 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -67,7 +67,7 @@ def test_update_doc(parser, model, doc, gold): def optimize(weights, gradient, key=None): weights -= 0.001 * gradient - parser.update([doc], [gold], sgd=optimize) + parser.update((doc, gold), sgd=optimize) @pytest.mark.xfail @@ -83,4 +83,4 @@ def test_update_doc_beam(parser, model, doc, gold): def optimize(weights, gradient, key=None): weights -= 0.001 * gradient - parser.update_beam([doc], [gold], sgd=optimize) + parser.update_beam((doc, gold), sgd=optimize) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 70beb2f60..d935494d6 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -30,7 +30,7 @@ def parser(vocab): losses = {} doc = Doc(vocab, words=["a", "b", "c", "d"]) gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) - parser.update([doc], [gold], sgd=sgd, losses=losses) + parser.update((doc, gold), sgd=sgd, losses=losses) return parser diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index ef70dc013..e967fffaf 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -24,7 +24,7 @@ def test_simple_train(): ("bbbbbbbbb", 0.0), ("aaaaaa", 1), ]: - nlp.update([text], [{"cats": {"answer": answer}}]) + nlp.update((text, {"cats": {"answer": answer}})) doc = nlp("aaa") assert "answer" in doc.cats assert doc.cats["answer"] >= 0.5 diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 6d88d68c2..61d2c9cd2 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -451,7 +451,7 @@ def test_issue999(train_data): for itn in range(100): random.shuffle(TRAIN_DATA) for raw_text, entity_offsets 
in TRAIN_DATA: - nlp.update([raw_text], [{"entities": entity_offsets}]) + nlp.update((raw_text, {"entities": entity_offsets})) with make_tempdir() as model_dir: nlp.to_disk(model_dir) diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index e498417d1..ace25f8cc 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -5,6 +5,8 @@ import pytest import gc import numpy import copy + +from spacy.gold import Example from spacy.lang.en import English from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.lex_attrs import is_stop @@ -270,9 +272,9 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): ner = EntityRecognizer(Vocab()) - entry = ([0], ["word"], ["tag"], [0], ["dep"], [label]) - gold_parses = [(None, [(entry, None)])] - ner.moves.get_actions(gold_parses=gold_parses) + example = Example(doc=None) + example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]) + ner.moves.get_actions(gold_parses=[example]) def test_issue1971(en_vocab): diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 73ff7376a..0acb25e90 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -157,7 +157,7 @@ def test_issue2800(): losses = {} random.shuffle(train_data) for statement, entities in train_data: - nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5) + nlp.update((statement, entities), sgd=optimizer, losses=losses, drop=0.5) def test_issue2822(it_tokenizer): diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py index 3c4836264..bc8603888 100644 --- a/spacy/tests/regression/test_issue3611.py +++ b/spacy/tests/regression/test_issue3611.py @@ -41,10 +41,8 @@ def 
test_issue3611(): batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) nlp.update( - docs=texts, - golds=annotations, + examples=batch, sgd=optimizer, drop=0.1, losses=losses, diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py index ed219573f..e774feb2d 100644 --- a/spacy/tests/regression/test_issue4030.py +++ b/spacy/tests/regression/test_issue4030.py @@ -41,10 +41,8 @@ def test_issue4030(): batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) nlp.update( - docs=texts, - golds=annotations, + examples=batch, sgd=optimizer, drop=0.1, losses=losses, diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py index 9391c3529..b0583f717 100644 --- a/spacy/tests/regression/test_issue4348.py +++ b/spacy/tests/regression/test_issue4348.py @@ -19,5 +19,4 @@ def test_issue4348(): losses = {} batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, losses=losses) + nlp.update(batch, sgd=optimizer, losses=losses) diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index 2e1b69000..bf103a389 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -11,15 +11,14 @@ from spacy.tests.util import make_tempdir def test_issue4402(): nlp = English() with make_tempdir() as tmpdir: - print("temp", tmpdir) json_path = tmpdir / "test4402.json" srsly.write_json(json_path, json_data) corpus = GoldCorpus(str(json_path), str(json_path)) - train_docs = list(corpus.train_docs(nlp, gold_preproc=True, max_length=0)) + train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0)) # assert that the data got split into 4 sentences - assert len(train_docs) 
== 4 + assert len(train_data) == 4 json_data = [ diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 731a1b5c2..c1bdfcc4d 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags +from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo from spacy.gold import GoldCorpus, docs_to_json, align from spacy.lang.en import English from spacy.tokens import Doc +from spacy.util import compounding, minibatch from .util import make_tempdir import pytest import srsly @@ -119,12 +120,13 @@ def test_roundtrip_docs_to_json(): with make_tempdir() as tmpdir: json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(json_file), str(json_file)) + goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) - reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp)) + reloaded_example = next(goldcorpus.train_dataset(nlp)) + goldparse = reloaded_example.gold assert len(doc) == goldcorpus.count_train() - assert text == reloaded_doc.text + assert text == reloaded_example.text assert tags == goldparse.tags assert deps == goldparse.labels assert heads == goldparse.heads @@ -140,10 +142,11 @@ def test_roundtrip_docs_to_json(): srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp)) + reloaded_example = next(goldcorpus.train_dataset(nlp)) + goldparse = reloaded_example.gold assert len(doc) == goldcorpus.count_train() - assert text == reloaded_doc.text + assert text == reloaded_example.text assert tags == goldparse.tags assert deps == goldparse.labels assert heads == goldparse.heads @@ -160,13 +163,14 @@ def 
test_roundtrip_docs_to_json(): srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) # load and rewrite as JSONL tuples - srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples) + srsly.write_jsonl(jsonl_file, goldcorpus.train_examples) goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp)) + reloaded_example = next(goldcorpus.train_dataset(nlp)) + goldparse = reloaded_example.gold assert len(doc) == goldcorpus.count_train() - assert text == reloaded_doc.text + assert text == reloaded_example.text assert tags == goldparse.tags assert deps == goldparse.labels assert heads == goldparse.heads @@ -217,3 +221,144 @@ def test_goldparse_startswith_space(en_tokenizer): assert g.words == [" ", "a"] assert g.ner == [None, "U-DATE"] assert g.labels == [None, "ROOT"] + + +def test_gold_constructor(): + """Test that the GoldParse constructor works fine""" + nlp = English() + doc = nlp("This is a sentence") + gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0}) + + assert gold.cats["cat1"] + assert not gold.cats["cat2"] + assert gold.words == ["This", "is", "a", "sentence"] + + +def test_gold_orig_annot(): + nlp = English() + doc = nlp("This is a sentence") + gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0}) + + assert gold.orig.words == ["This", "is", "a", "sentence"] + assert gold.cats["cat1"] + + doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0}) + gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig) + assert gold2.orig.words == ["This", "is", "a", "sentence"] + assert not gold2.cats["cat1"] + + +def test_tuple_format_implicit(): + """Test tuple format with implicit GoldParse creation""" + + train_data = [ + ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}), + ( + "Spotify steps up Asia expansion", + {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]}, + ), + ("Google rebrands its business apps", {"entities": 
[(0, 6, "ORG")]}), + ] + + _train(train_data) + + +def test_tuple_format_implicit_invalid(): + """Test that an error is thrown for an implicit invalid GoldParse field""" + + train_data = [ + ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}), + ( + "Spotify steps up Asia expansion", + {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]}, + ), + ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}), + ] + + with pytest.raises(TypeError): + _train(train_data) + + +def _train(train_data): + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("ORG") + ner.add_label("LOC") + nlp.add_pipe(ner) + + optimizer = nlp.begin_training() + for i in range(5): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + + +tokens_1 = { + "ids": [1, 2, 3], + "words": ["Hi", "there", "everyone"], + "tags": ["INTJ", "ADV", "PRON"], +} + +tokens_2 = { + "ids": [1, 2, 3, 4], + "words": ["It", "is", "just", "me"], + "tags": ["PRON", "AUX", "ADV", "PRON"], +} + +text0 = "Hi there everyone It is just me" + + +def test_merge_sents(): + nlp = English() + example = Example() + example.add_token_annotation(**tokens_1) + example.add_token_annotation(**tokens_2) + assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2 + assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 # this shouldn't change the original object + + merged_example = example.merge_sents() + + token_annotation_1 = example.token_annotations[0] + assert token_annotation_1.ids == [1, 2, 3] + assert token_annotation_1.words == ["Hi", "there", "everyone"] + assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"] + + token_annotation_m = merged_example.token_annotations[0] + assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7] + assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"] + assert token_annotation_m.tags == ["INTJ", 
"ADV", "PRON", "PRON", "AUX", "ADV", "PRON"] + + +def test_tuples_to_example(): + ex = Example() + ex.add_token_annotation(**tokens_1) + ex.add_token_annotation(**tokens_2) + ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0}) + ex_dict = ex.to_dict() + + token_dicts = [ + { + "ids": [1, 2, 3], + "words": ["Hi", "there", "everyone"], + "tags": ["INTJ", "ADV", "PRON"], + "heads": [], + "deps": [], + "entities": [], + "morphology": [], + "brackets": [], + }, + { + "ids": [1, 2, 3, 4], + "words": ["It", "is", "just", "me"], + "tags": ["PRON", "AUX", "ADV", "PRON"], + "heads": [], + "deps": [], + "entities": [], + "morphology": [], + "brackets": [], + }, + ] + doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}} + + assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict} diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index d5398c145..2b0bcc15e 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -31,20 +31,20 @@ def test_language_update(nlp): doc = Doc(nlp.vocab, words=text.split(" ")) gold = GoldParse(doc, **annots) # Update with doc and gold objects - nlp.update([doc], [gold]) + nlp.update((doc, gold)) # Update with text and dict - nlp.update([text], [annots]) + nlp.update((text, annots)) # Update with doc object and dict - nlp.update([doc], [annots]) + nlp.update((doc, annots)) # Update with text and gold object - nlp.update([text], [gold]) + nlp.update((text, gold)) + # Update with empty doc and gold object + nlp.update((None, gold)) # Update badly - with pytest.raises(IndexError): - nlp.update([doc], []) - with pytest.raises(IndexError): - nlp.update([], [gold]) with pytest.raises(ValueError): - nlp.update([text], [wrongkeyannots]) + nlp.update((doc, None)) + with pytest.raises(TypeError): + nlp.update((text, wrongkeyannots)) def test_language_evaluate(nlp): diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index c59358a6b..e8d74c405 100644 --- 
a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx -from spacy.gold import GoldParse +from spacy.gold import Example, GoldParse from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc @@ -40,7 +40,7 @@ def test_las_per_type(en_vocab): deps=annot["deps"], ) gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) - scorer.score(doc, gold) + scorer.score((doc, gold)) results = scorer.scores assert results["uas"] == 100 @@ -63,7 +63,7 @@ def test_las_per_type(en_vocab): ) gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) doc[0].dep_ = "compound" - scorer.score(doc, gold) + scorer.score((doc, gold)) results = scorer.scores assert results["uas"] == 100 @@ -85,8 +85,9 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], ) - gold = GoldParse(doc, entities=annot["entities"]) - scorer.score(doc, gold) + ex = Example(doc=doc) + ex.add_token_annotation(entities=annot["entities"]) + scorer.score(ex) results = scorer.scores assert results["ents_p"] == 100 @@ -105,8 +106,9 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], ) - gold = GoldParse(doc, entities=annot["entities"]) - scorer.score(doc, gold) + ex = Example(doc=doc) + ex.add_token_annotation(entities=annot["entities"]) + scorer.score(ex) results = scorer.scores assert results["ents_p"] == approx(66.66666) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index b39bb1ecb..262f19941 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -158,7 +158,7 @@ cdef class Tokenizer: doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws return doc - def pipe(self, texts, batch_size=1000, n_threads=-1): + def pipe(self, texts, 
batch_size=1000, n_threads=-1, as_example=False): """Tokenize a stream of texts. texts: A sequence of unicode texts. diff --git a/spacy/util.py b/spacy/util.py index 74e4cc1c6..f9e51f7d5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -616,31 +616,25 @@ def decaying(start, stop, decay): curr -= decay -def minibatch_by_words(items, size, tuples=True, count_words=len): +def minibatch_by_words(examples, size, tuples=True, count_words=len): """Create minibatches of a given number of words.""" if isinstance(size, int): size_ = itertools.repeat(size) else: size_ = size - items = iter(items) + examples = iter(examples) while True: batch_size = next(size_) batch = [] while batch_size >= 0: try: - if tuples: - doc, gold = next(items) - else: - doc = next(items) + example = next(examples) except StopIteration: if batch: yield batch return - batch_size -= count_words(doc) - if tuples: - batch.append((doc, gold)) - else: - batch.append(doc) + batch_size -= count_words(example.doc) + batch.append(example) if batch: yield batch From 3ac4e8eb7a6c688ddc7abd205e2ed7060cbf0798 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 13 Nov 2019 15:25:03 +0100 Subject: [PATCH 002/187] Fix minor issues in debug-data (#4636) * Add error in debug-data if no dev docs are available (see #4575) * Update debug-data for GoldCorpus / Example * Ignore None label in misaligned NER data --- spacy/cli/debug_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 76276ee56..ed19703ac 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -121,6 +121,8 @@ def debug_data( msg.text("{} training docs".format(len(train_dataset))) msg.text("{} evaluation docs".format(len(gold_dev_data))) + if not len(gold_dev_data): + msg.fail("No evaluation docs") overlap = len(train_texts.intersection(dev_texts)) if overlap: msg.warn("{} training examples also in evaluation data".format(overlap)) @@ -181,7 +183,7 @@ def 
debug_data( if "ner" in pipeline: # Get all unique NER labels present in the data labels = set( - label for label in gold_train_data["ner"] if label not in ("O", "-") + label for label in gold_train_data["ner"] if label not in ("O", "-", None) ) label_counts = gold_train_data["ner"] model_labels = _get_labels_from_model(nlp, "ner") @@ -601,7 +603,7 @@ def _format_labels(labels, counts=False): def _get_examples_without_label(data, label): count = 0 for ex in data: - labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")] + labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)] if label not in labels: count += 1 return count From d67b0f196a2fc09479099a52d64462527c83a647 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 13 Nov 2019 21:22:18 +0100 Subject: [PATCH 003/187] Fix initialization of token mappings in new align (#4640) Initialize all values in `a2b` and `b2a` since `numpy.empty()` otherwise results in unspecified integers.
--- spacy/gold.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index ea3589ea5..d79bc8205 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -129,6 +129,8 @@ def align(tokens_a, tokens_b): cost = 0 a2b = numpy.empty(len(tokens_a), dtype="i") b2a = numpy.empty(len(tokens_b), dtype="i") + a2b.fill(-1) + b2a.fill(-1) a2b_multi = {} b2a_multi = {} i = 0 @@ -138,7 +140,6 @@ def align(tokens_a, tokens_b): while i < len(tokens_a) and j < len(tokens_b): a = tokens_a[i][offset_a:] b = tokens_b[j][offset_b:] - a2b[i] = b2a[j] = -1 if a == b: if offset_a == offset_b == 0: a2b[i] = j From faaa832518228f29e5351676400ed8688cc4482e Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 13 Nov 2019 21:24:35 +0100 Subject: [PATCH 004/187] Generalize handling of tokenizer special cases (#4259) * Generalize handling of tokenizer special cases Handle tokenizer special cases more generally by using the Matcher internally to match special cases after the affix/token_match tokenization is complete. Instead of only matching special cases while processing balanced or nearly balanced prefixes and suffixes, this recognizes special cases in a wider range of contexts: * Allows arbitrary numbers of prefixes/affixes around special cases * Allows special cases separated by infixes Existing tests/settings that couldn't be preserved as before: * The emoticon '")' is no longer a supported special case * The emoticon ':)' in "example:)" is a false positive again When merged with #4258 (or the relevant cache bugfix), the affix and token_match properties should be modified to flush and reload all special cases to use the updated internal tokenization with the Matcher. * Remove accidentally added test case * Really remove accidentally added test * Reload special cases when necessary Reload special cases when affixes or token_match are modified. Skip reloading during initialization. 
* Update error code number * Fix offset and whitespace in Matcher special cases * Fix offset bugs when merging and splitting tokens * Set final whitespace on final token in inserted special case * Improve cache flushing in tokenizer * Separate cache and specials memory (temporarily) * Flush cache when adding special cases * Repeated `self._cache = PreshMap()` and `self._specials = PreshMap()` are necessary due to this bug: https://github.com/explosion/preshed/issues/21 * Remove reinitialized PreshMaps on cache flush * Update UD bin scripts * Update imports for `bin/` * Add all currently supported languages * Update subtok merger for new Matcher validation * Modify blinded check to look at tokens instead of lemmas (for corpora with tokens but not lemmas like Telugu) * Use special Matcher only for cases with affixes * Reinsert specials cache checks during normal tokenization for special cases as much as possible * Additionally include specials cache checks while splitting on infixes * Since the special Matcher needs consistent affix-only tokenization for the special cases themselves, introduce the argument `with_special_cases` in order to do tokenization with or without specials cache checks * After normal tokenization, postprocess with special cases Matcher for special cases containing affixes * Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. 
* Restore support for pickling * Fix internal keyword add/remove for numpy arrays * Add test for #4248, clean up test * Improve efficiency of special cases handling * Use PhraseMatcher instead of Matcher * Improve efficiency of merging/splitting special cases in document * Process merge/splits in one pass without repeated token shifting * Merge in place if no splits * Update error message number * Remove UD script modifications Only used for timing/testing, should be a separate PR * Remove final traces of UD script modifications * Update UD bin scripts * Update imports for `bin/` * Add all currently supported languages * Update subtok merger for new Matcher validation * Modify blinded check to look at tokens instead of lemmas (for corpora with tokens but not lemmas like Telugu) * Add missing loop for match ID set in search loop * Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. * Replace dict trie with MapStruct trie * Fix how match ID hash is stored/added * Update fix for match ID vocab * Switch from map_get_unless_missing to map_get * Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) * Restructure imports to export find_matches * Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. 
* Switch to PhraseMatcher.find_matches * Switch to local cdef functions for span filtering * Switch special case reload threshold to variable Refer to variable instead of hard-coded threshold * Move more of special case retokenize to cdef nogil Move as much of the special case retokenization to nogil as possible. * Rewrap sort as stdsort for OS X * Rewrap stdsort with specific types * Switch to qsort * Fix merge * Improve cmp functions * Fix realloc * Fix realloc again * Initialize span struct while retokenizing * Temporarily skip retokenizing * Revert "Move more of special case retokenize to cdef nogil" This reverts commit 0b7e52c797cd8ff1548f214bd4186ebb3a7ce8b1. * Revert "Switch to qsort" This reverts commit a98d71a942fc9bca531cf5eb05cf89fa88153b60. * Fix specials check while caching * Modify URL test with emoticons The multiple suffix tests result in the emoticon `:>`, which is now retokenized into one token as a special case after the suffixes are split off. * Refactor _apply_special_cases() * Use cdef ints for span info used in multiple spots * Modify _filter_special_spans() to prefer earlier Parallel to #4414, modify _filter_special_spans() so that the earlier span is preferred for overlapping spans of the same length. * Replace MatchStruct with Entity Replace MatchStruct with Entity since the existing Entity struct is nearly identical. * Replace Entity with more general SpanC * Replace MatchStruct with SpanC * Add error in debug-data if no dev docs are available (see #4575) * Update azure-pipelines.yml * Revert "Update azure-pipelines.yml" This reverts commit ed1060cf59e5895b5fe92ad5b894fd1078ec4c49. 
* Use latest wasabi * Reorganise install_requires * add dframcy to universe.json (#4580) * Update universe.json [ci skip] * Fix multiprocessing for as_tuples=True (#4582) * Fix conllu script (#4579) * force extensions to avoid clash between example scripts * fix arg order and default file encoding * add example config for conllu script * newline * move extension definitions to main function * few more encodings fixes * Add load_from_docbin example [ci skip] TODO: upload the file somewhere * Update README.md * Add warnings about 3.8 (resolves #4593) [ci skip] * Fixed typo: Added space between "recognize" and "various" (#4600) * Fix DocBin.merge() example (#4599) * Replace function registries with catalogue (#4584) * Replace functions registries with catalogue * Update __init__.py * Fix test * Revert unrelated flag [ci skip] * Bugfix/dep matcher issue 4590 (#4601) * add contributor agreement for prilopes * add test for issue #4590 * fix on_match params for DependencyMatcher (#4590) * Minor updates to language example sentences (#4608) * Add punctuation to Spanish example sentences * Combine multilanguage examples for lang xx * Add punctuation to nb examples * Always realloc to a larger size Avoid potential (unlikely) edge case and cymem error seen in #4604.
* Add error in debug-data if no dev docs are available (see #4575) * Update debug-data for GoldCorpus / Example * Ignore None label in misaligned NER data --- .github/contributors/prilopes.md | 106 ++++++++ README.md | 10 +- bin/ud/ud_train.py | 13 +- examples/load_from_docbin.py | 45 +++ examples/training/conllu-config.json | 1 + examples/training/conllu.py | 10 +- requirements.txt | 4 +- setup.cfg | 12 +- spacy/__init__.py | 2 +- spacy/__main__.py | 4 +- spacy/cli/download.py | 5 +- spacy/cli/evaluate.py | 3 +- spacy/cli/info.py | 3 +- spacy/cli/init_model.py | 3 +- spacy/cli/link.py | 3 +- spacy/cli/package.py | 3 +- spacy/cli/pretrain.py | 3 +- spacy/cli/profile.py | 3 +- spacy/cli/train.py | 3 +- spacy/cli/validate.py | 3 +- spacy/compat.py | 5 - spacy/displacy/render.py | 4 +- spacy/errors.py | 3 + spacy/lang/es/examples.py | 16 +- spacy/lang/nb/examples.py | 6 +- spacy/lang/tokenizer_exceptions.py | 1 - spacy/lang/xx/examples.py | 99 +++++++ spacy/language.py | 7 +- spacy/matcher/dependencymatcher.pyx | 2 +- spacy/ml/common.py | 6 +- spacy/ml/tok2vec.py | 18 +- spacy/tests/regression/test_issue4590.py | 34 +++ spacy/tests/test_architectures.py | 19 ++ spacy/tests/test_register_architecture.py | 19 -- spacy/tests/tokenizer/test_exceptions.py | 9 +- spacy/tests/tokenizer/test_tokenizer.py | 21 ++ spacy/tests/tokenizer/test_urls.py | 15 +- spacy/tokenizer.pxd | 34 ++- spacy/tokenizer.pyx | 316 ++++++++++++++++++---- spacy/util.py | 113 ++------ website/docs/api/docbin.md | 4 +- website/docs/usage/101/_named-entities.md | 2 +- website/docs/usage/index.md | 11 + website/meta/universe.json | 24 ++ 44 files changed, 754 insertions(+), 273 deletions(-) create mode 100644 .github/contributors/prilopes.md create mode 100644 examples/load_from_docbin.py create mode 100644 examples/training/conllu-config.json create mode 100644 spacy/lang/xx/examples.py create mode 100644 spacy/tests/regression/test_issue4590.py create mode 100644 spacy/tests/test_architectures.py 
delete mode 100644 spacy/tests/test_register_architecture.py diff --git a/.github/contributors/prilopes.md b/.github/contributors/prilopes.md new file mode 100644 index 000000000..ad111d4de --- /dev/null +++ b/.github/contributors/prilopes.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Priscilla Lopes | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-11-06 | +| GitHub username | prilopes | +| Website (optional) | | diff --git a/README.md b/README.md index 99d66bb31..980fc5b0b 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,13 @@ For detailed installation instructions, see the [pip]: https://pypi.org/project/spacy/ [conda]: https://anaconda.org/conda-forge/spacy +> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary +> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI +> providers and other tooling to support it. This means that in order to run +> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile +> the library and its Cython dependencies locally. If this is causing problems +> for you, the easiest solution is to **use Python 3.7** in the meantime. + ### pip Using pip, spaCy releases are available as source packages and binary wheels (as @@ -180,9 +187,6 @@ pointing pip to a path or URL. 
# download best-matching version of specific model for your spaCy installation python -m spacy download en_core_web_sm -# out-of-the-box: download best-matching default model -python -m spacy download en - # pip install .tar.gz archive from path or URL pip install /Users/you/en_core_web_sm-2.2.0.tar.gz pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index b6a44b861..75bf55771 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -323,11 +323,6 @@ def get_token_conllu(token, i): return "\n".join(lines) -Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True) -Token.set_extension("begins_fused", default=False, force=True) -Token.set_extension("inside_fused", default=False, force=True) - - ################## # Initialization # ################## @@ -460,13 +455,13 @@ class TreebankPaths(object): @plac.annotations( ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), + parses_dir=("Directory to write the development parses", "positional", None, Path), corpus=( - "UD corpus to train and evaluate on, e.g. en, es_ancora, etc", + "UD corpus to train and evaluate on, e.g. 
UD_Spanish-AnCora", "positional", None, str, ), - parses_dir=("Directory to write the development parses", "positional", None, Path), config=("Path to json formatted config file", "option", "C", Path), limit=("Size limit", "option", "n", int), gpu_device=("Use GPU", "option", "g", int), @@ -491,6 +486,10 @@ def main( # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm + Token.set_extension("get_conllu_lines", method=get_token_conllu) + Token.set_extension("begins_fused", default=False) + Token.set_extension("inside_fused", default=False) + spacy.util.fix_random_seed() lang.zh.Chinese.Defaults.use_jieba = False lang.ja.Japanese.Defaults.use_janome = False diff --git a/examples/load_from_docbin.py b/examples/load_from_docbin.py new file mode 100644 index 000000000..f26e7fc49 --- /dev/null +++ b/examples/load_from_docbin.py @@ -0,0 +1,45 @@ +# coding: utf-8 +""" +Example of loading previously parsed text using spaCy's DocBin class. The example +performs an entity count to show that the annotations are available. 
+For more details, see https://spacy.io/usage/saving-loading#docs +Installation: +python -m spacy download en_core_web_lg +Usage: +python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy +""" +from __future__ import unicode_literals + +import spacy +from spacy.tokens import DocBin +from timeit import default_timer as timer +from collections import Counter + +EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy" + + +def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH): + nlp = spacy.load(model) + print("Reading data from {}".format(docbin_path)) + with open(docbin_path, "rb") as file_: + bytes_data = file_.read() + nr_word = 0 + start_time = timer() + entities = Counter() + docbin = DocBin().from_bytes(bytes_data) + for doc in docbin.get_docs(nlp.vocab): + nr_word += len(doc) + entities.update((e.label_, e.text) for e in doc.ents) + end_time = timer() + msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)" + wps = nr_word / (end_time - start_time) + print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps)) + print("Most common entities:") + for (label, entity), freq in entities.most_common(30): + print(freq, entity, label) + + +if __name__ == "__main__": + import plac + + plac.call(main) diff --git a/examples/training/conllu-config.json b/examples/training/conllu-config.json new file mode 100644 index 000000000..9a11dd96b --- /dev/null +++ b/examples/training/conllu-config.json @@ -0,0 +1 @@ +{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0} diff --git a/examples/training/conllu.py b/examples/training/conllu.py index ba3cf450c..08febda50 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -383,20 +383,24 @@ class TreebankPaths(object): @plac.annotations( ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), + parses_dir=("Directory to write the development parses", "positional", None, Path), + config=("Path 
to json formatted config file", "positional", None, Config.load), corpus=( - "UD corpus to train and evaluate on, e.g. en, es_ancora, etc", + "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", "positional", None, str, ), - parses_dir=("Directory to write the development parses", "positional", None, Path), - config=("Path to json formatted config file", "positional", None, Config.load), limit=("Size limit", "option", "n", int), ) def main(ud_dir, parses_dir, config, corpus, limit=0): # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm + Token.set_extension("get_conllu_lines", method=get_token_conllu) + Token.set_extension("begins_fused", default=False) + Token.set_extension("inside_fused", default=False) + paths = TreebankPaths(ud_dir, corpus) if not (parses_dir / corpus).exists(): (parses_dir / corpus).mkdir() diff --git a/requirements.txt b/requirements.txt index ad7059f3a..12f19bb88 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,14 +4,14 @@ preshed>=3.0.2,<3.1.0 thinc>=7.3.0,<7.4.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.3.0,<1.1.0 +wasabi>=0.4.0,<1.1.0 srsly>=0.1.0,<1.1.0 +catalogue>=0.0.7,<1.1.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 pathlib==1.0.1; python_version < "3.4" -importlib_metadata>=0.20; python_version < "3.8" # Optional dependencies jsonschema>=2.6.0,<3.1.0 # Development dependencies diff --git a/setup.cfg b/setup.cfg index 51e722354..940066a9e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,19 +40,21 @@ setup_requires = murmurhash>=0.28.0,<1.1.0 thinc>=7.3.0,<7.4.0 install_requires = - setuptools - numpy>=1.15.0 + # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=7.3.0,<7.4.0 blis>=0.4.0,<0.5.0 + wasabi>=0.4.0,<1.1.0 + srsly>=0.1.0,<1.1.0 + catalogue>=0.0.7,<1.1.0 + # Third-party dependencies + setuptools + numpy>=1.15.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 - 
wasabi>=0.3.0,<1.1.0 - srsly>=0.1.0,<1.1.0 pathlib==1.0.1; python_version < "3.4" - importlib_metadata>=0.20; python_version < "3.8" [options.extras_require] lookups = diff --git a/spacy/__init__.py b/spacy/__init__.py index 57701179f..4a0d16a49 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -15,7 +15,7 @@ from .glossary import explain from .about import __version__ from .errors import Errors, Warnings, deprecation_warning from . import util -from .util import register_architecture, get_architecture +from .util import registry from .language import component diff --git a/spacy/__main__.py b/spacy/__main__.py index 716561566..2c285095e 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -7,12 +7,10 @@ from __future__ import print_function if __name__ == "__main__": import plac import sys - from wasabi import Printer + from wasabi import msg from spacy.cli import download, link, info, package, train, pretrain, convert from spacy.cli import init_model, profile, evaluate, validate, debug_data - msg = Printer() - commands = { "download": download, "link": link, diff --git a/spacy/cli/download.py b/spacy/cli/download.py index c57e2364b..19f3e7860 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -6,16 +6,13 @@ import requests import os import subprocess import sys -from wasabi import Printer +from wasabi import msg from .link import link from ..util import get_package_path from .. 
import about -msg = Printer() - - @plac.annotations( model=("Model to download (shortcut or name)", "positional", None, str), direct=("Force direct download of name + version", "flag", "d", bool), diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index e5b2d0f02..a3193a5cf 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function import plac from timeit import default_timer as timer -from wasabi import Printer +from wasabi import msg from ..gold import GoldCorpus from .. import util @@ -32,7 +32,6 @@ def evaluate( Evaluate a model. To render a sample of parses in a HTML file, set an output directory as the displacy_path argument. """ - msg = Printer() util.fix_random_seed() if gpu_id >= 0: util.use_gpu(gpu_id) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 3655327ef..080d0dc77 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import plac import platform from pathlib import Path -from wasabi import Printer +from wasabi import msg import srsly from ..compat import path2str, basestring_, unicode_ @@ -23,7 +23,6 @@ def info(model=None, markdown=False, silent=False): speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. 
""" - msg = Printer() if model: if util.is_package(model): model_path = util.get_package_path(model) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index c285a12a6..cda21cbcc 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -11,7 +11,7 @@ import tarfile import gzip import zipfile import srsly -from wasabi import Printer +from wasabi import msg from ..vectors import Vectors from ..errors import Errors, Warnings, user_warning @@ -24,7 +24,6 @@ except ImportError: DEFAULT_OOV_PROB = -20 -msg = Printer() @plac.annotations( diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 6b719ffe6..8117829b5 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import plac from pathlib import Path -from wasabi import Printer +from wasabi import msg from ..compat import symlink_to, path2str from .. import util @@ -20,7 +20,6 @@ def link(origin, link_name, force=False, model_path=None): either the name of a pip package, or the local path to the model data directory. Linking models allows loading them via spacy.load(link_name). """ - msg = Printer() if util.is_package(origin): model_path = util.get_package_path(origin) else: diff --git a/spacy/cli/package.py b/spacy/cli/package.py index e99a6d5ff..8ed92259c 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import plac import shutil from pathlib import Path -from wasabi import Printer, get_raw_input +from wasabi import msg, get_raw_input import srsly from ..compat import path2str @@ -27,7 +27,6 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals set and a meta.json already exists in the output directory, the existing values will be used as the defaults in the command-line prompt. 
""" - msg = Printer() input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) meta_path = util.ensure_path(meta_path) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 59269cb85..68038bc5c 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -11,7 +11,7 @@ from pathlib import Path from thinc.v2v import Affine, Maxout from thinc.misc import LayerNorm as LN from thinc.neural.util import prefer_gpu -from wasabi import Printer +from wasabi import msg import srsly from spacy.gold import Example @@ -123,7 +123,6 @@ def pretrain( for key in config: if isinstance(config[key], Path): config[key] = str(config[key]) - msg = Printer() util.fix_random_seed(seed) has_gpu = prefer_gpu() diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 201ab13d5..4995224f3 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -9,7 +9,7 @@ import pstats import sys import itertools import thinc.extra.datasets -from wasabi import Printer +from wasabi import msg from ..util import load_model @@ -26,7 +26,6 @@ def profile(model, inputs=None, n_texts=10000): It can either be provided as a JSONL file, or be read from sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. 
""" - msg = Printer() if inputs is not None: inputs = _read_inputs(inputs, msg) if inputs is None: diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 24255437c..622a9ca97 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -8,7 +8,7 @@ from thinc.neural._classes.model import Model from timeit import default_timer as timer import shutil import srsly -from wasabi import Printer +from wasabi import msg import contextlib import random @@ -89,7 +89,6 @@ def train( # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm - msg = Printer() util.fix_random_seed() util.set_env_log(verbose) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 38f8d2313..93abad6f6 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -5,7 +5,7 @@ from pathlib import Path import sys import requests import srsly -from wasabi import Printer +from wasabi import msg from ..compat import path2str from ..util import get_data_path @@ -17,7 +17,6 @@ def validate(): Validate that the currently installed version of spaCy is compatible with the installed models. Should be run after `pip install -U spacy`. 
""" - msg = Printer() with msg.loading("Loading compatibility table..."): r = requests.get(about.__compatibility__) if r.status_code != 200: diff --git a/spacy/compat.py b/spacy/compat.py index 5bff28815..0ea31c6b3 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -36,11 +36,6 @@ try: except ImportError: cupy = None -try: # Python 3.8 - import importlib.metadata as importlib_metadata -except ImportError: - import importlib_metadata # noqa: F401 - try: from thinc.neural.optimizers import Optimizer # noqa: F401 except ImportError: diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 17b67940a..d6e33437b 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -5,7 +5,7 @@ import uuid from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE -from ..util import minify_html, escape_html, get_entry_points, ENTRY_POINTS +from ..util import minify_html, escape_html, registry from ..errors import Errors @@ -242,7 +242,7 @@ class EntityRenderer(object): "CARDINAL": "#e4e7d2", "PERCENT": "#e4e7d2", } - user_colors = get_entry_points(ENTRY_POINTS.displacy_colors) + user_colors = registry.displacy_colors.get_all() for user_color in user_colors.values(): colors.update(user_color) colors.update(options.get("colors", {})) diff --git a/spacy/errors.py b/spacy/errors.py index d2898cf53..0b6a6775c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -529,6 +529,9 @@ class Errors(object): E185 = ("Received invalid attribute in component attribute declaration: " "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.") E186 = ("'{tok_a}' and '{tok_b}' are different texts.") + E187 = ("Tokenizer special cases are not allowed to modify the text. 
" + "This would map '{chunk}' to '{orth}' given token attributes " + "'{token_attrs}'.") # TODO: fix numbering after merging develop into master E998 = ("Can only create GoldParse's from Example's without a Doc, " diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py index 96ff9c1ed..0e31b56af 100644 --- a/spacy/lang/es/examples.py +++ b/spacy/lang/es/examples.py @@ -11,12 +11,12 @@ Example sentences to test spaCy and its language models. sentences = [ - "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares", - "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes", - "San Francisco analiza prohibir los robots delivery", - "Londres es una gran ciudad del Reino Unido", - "El gato come pescado", - "Veo al hombre con el telescopio", - "La araña come moscas", - "El pingüino incuba en su nido", + "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.", + "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.", + "San Francisco analiza prohibir los robots delivery.", + "Londres es una gran ciudad del Reino Unido.", + "El gato come pescado.", + "Veo al hombre con el telescopio.", + "La araña come moscas.", + "El pingüino incuba en su nido.", ] diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py index 72d6b5a71..c15426ded 100644 --- a/spacy/lang/nb/examples.py +++ b/spacy/lang/nb/examples.py @@ -11,8 +11,8 @@ Example sentences to test spaCy and its language models. 
sentences = [ - "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar", - "Selvkjørende biler flytter forsikringsansvaret over på produsentene ", - "San Francisco vurderer å forby robotbud på fortauene", + "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.", + "Selvkjørende biler flytter forsikringsansvaret over på produsentene.", + "San Francisco vurderer å forby robotbud på fortauene.", "London er en stor by i Storbritannia.", ] diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 57771cca4..3ea2bc3e9 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -114,7 +114,6 @@ emoticons = set( (-: =) (= -") :] :-] [: diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py new file mode 100644 index 000000000..38cd5e0cd --- /dev/null +++ b/spacy/lang/xx/examples.py @@ -0,0 +1,99 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.de.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +# combined examples from de/en/es/fr/it/nl/pl/pt/ru + +sentences = [ + "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen", + "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz", + "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz", + "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion", + "San Francisco erwägt Verbot von Lieferrobotern", + "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller", + "Wo bist du?", + "Was ist die Hauptstadt von Deutschland?", + "Apple is looking at buying U.K. 
startup for $1 billion", + "Autonomous cars shift insurance liability toward manufacturers", + "San Francisco considers banning sidewalk delivery robots", + "London is a big city in the United Kingdom.", + "Where are you?", + "Who is the president of France?", + "What is the capital of the United States?", + "When was Barack Obama born?", + "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.", + "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.", + "San Francisco analiza prohibir los robots delivery.", + "Londres es una gran ciudad del Reino Unido.", + "El gato come pescado.", + "Veo al hombre con el telescopio.", + "La araña come moscas.", + "El pingüino incuba en su nido.", + "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars", + "Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs", + "San Francisco envisage d'interdire les robots coursiers sur les trottoirs", + "Londres est une grande ville du Royaume-Uni", + "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe", + "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon", + "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule", + "Nouvelles attaques de Trump contre le maire de Londres", + "Où es-tu ?", + "Qui est le président de la France ?", + "Où est la capitale des États-Unis ?", + "Quand est né Barack Obama ?", + "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari", + "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori", + "San Francisco prevede di bandire i robot di consegna porta a porta", + "Londra è una grande città del Regno Unito.", + "Apple overweegt om voor 1 miljard een U.K. 
startup te kopen", + "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten", + "San Francisco overweegt robots op voetpaden te verbieden", + "Londen is een grote stad in het Verenigd Koninkrijk", + "Poczuł przyjemną woń mocnej kawy.", + "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.", + "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.", + "Nowy abonament pod lupą Komisji Europejskiej", + "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?", + "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.", + "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.", + "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.." + "São Francisco considera banir os robôs de entrega que andam pelas calçadas.", + "Londres é a maior cidade do Reino Unido.", + # Translations from English: + "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд", + "Беспилотные автомобили перекладывают страховую ответственность на производителя", + "В Сан-Франциско рассматривается возможность запрета роботов-курьеров, которые перемещаются по тротуару", + "Лондон — это большой город в Соединённом Королевстве", + # Native Russian sentences: + # Colloquial: + "Да, нет, наверное!", # Typical polite refusal + "Обратите внимание на необыкновенную красоту этого города-героя Москвы, столицы нашей Родины!", # From a tour guide speech + # Examples of Bookish Russian: + # Quote from "The Golden Calf" + "Рио-де-Жанейро — это моя мечта, и не смейте касаться её своими грязными лапами!", + # Quotes from "Ivan Vasilievich changes his occupation" + "Ты пошто боярыню обидел, смерд?!!", + "Оставь меня, старушка, я в печали!", + # Quotes from Dostoevsky: + "Уж коли я, такой же, как и ты, человек грешный, над тобой умилился и пожалел тебя, 
кольми паче бог", + "В мечтах я нередко, говорит, доходил до страстных помыслов о служении человечеству и может быть действительно пошел бы на крест за людей, если б это вдруг как-нибудь потребовалось, а между тем я двух дней не в состоянии прожить ни с кем в одной комнате, о чем знаю из опыта", + "Зато всегда так происходило, что чем более я ненавидел людей в частности, тем пламеннее становилась любовь моя к человечеству вообще", + # Quotes from Chekhov: + "Ненужные дела и разговоры всё об одном отхватывают на свою долю лучшую часть времени, лучшие силы, и в конце концов остается какая-то куцая, бескрылая жизнь, какая-то чепуха, и уйти и бежать нельзя, точно сидишь в сумасшедшем доме или в арестантских ротах!", + # Quotes from Turgenev: + "Нравится тебе женщина, старайся добиться толку; а нельзя — ну, не надо, отвернись — земля не клином сошлась", + "Узенькое местечко, которое я занимаю, до того крохотно в сравнении с остальным пространством, где меня нет и где дела до меня нет; и часть времени, которую мне удастся прожить, так ничтожна перед вечностью, где меня не было и не будет...", + # Quotes from newspapers: + # Komsomolskaya Pravda: + "На заседании президиума правительства Москвы принято решение присвоить статус инвестиционного приоритетного проекта города Москвы киностудии Союзмультфильм", + "Глава Минобороны Сергей Шойгу заявил, что обстановка на этом стратегическом направлении требует непрерывного совершенствования боевого состава войск", + # Argumenty i Facty: + "На реплику лже-Говина — дескать, он (Волков) будет лучшим революционером — Стамп с энтузиазмом ответил: Непременно!", +] diff --git a/spacy/language.py b/spacy/language.py index 3106c6afe..c84f597d9 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -53,8 +53,8 @@ class BaseDefaults(object): filenames = {name: root / filename for name, filename in cls.resources} if LANG in cls.lex_attr_getters: lang = cls.lex_attr_getters[LANG](None) - user_lookups = 
util.get_entry_point(util.ENTRY_POINTS.lookups, lang, {}) - filenames.update(user_lookups) + if lang in util.registry.lookups: + filenames.update(util.registry.lookups.get(lang)) lookups = Lookups() for name, filename in filenames.items(): data = util.load_language_data(filename) @@ -157,7 +157,7 @@ class Language(object): 100,000 characters in one text. RETURNS (Language): The newly constructed object. """ - user_factories = util.get_entry_points(util.ENTRY_POINTS.factories) + user_factories = util.registry.factories.get_all() self.factories.update(user_factories) self._meta = dict(meta) self._path = None @@ -741,6 +741,7 @@ class Language(object): texts, batch_size=batch_size, disable=disable, + n_process=n_process, component_cfg=component_cfg, as_example=False ) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index ae2ad3ca6..56d27024d 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -240,7 +240,7 @@ cdef class DependencyMatcher: for i, (ent_id, nodes) in enumerate(matched_key_trees): on_match = self._callbacks.get(ent_id) if on_match is not None: - on_match(self, doc, i, matches) + on_match(self, doc, i, matched_key_trees) return matched_key_trees def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees): diff --git a/spacy/ml/common.py b/spacy/ml/common.py index 963d4dc35..f90b53a15 100644 --- a/spacy/ml/common.py +++ b/spacy/ml/common.py @@ -3,10 +3,10 @@ from __future__ import unicode_literals from thinc.api import chain from thinc.v2v import Maxout from thinc.misc import LayerNorm -from ..util import register_architecture, make_layer +from ..util import registry, make_layer -@register_architecture("thinc.FeedForward.v1") +@registry.architectures.register("thinc.FeedForward.v1") def FeedForward(config): layers = [make_layer(layer_cfg) for layer_cfg in config["layers"]] model = chain(*layers) @@ -14,7 +14,7 @@ def 
FeedForward(config): return model -@register_architecture("spacy.LayerNormalizedMaxout.v1") +@registry.architectures.register("spacy.LayerNormalizedMaxout.v1") def LayerNormalizedMaxout(config): width = config["width"] pieces = config["pieces"] diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py index 0b30551b5..8f86475ef 100644 --- a/spacy/ml/tok2vec.py +++ b/spacy/ml/tok2vec.py @@ -6,11 +6,11 @@ from thinc.v2v import Maxout, Model from thinc.i2v import HashEmbed, StaticVectors from thinc.t2t import ExtractWindow from thinc.misc import Residual, LayerNorm, FeatureExtracter -from ..util import make_layer, register_architecture +from ..util import make_layer, registry from ._wire import concatenate_lists -@register_architecture("spacy.Tok2Vec.v1") +@registry.architectures.register("spacy.Tok2Vec.v1") def Tok2Vec(config): doc2feats = make_layer(config["@doc2feats"]) embed = make_layer(config["@embed"]) @@ -24,13 +24,13 @@ def Tok2Vec(config): return tok2vec -@register_architecture("spacy.Doc2Feats.v1") +@registry.architectures.register("spacy.Doc2Feats.v1") def Doc2Feats(config): columns = config["columns"] return FeatureExtracter(columns) -@register_architecture("spacy.MultiHashEmbed.v1") +@registry.architectures.register("spacy.MultiHashEmbed.v1") def MultiHashEmbed(config): # For backwards compatibility with models before the architecture registry, # we have to be careful to get exactly the same model structure. One subtle @@ -78,7 +78,7 @@ def MultiHashEmbed(config): return layer -@register_architecture("spacy.CharacterEmbed.v1") +@registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(config): from .. 
import _ml @@ -94,7 +94,7 @@ def CharacterEmbed(config): return model -@register_architecture("spacy.MaxoutWindowEncoder.v1") +@registry.architectures.register("spacy.MaxoutWindowEncoder.v1") def MaxoutWindowEncoder(config): nO = config["width"] nW = config["window_size"] @@ -110,7 +110,7 @@ def MaxoutWindowEncoder(config): return model -@register_architecture("spacy.MishWindowEncoder.v1") +@registry.architectures.register("spacy.MishWindowEncoder.v1") def MishWindowEncoder(config): from thinc.v2v import Mish @@ -124,12 +124,12 @@ def MishWindowEncoder(config): return model -@register_architecture("spacy.PretrainedVectors.v1") +@registry.architectures.register("spacy.PretrainedVectors.v1") def PretrainedVectors(config): return StaticVectors(config["vectors_name"], config["width"], config["column"]) -@register_architecture("spacy.TorchBiLSTMEncoder.v1") +@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") def TorchBiLSTMEncoder(config): import torch.nn from thinc.extra.wrappers import PyTorchWrapperRNN diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py new file mode 100644 index 000000000..6a43dfea9 --- /dev/null +++ b/spacy/tests/regression/test_issue4590.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from mock import Mock +from spacy.matcher import DependencyMatcher +from ..util import get_doc + + +def test_issue4590(en_vocab): + """Test that matches param in on_match method are the same as matches run with no on_match method""" + pattern = [ + {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, + {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, + {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, + ] + + on_match = Mock() + + matcher = DependencyMatcher(en_vocab) + matcher.add("pattern", on_match, pattern) + + text = "The quick brown fox 
jumped over the lazy fox" + heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] + deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"] + + doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) + + matches = matcher(doc) + + on_match_args = on_match.call_args + + assert on_match_args[0][3] == matches + diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py new file mode 100644 index 000000000..77f1af020 --- /dev/null +++ b/spacy/tests/test_architectures.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy import registry +from thinc.v2v import Affine +from catalogue import RegistryError + + +@registry.architectures.register("my_test_function") +def create_model(nr_in, nr_out): + return Affine(nr_in, nr_out) + + +def test_get_architecture(): + arch = registry.architectures.get("my_test_function") + assert arch is create_model + with pytest.raises(RegistryError): + registry.architectures.get("not_an_existing_key") diff --git a/spacy/tests/test_register_architecture.py b/spacy/tests/test_register_architecture.py deleted file mode 100644 index 0c1b5b16f..000000000 --- a/spacy/tests/test_register_architecture.py +++ /dev/null @@ -1,19 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import pytest -from spacy import register_architecture -from spacy import get_architecture -from thinc.v2v import Affine - - -@register_architecture("my_test_function") -def create_model(nr_in, nr_out): - return Affine(nr_in, nr_out) - - -def test_get_architecture(): - arch = get_architecture("my_test_function") - assert arch is create_model - with pytest.raises(KeyError): - get_architecture("not_an_existing_key") diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index a79363abb..c2011487e 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -7,7 +7,7 @@ import pytest def 
test_tokenizer_handles_emoticons(tokenizer): # Tweebo challenge (CMU) - text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" + text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| :> ....""" tokens = tokenizer(text) assert tokens[0].text == ":o" assert tokens[1].text == ":/" @@ -28,12 +28,11 @@ def test_tokenizer_handles_emoticons(tokenizer): assert tokens[16].text == ">:(" assert tokens[17].text == ":D" assert tokens[18].text == "=|" - assert tokens[19].text == '")' - assert tokens[20].text == ":>" - assert tokens[21].text == "...." + assert tokens[19].text == ":>" + assert tokens[20].text == "...." -@pytest.mark.parametrize("text,length", [("example:)", 3), ("108)", 2), ("XDN", 1)]) +@pytest.mark.parametrize("text,length", [("108)", 2), ("XDN", 1)]) def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 803c31abf..5ac681c5e 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -108,6 +108,12 @@ def test_tokenizer_add_special_case(tokenizer, text, tokens): assert doc[1].text == tokens[1]["orth"] +@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "re"}])]) +def test_tokenizer_validate_special_case(tokenizer, text, tokens): + with pytest.raises(ValueError): + tokenizer.add_special_case(text, tokens) + + @pytest.mark.parametrize( "text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])] ) @@ -120,3 +126,18 @@ def test_tokenizer_add_special_case_tag(text, tokens): assert doc[0].tag_ == tokens[0]["tag"] assert doc[0].pos_ == "NOUN" assert doc[1].text == tokens[1]["orth"] + + +def test_tokenizer_special_cases_with_affixes(tokenizer): + text = '(((_SPECIAL_ A/B, A/B-A/B")' + tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}]) + 
tokenizer.add_special_case("A/B", [{"orth": "A/B"}]) + doc = tokenizer(text) + assert [token.text for token in doc] == ["(", "(", "(", "_SPECIAL_", "A/B", ",", "A/B", "-", "A/B", '"', ")"] + + +def test_tokenizer_special_cases_with_period(tokenizer): + text = "_SPECIAL_." + tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}]) + doc = tokenizer(text) + assert [token.text for token in doc] == ["_SPECIAL_", "."] diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 0e287aada..a1017bac8 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import pytest +from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS + URLS_BASIC = [ "http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", @@ -194,7 +196,12 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url): @pytest.mark.parametrize("url", URLS_FULL) def test_tokenizer_handles_two_suffix_url(tokenizer, suffix1, suffix2, url): tokens = tokenizer(url + suffix1 + suffix2) - assert len(tokens) == 3 - assert tokens[0].text == url - assert tokens[1].text == suffix1 - assert tokens[2].text == suffix2 + if suffix1 + suffix2 in BASE_EXCEPTIONS: + assert len(tokens) == 2 + assert tokens[0].text == url + assert tokens[1].text == suffix1 + suffix2 + else: + assert len(tokens) == 3 + assert tokens[0].text == url + assert tokens[1].text == suffix1 + assert tokens[2].text == suffix2 diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index dadbad7bd..ba22f7782 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -4,10 +4,11 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .structs cimport LexemeC, TokenC +from .structs cimport LexemeC, SpanC, TokenC from .strings 
cimport StringStore from .tokens.doc cimport Doc from .vocab cimport Vocab, LexemesOrTokens, _Cached +from .matcher.phrasematcher cimport PhraseMatcher cdef class Tokenizer: @@ -21,15 +22,32 @@ cdef class Tokenizer: cdef object _suffix_search cdef object _infix_finditer cdef object _rules + cdef PhraseMatcher _special_matcher + cdef int _property_init_count + cdef int _property_init_max cpdef Doc tokens_from_list(self, list strings) + cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases) + cdef int _apply_special_cases(self, Doc doc) except -1 + cdef void _filter_special_spans(self, vector[SpanC] &original, + vector[SpanC] &filtered, int doc_len) nogil + cdef object _prepare_special_spans(self, Doc doc, + vector[SpanC] &filtered) + cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens, + object span_data) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 - cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 - cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, - vector[LexemeC*] *suffixes, int* has_special) + cdef int _try_specials(self, hash_t key, Doc tokens, + int* has_special) except -1 + cdef int _tokenize(self, Doc tokens, unicode span, hash_t key, + int* has_special, bint with_special_cases) except -1 + cdef unicode _split_affixes(self, Pool mem, unicode string, + vector[LexemeC*] *prefixes, + vector[LexemeC*] *suffixes, int* has_special, + bint with_special_cases) cdef int _attach_tokens(self, Doc tokens, unicode string, - vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1 - - cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special, - int n) except -1 + vector[LexemeC*] *prefixes, + vector[LexemeC*] *suffixes, int* has_special, + bint with_special_cases) except -1 + cdef int _save_cached(self, const TokenC* tokens, hash_t key, + int* has_special, int n) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 
262f19941..13f799f84 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -5,6 +5,8 @@ from __future__ import unicode_literals from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc +from libc.string cimport memcpy, memset +from libcpp.set cimport set as stdset from cymem.cymem cimport Pool from preshed.maps cimport PreshMap cimport cython @@ -19,6 +21,9 @@ from .compat import unescape_unicode from .errors import Errors, Warnings, deprecation_warning from . import util +from .attrs import intify_attrs +from .lexeme cimport EMPTY_LEXEME +from .symbols import ORTH cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment @@ -57,9 +62,10 @@ cdef class Tokenizer: self.infix_finditer = infix_finditer self.vocab = vocab self._rules = {} - if rules is not None: - for chunk, substrings in sorted(rules.items()): - self.add_special_case(chunk, substrings) + self._special_matcher = PhraseMatcher(self.vocab) + self._load_special_cases(rules) + self._property_init_count = 0 + self._property_init_max = 4 property token_match: def __get__(self): @@ -67,7 +73,9 @@ cdef class Tokenizer: def __set__(self, token_match): self._token_match = token_match - self._flush_cache() + self._reload_special_cases() + if self._property_init_count <= self._property_init_max: + self._property_init_count += 1 property prefix_search: def __get__(self): @@ -75,7 +83,9 @@ cdef class Tokenizer: def __set__(self, prefix_search): self._prefix_search = prefix_search - self._flush_cache() + self._reload_special_cases() + if self._property_init_count <= self._property_init_max: + self._property_init_count += 1 property suffix_search: def __get__(self): @@ -83,7 +93,9 @@ cdef class Tokenizer: def __set__(self, suffix_search): self._suffix_search = suffix_search - self._flush_cache() + self._reload_special_cases() + if self._property_init_count <= self._property_init_max: + self._property_init_count += 1 property 
infix_finditer: def __get__(self): @@ -91,7 +103,9 @@ cdef class Tokenizer: def __set__(self, infix_finditer): self._infix_finditer = infix_finditer - self._flush_cache() + self._reload_special_cases() + if self._property_init_count <= self._property_init_max: + self._property_init_count += 1 def __reduce__(self): args = (self.vocab, @@ -106,7 +120,6 @@ cdef class Tokenizer: deprecation_warning(Warnings.W002) return Doc(self.vocab, words=strings) - @cython.boundscheck(False) def __call__(self, unicode string): """Tokenize a string. @@ -115,6 +128,17 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#call """ + doc = self._tokenize_affixes(string, True) + self._apply_special_cases(doc) + return doc + + @cython.boundscheck(False) + cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases): + """Tokenize according to affix and token_match settings. + + string (unicode): The string to tokenize. + RETURNS (Doc): A container for linguistic annotations. + """ if len(string) >= (2 ** 30): raise ValueError(Errors.E025.format(length=len(string))) cdef int length = len(string) @@ -123,7 +147,9 @@ cdef class Tokenizer: return doc cdef int i = 0 cdef int start = 0 - cdef bint cache_hit + cdef int has_special = 0 + cdef bint specials_hit = 0 + cdef bint cache_hit = 0 cdef bint in_ws = string[0].isspace() cdef unicode span # The task here is much like string.split, but not quite @@ -139,9 +165,14 @@ cdef class Tokenizer: # we don't have to create the slice when we hit the cache. 
span = string[start:i] key = hash_string(span) - cache_hit = self._try_cache(key, doc) - if not cache_hit: - self._tokenize(doc, span, key) + specials_hit = 0 + cache_hit = 0 + if with_special_cases: + specials_hit = self._try_specials(key, doc, &has_special) + if not specials_hit: + cache_hit = self._try_cache(key, doc) + if not specials_hit and not cache_hit: + self._tokenize(doc, span, key, &has_special, with_special_cases) if uc == ' ': doc.c[doc.length - 1].spacy = True start = i + 1 @@ -152,9 +183,14 @@ cdef class Tokenizer: if start < i: span = string[start:] key = hash_string(span) - cache_hit = self._try_cache(key, doc) - if not cache_hit: - self._tokenize(doc, span, key) + specials_hit = 0 + cache_hit = 0 + if with_special_cases: + specials_hit = self._try_specials(key, doc, &has_special) + if not specials_hit: + cache_hit = self._try_cache(key, doc) + if not specials_hit and not cache_hit: + self._tokenize(doc, span, key, &has_special, with_special_cases) doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws return doc @@ -174,23 +210,141 @@ cdef class Tokenizer: yield self(text) def _flush_cache(self): - self._reset_cache([key for key in self._cache if not key in self._specials]) + self._reset_cache([key for key in self._cache]) def _reset_cache(self, keys): for k in keys: + cached = <_Cached*>self._cache.get(k) del self._cache[k] - if not k in self._specials: - cached = <_Cached*>self._cache.get(k) - if cached is not NULL: - self.mem.free(cached) + if cached is not NULL: + self.mem.free(cached) - def _reset_specials(self): + def _flush_specials(self): for k in self._specials: cached = <_Cached*>self._specials.get(k) del self._specials[k] if cached is not NULL: self.mem.free(cached) + cdef int _apply_special_cases(self, Doc doc) except -1: + """Retokenize doc according to special cases. + + doc (Doc): Document. 
+ """ + cdef int i + cdef int max_length = 0 + cdef bint modify_in_place + cdef Pool mem = Pool() + cdef vector[SpanC] c_matches + cdef vector[SpanC] c_filtered + cdef int offset + cdef int modified_doc_length + # Find matches for special cases + self._special_matcher.find_matches(doc, &c_matches) + # Skip processing if no matches + if c_matches.size() == 0: + return True + self._filter_special_spans(c_matches, c_filtered, doc.length) + # Put span info in span.start-indexed dict and calculate maximum + # intermediate document size + (span_data, max_length, modify_in_place) = self._prepare_special_spans(doc, c_filtered) + # If modifications never increase doc length, can modify in place + if modify_in_place: + tokens = doc.c + # Otherwise create a separate array to store modified tokens + else: + tokens = mem.alloc(max_length, sizeof(TokenC)) + # Modify tokenization according to filtered special cases + offset = self._retokenize_special_spans(doc, tokens, span_data) + # Allocate more memory for doc if needed + modified_doc_length = doc.length + offset + while modified_doc_length >= doc.max_length: + doc._realloc(doc.max_length * 2) + # If not modified in place, copy tokens back to doc + if not modify_in_place: + memcpy(doc.c, tokens, max_length * sizeof(TokenC)) + for i in range(doc.length + offset, doc.length): + memset(&doc.c[i], 0, sizeof(TokenC)) + doc.c[i].lex = &EMPTY_LEXEME + doc.length = doc.length + offset + return True + + cdef void _filter_special_spans(self, vector[SpanC] &original, vector[SpanC] &filtered, int doc_len) nogil: + + cdef int seen_i + cdef SpanC span + cdef stdset[int] seen_tokens + stdsort(original.begin(), original.end(), len_start_cmp) + cdef int orig_i = original.size() - 1 + while orig_i >= 0: + span = original[orig_i] + if not seen_tokens.count(span.start) and not seen_tokens.count(span.end - 1): + filtered.push_back(span) + for seen_i in range(span.start, span.end): + seen_tokens.insert(seen_i) + orig_i -= 1 + 
stdsort(filtered.begin(), filtered.end(), start_cmp) + + cdef object _prepare_special_spans(self, Doc doc, vector[SpanC] &filtered): + spans = [doc[match.start:match.end] for match in filtered] + cdef bint modify_in_place = True + cdef int curr_length = doc.length + cdef int max_length + cdef int span_length_diff = 0 + span_data = {} + for span in spans: + rule = self._rules.get(span.text, None) + span_length_diff = 0 + if rule: + span_length_diff = len(rule) - (span.end - span.start) + if span_length_diff > 0: + modify_in_place = False + curr_length += span_length_diff + if curr_length > max_length: + max_length = curr_length + span_data[span.start] = (span.text, span.start, span.end, span_length_diff) + return (span_data, max_length, modify_in_place) + + cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens, object span_data): + cdef int i = 0 + cdef int j = 0 + cdef int offset = 0 + cdef _Cached* cached + cdef int idx_offset = 0 + cdef int orig_final_spacy + cdef int orig_idx + cdef int span_start + cdef int span_end + while i < doc.length: + if not i in span_data: + tokens[i + offset] = doc.c[i] + i += 1 + else: + span = span_data[i] + span_start = span[1] + span_end = span[2] + cached = <_Cached*>self._specials.get(hash_string(span[0])) + if cached == NULL: + # Copy original tokens if no rule found + for j in range(span_end - span_start): + tokens[i + offset + j] = doc.c[i + j] + i += span_end - span_start + else: + # Copy special case tokens into doc and adjust token and + # character offsets + idx_offset = 0 + orig_final_spacy = doc.c[span_end + offset - 1].spacy + orig_idx = doc.c[i].idx + for j in range(cached.length): + tokens[i + offset + j] = cached.data.tokens[j] + tokens[i + offset + j].idx = orig_idx + idx_offset + idx_offset += cached.data.tokens[j].lex.length + \ + 1 if cached.data.tokens[j].spacy else 0 + tokens[i + offset + cached.length - 1].spacy = orig_final_spacy + i += span_end - span_start + offset += span[3] + return offset + 
cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: @@ -204,22 +358,33 @@ cdef class Tokenizer: tokens.push_back(&cached.data.tokens[i], False) return True - cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key) except -1: + cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1: + cached = <_Cached*>self._specials.get(key) + if cached == NULL: + return False + cdef int i + for i in range(cached.length): + tokens.push_back(&cached.data.tokens[i], False) + has_special[0] = 1 + return True + + cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1: cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] suffixes cdef int orig_size - cdef int has_special = 0 orig_size = tokens.length span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes, - &has_special) - self._attach_tokens(tokens, span, &prefixes, &suffixes) + has_special, with_special_cases) + self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special, + with_special_cases) self._save_cached(&tokens.c[orig_size], orig_key, has_special, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, vector[const LexemeC*] *suffixes, - int* has_special): + int* has_special, + bint with_special_cases): cdef size_t i cdef unicode prefix cdef unicode suffix @@ -231,29 +396,24 @@ cdef class Tokenizer: and not self.find_prefix(string) \ and not self.find_suffix(string): break - if self._specials.get(hash_string(string)) != NULL: - has_special[0] = 1 + if with_special_cases and self._specials.get(hash_string(string)) != NULL: break last_size = len(string) pre_len = self.find_prefix(string) if pre_len != 0: prefix = string[:pre_len] minus_pre = string[pre_len:] - # Check whether we've hit a special-case - if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL: + if 
minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL: string = minus_pre prefixes.push_back(self.vocab.get(mem, prefix)) - has_special[0] = 1 break suf_len = self.find_suffix(string) if suf_len != 0: suffix = string[-suf_len:] minus_suf = string[:-suf_len] - # Check whether we've hit a special-case - if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL): + if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL: string = minus_suf suffixes.push_back(self.vocab.get(mem, suffix)) - has_special[0] = 1 break if pre_len and suf_len and (pre_len + suf_len) <= len(string): string = string[pre_len:-suf_len] @@ -265,15 +425,15 @@ cdef class Tokenizer: elif suf_len: string = minus_suf suffixes.push_back(self.vocab.get(mem, suffix)) - if string and (self._specials.get(hash_string(string)) != NULL): - has_special[0] = 1 - break return string cdef int _attach_tokens(self, Doc tokens, unicode string, vector[const LexemeC*] *prefixes, - vector[const LexemeC*] *suffixes) except -1: - cdef bint cache_hit + vector[const LexemeC*] *suffixes, + int* has_special, + bint with_special_cases) except -1: + cdef bint specials_hit = 0 + cdef bint cache_hit = 0 cdef int split, end cdef const LexemeC* const* lexemes cdef const LexemeC* lexeme @@ -283,8 +443,12 @@ cdef class Tokenizer: for i in range(prefixes.size()): tokens.push_back(prefixes[0][i], False) if string: - cache_hit = self._try_cache(hash_string(string), tokens) - if cache_hit: + if with_special_cases: + specials_hit = self._try_specials(hash_string(string), tokens, + has_special) + if not specials_hit: + cache_hit = self._try_cache(hash_string(string), tokens) + if specials_hit or cache_hit: pass elif self.token_match and self.token_match(string): # We're always saying 'no' to spaces here -- the caller will @@ -329,7 +493,7 @@ cdef class Tokenizer: tokens.push_back(lexeme, False) cdef int _save_cached(self, const TokenC* tokens, hash_t key, - 
int has_special, int n) except -1: + int* has_special, int n) except -1: cdef int i if n <= 0: # avoid mem alloc of zero length @@ -338,7 +502,7 @@ cdef class Tokenizer: if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: return 0 # See #1250 - if has_special: + if has_special[0]: return 0 cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = n @@ -391,10 +555,24 @@ cdef class Tokenizer: match = self.suffix_search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, special_cases): + def _load_special_cases(self, special_cases): """Add special-case tokenization rules.""" - for chunk, substrings in sorted(special_cases.items()): - self.add_special_case(chunk, substrings) + if special_cases is not None: + for chunk, substrings in sorted(special_cases.items()): + self._validate_special_case(chunk, substrings) + self.add_special_case(chunk, substrings) + + def _validate_special_case(self, chunk, substrings): + """Check whether the `ORTH` fields match the string. + + string (unicode): The string to specially tokenize. + substrings (iterable): A sequence of dicts, where each dict describes + a token and its attributes. + """ + attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings] + orth = "".join([spec[ORTH] for spec in attrs]) + if chunk != orth: + raise ValueError(Errors.E187.format(chunk=chunk, orth=orth, token_attrs=substrings)) def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. 
@@ -406,6 +584,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#add_special_case """ + self._validate_special_case(string, substrings) substrings = list(substrings) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) @@ -413,15 +592,25 @@ cdef class Tokenizer: cached.data.tokens = self.vocab.make_fused_token(substrings) key = hash_string(string) stale_special = <_Cached*>self._specials.get(key) - stale_cached = <_Cached*>self._cache.get(key) - self._flush_cache() self._specials.set(key, cached) - self._cache.set(key, cached) if stale_special is not NULL: self.mem.free(stale_special) - if stale_special != stale_cached and stale_cached is not NULL: - self.mem.free(stale_cached) self._rules[string] = substrings + self._flush_cache() + if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string): + self._special_matcher.add(string, None, self._tokenize_affixes(string, False)) + + def _reload_special_cases(self): + try: + self._property_init_count + except AttributeError: + return + # only reload if all 4 of prefix, suffix, infix, token_match have + # have been initialized + if self.vocab is not None and self._property_init_count >= self._property_init_max: + self._flush_cache() + self._flush_specials() + self._load_special_cases(self._rules) def to_disk(self, path, **kwargs): """Save the current state to a directory. 
@@ -503,12 +692,9 @@ cdef class Tokenizer: if data.get("rules"): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} - self._reset_cache([key for key in self._cache]) - self._reset_specials() - self._cache = PreshMap() - self._specials = PreshMap() - for string, substrings in data.get("rules", {}).items(): - self.add_special_case(string, substrings) + self._flush_cache() + self._flush_specials() + self._load_special_cases(data.get("rules", {})) return self @@ -516,3 +702,19 @@ cdef class Tokenizer: def _get_regex_pattern(regex): """Get a pattern string for a regex, or None if the pattern is None.""" return None if regex is None else regex.__self__.pattern + + +cdef extern from "" namespace "std" nogil: + void stdsort "sort"(vector[SpanC].iterator, + vector[SpanC].iterator, + bint (*)(SpanC, SpanC)) + + +cdef bint len_start_cmp(SpanC a, SpanC b) nogil: + if a.end - a.start == b.end - b.start: + return b.start < a.start + return a.end - a.start < b.end - b.start + + +cdef bint start_cmp(SpanC a, SpanC b) nogil: + return a.start < b.start diff --git a/spacy/util.py b/spacy/util.py index f9e51f7d5..21c5ea427 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,6 +13,7 @@ import functools import itertools import numpy.random import srsly +import catalogue import sys try: @@ -27,29 +28,20 @@ except ImportError: from .symbols import ORTH from .compat import cupy, CudaStream, path2str, basestring_, unicode_ -from .compat import import_file, importlib_metadata +from .compat import import_file from .errors import Errors, Warnings, deprecation_warning -LANGUAGES = {} -ARCHITECTURES = {} _data_path = Path(__file__).parent / "data" _PRINT_ENV = False -# NB: Ony ever call this once! If called more than ince within the -# function, test_issue1506 hangs and it's not 100% clear why. 
-AVAILABLE_ENTRY_POINTS = importlib_metadata.entry_points() - - -class ENTRY_POINTS(object): - """Available entry points to register extensions.""" - - factories = "spacy_factories" - languages = "spacy_languages" - displacy_colors = "spacy_displacy_colors" - lookups = "spacy_lookups" - architectures = "spacy_architectures" +class registry(object): + languages = catalogue.create("spacy", "languages", entry_points=True) + architectures = catalogue.create("spacy", "architectures", entry_points=True) + lookups = catalogue.create("spacy", "lookups", entry_points=True) + factories = catalogue.create("spacy", "factories", entry_points=True) + displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) def set_env_log(value): @@ -65,8 +57,7 @@ def lang_class_is_loaded(lang): lang (unicode): Two-letter language code, e.g. 'en'. RETURNS (bool): Whether a Language class has been loaded. """ - global LANGUAGES - return lang in LANGUAGES + return lang in registry.languages def get_lang_class(lang): @@ -75,19 +66,16 @@ def get_lang_class(lang): lang (unicode): Two-letter language code, e.g. 'en'. RETURNS (Language): Language class. 
""" - global LANGUAGES - # Check if an entry point is exposed for the language code - entry_point = get_entry_point(ENTRY_POINTS.languages, lang) - if entry_point is not None: - LANGUAGES[lang] = entry_point - return entry_point - if lang not in LANGUAGES: + # Check if language is registered / entry point is available + if lang in registry.languages: + return registry.languages.get(lang) + else: try: module = importlib.import_module(".lang.%s" % lang, "spacy") except ImportError as err: raise ImportError(Errors.E048.format(lang=lang, err=err)) - LANGUAGES[lang] = getattr(module, module.__all__[0]) - return LANGUAGES[lang] + set_lang_class(lang, getattr(module, module.__all__[0])) + return registry.languages.get(lang) def set_lang_class(name, cls): @@ -96,8 +84,7 @@ def set_lang_class(name, cls): name (unicode): Name of Language class. cls (Language): Language class. """ - global LANGUAGES - LANGUAGES[name] = cls + registry.languages.register(name, func=cls) def get_data_path(require_exists=True): @@ -121,49 +108,11 @@ def set_data_path(path): _data_path = ensure_path(path) -def register_architecture(name, arch=None): - """Decorator to register an architecture. An architecture is a function - that returns a Thinc Model object. - - name (unicode): The name of the architecture to register. - arch (Model): Optional architecture if function is called directly and - not used as a decorator. - RETURNS (callable): Function to register architecture. - """ - global ARCHITECTURES - if arch is not None: - ARCHITECTURES[name] = arch - return arch - - def do_registration(arch): - ARCHITECTURES[name] = arch - return arch - - return do_registration - - def make_layer(arch_config): - arch_func = get_architecture(arch_config["arch"]) + arch_func = registry.architectures.get(arch_config["arch"]) return arch_func(arch_config["config"]) -def get_architecture(name): - """Get a model architecture function by name. Raises a KeyError if the - architecture is not found. 
- - name (unicode): The mame of the architecture. - RETURNS (Model): The architecture. - """ - # Check if an entry point is exposed for the architecture code - entry_point = get_entry_point(ENTRY_POINTS.architectures, name) - if entry_point is not None: - ARCHITECTURES[name] = entry_point - if name not in ARCHITECTURES: - names = ", ".join(sorted(ARCHITECTURES.keys())) - raise KeyError(Errors.E174.format(name=name, names=names)) - return ARCHITECTURES[name] - - def ensure_path(path): """Ensure string is converted to a Path. @@ -327,34 +276,6 @@ def get_package_path(name): return Path(pkg.__file__).parent -def get_entry_points(key): - """Get registered entry points from other packages for a given key, e.g. - 'spacy_factories' and return them as a dictionary, keyed by name. - - key (unicode): Entry point name. - RETURNS (dict): Entry points, keyed by name. - """ - result = {} - for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []): - result[entry_point.name] = entry_point.load() - return result - - -def get_entry_point(key, value, default=None): - """Check if registered entry point is available for a given name and - load it. Otherwise, return None. - - key (unicode): Entry point name. - value (unicode): Name of entry point to load. - default: Optional default value to return. - RETURNS: The loaded entry point or None. - """ - for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []): - if entry_point.name == value: - return entry_point.load() - return default - - def is_in_jupyter(): """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer. diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index 41ebb6075..9f12a07e6 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -109,8 +109,8 @@ raise an error if the pre-defined attrs of the two `DocBin`s don't match. 
> doc_bin1.add(nlp("Hello world")) > doc_bin2 = DocBin(attrs=["LEMMA", "POS"]) > doc_bin2.add(nlp("This is a sentence")) -> merged_bins = doc_bin1.merge(doc_bin2) -> assert len(merged_bins) == 2 +> doc_bin1.merge(doc_bin2) +> assert len(doc_bin1) == 2 > ``` | Argument | Type | Description | diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md index 1ecaf9fe7..0e8784187 100644 --- a/website/docs/usage/101/_named-entities.md +++ b/website/docs/usage/101/_named-entities.md @@ -1,5 +1,5 @@ A named entity is a "real-world object" that's assigned a name – for example, a -person, a country, a product or a book title. spaCy can **recognize** +person, a country, a product or a book title. spaCy can **recognize** [various types](/api/annotation#named-entities) of named entities in a document, by asking the model for a **prediction**. Because models are statistical and strongly depend on the examples they were trained on, this doesn't always work diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 441297813..2b0045bc3 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and > possible, the new docs also include notes on features that have changed in > v2.0, and features that were introduced in the new version. + + +We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8, +as we're still waiting for our CI providers and other tooling to support it. +This means that in order to run spaCy on Python 3.8, you'll need +[a compiler installed](#source) and compile the library and its Cython +dependencies locally. If this is causing problems for you, the easiest solution +is to **use Python 3.7** in the meantime. 
+ + + ## Quickstart {hidden="true"} import QuickstartInstall from 'widgets/quickstart-install.js' diff --git a/website/meta/universe.json b/website/meta/universe.json index 749abc659..40ebfaaa7 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1861,6 +1861,30 @@ "author_links": { "github": "microsoft" } + }, + { + "id": "dframcy", + "title": "Dframcy", + "slogan": "Dataframe Integration with spaCy NLP", + "github": "yash1994/dframcy", + "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.", + "pip": "dframcy", + "category": ["pipeline", "training"], + "tags": ["pandas"], + "code_example": [ + "import spacy", + "from dframcy import DframCy", + "", + "nlp = spacy.load('en_core_web_sm')", + "dframcy = DframCy(nlp)", + "doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')", + "annotation_dataframe = dframcy.to_dataframe(doc)" + ], + "author": "Yash Patadia", + "author_links": { + "twitter": "PatadiaYash", + "github": "yash1994" + } } ], From 44829950ba8ae019e60560701f7d62069f664cc9 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sat, 23 Nov 2019 14:32:15 +0100 Subject: [PATCH 005/187] Fix Example details for train CLI / pipeline components (#4624) * Switch to train_dataset() function in train CLI * Fixes for pipe() methods in pipeline components * Don't clobber `examples` variable with `as_example` in pipe() methods * Remove unnecessary traversals of `examples` * Update Parser.pipe() for Examples * Add `as_examples` kwarg to `pipe()` with implementation to return `Example`s * Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from `Pipe`) * Fixes to Example implementation in spacy.gold * Move `make_projective` from an attribute of Example to an argument of `Example.get_gold_parses()` * Head of 0 are not treated as unset * Unset heads are set to self rather than `None` (which causes problems while projectivizing) * Check 
for `Doc` (not just not `None`) when creating GoldParses for pre-merged example * Don't clobber `examples` variable in `iter_gold_docs()` * Add/modify gold tests for handling projectivity * In JSON roundtrip compare results from `dev_dataset` rather than `train_dataset` to avoid projectivization (and other potential modifications) * Add test for projective train vs. nonprojective dev versions of the same `Doc` * Handle ignore_misaligned as arg rather than attr Move `ignore_misaligned` from an attribute of `Example` to an argument to `Example.get_gold_parses()`, which makes it parallel to `make_projective`. Add test with old and new align that checks whether `ignore_misaligned` errors are raised as expected (only for new align). * Remove unused attrs from gold.pxd Remove `ignore_misaligned` and `make_projective` from `gold.pxd` * Refer to Example.goldparse in iter_gold_docs() Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold` because a `None` `GoldParse` is generated with ignore_misaligned and generating it on-the-fly can raise an unwanted AlignmentError * Update test for ignore_misaligned --- spacy/cli/train.py | 2 +- spacy/gold.pxd | 2 - spacy/gold.pyx | 61 +++++++++-------- spacy/pipeline/pipes.pyx | 36 +++++----- spacy/syntax/nn_parser.pyx | 22 +++++-- spacy/tests/test_gold.py | 130 +++++++++++++++++++++++++++++++------ 6 files changed, 179 insertions(+), 74 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 622a9ca97..645d1e4d4 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -340,7 +340,7 @@ def train( iter_since_best = 0 best_score = 0.0 for i in range(n_iter): - train_data = corpus.train_data( + train_data = corpus.train_dataset( nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 6027d85b6..8527ba2b6 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -58,8 +58,6 @@ cdef class Example: cdef public object doc cdef public list 
token_annotations cdef public DocAnnotation doc_annotation - cdef public object make_projective - cdef public object ignore_misaligned cdef public object goldparse diff --git a/spacy/gold.pyx b/spacy/gold.pyx index d79bc8205..39e867a33 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -311,47 +311,50 @@ class GoldCorpus(object): ignore_misaligned=ignore_misaligned) yield from gold_examples - def train_dataset_without_preprocessing(self, nlp, gold_preproc=False): - examples = self.iter_gold_docs(nlp, self.train_examples, gold_preproc=gold_preproc) + def train_dataset_without_preprocessing(self, nlp, gold_preproc=False, + ignore_misaligned=False): + examples = self.iter_gold_docs(nlp, self.train_examples, + gold_preproc=gold_preproc, + ignore_misaligned=ignore_misaligned) yield from examples def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_gold_docs(nlp, self.dev_examples, gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned) + examples = self.iter_gold_docs(nlp, self.dev_examples, + gold_preproc=gold_preproc, + ignore_misaligned=ignore_misaligned) yield from examples @classmethod def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None, - noise_level=0.0, orth_variant_level=0.0, make_projective=False, - ignore_misaligned=False): + noise_level=0.0, orth_variant_level=0.0, + make_projective=False, ignore_misaligned=False): """ Setting gold_preproc will result in creating a doc per 'sentence' """ for example in examples: if gold_preproc: example.doc = None else: example = example.merge_sents() - example.make_projective = make_projective - example.ignore_misaligned = ignore_misaligned - examples = cls._make_docs(nlp, example, + example_docs = cls._make_docs(nlp, example, gold_preproc, noise_level=noise_level, orth_variant_level=orth_variant_level) - examples = cls._make_golds(examples, vocab=nlp.vocab) - for ex in examples: - if ex.gold is not None: + example_golds = cls._make_golds(example_docs, 
vocab=nlp.vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned) + for ex in example_golds: + if ex.goldparse is not None: if (not max_length) or len(ex.doc) < max_length: yield ex @classmethod def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0): + var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level) # gold_preproc is not used ?! if example.text is not None: - var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level) var_text = add_noise(var_example.text, noise_level) var_doc = nlp.make_doc(var_text) var_example.doc = var_doc return [var_example] else: - var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level) doc_examples = [] for token_annotation in var_example.token_annotations: t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level)) @@ -362,10 +365,13 @@ class GoldCorpus(object): return doc_examples @classmethod - def _make_golds(cls, examples, vocab=None): + def _make_golds(cls, examples, vocab=None, make_projective=False, + ignore_misaligned=False): gold_examples = [] for example in examples: - gold_parses = example.get_gold_parses(vocab=vocab) + gold_parses = example.get_gold_parses(vocab=vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned) for (doc, gold) in gold_parses: ex = Example(doc=doc) ex.goldparse = gold @@ -693,13 +699,11 @@ cdef class DocAnnotation: cdef class Example: def __init__(self, doc_annotation=None, token_annotations=None, doc=None, - make_projective=False, ignore_misaligned=False, goldparse=None): + goldparse=None): """ Doc can either be text, or an actual Doc """ self.doc = doc self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() self.token_annotations = token_annotations if token_annotations else [] - self.make_projective = make_projective - self.ignore_misaligned = ignore_misaligned self.goldparse = goldparse 
@classmethod @@ -760,7 +764,7 @@ cdef class Example: m_ids.extend(id_ + i for id_ in t.ids) m_words.extend(t.words) m_tags.extend(t.tags) - m_heads.extend(head + i if head else None for head in t.heads) + m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads)) m_deps.extend(t.deps) m_ents.extend(t.entities) m_morph.extend(t.morphology) @@ -773,7 +777,8 @@ cdef class Example: return m_example - def get_gold_parses(self, merge=False, vocab=None): + def get_gold_parses(self, merge=False, vocab=None, make_projective=False, + ignore_misaligned=False): """Return a list of (doc, GoldParse) objects. If merge is set to True, add all Token annotations to one big list.""" d = self.doc_annotation @@ -788,20 +793,20 @@ cdef class Example: raise ValueError(Errors.E998) m_doc = Doc(vocab, words=t.words) try: - gp = GoldParse.from_annotation(m_doc, d, t, make_projective=self.make_projective) + gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective) except AlignmentError: - if self.ignore_misaligned: + if ignore_misaligned: gp = None else: raise return [(self.doc, gp)] # we only have one sentence and an appropriate doc - elif len(self.token_annotations) == 1 and self.doc is not None: + elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc): t = self.token_annotations[0] try: - gp = GoldParse.from_annotation(self.doc, d, t, make_projective=self.make_projective) + gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective) except AlignmentError: - if self.ignore_misaligned: + if ignore_misaligned: gp = None else: raise @@ -814,9 +819,9 @@ cdef class Example: raise ValueError(Errors.E998) t_doc = Doc(vocab, words=t.words) try: - gp = GoldParse.from_annotation(t_doc, d, t, make_projective=self.make_projective) + gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective) except AlignmentError: - if self.ignore_misaligned: + if ignore_misaligned: gp = None 
else: raise diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 1d67d8e16..04a769b27 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -61,7 +61,7 @@ class Pipe(object): return cls(nlp.vocab, **cfg) def _get_doc(self, example): - """ Use this method if the `example` method can be both a Doc or an Example """ + """ Use this method if the `example` can be both a Doc or an Example """ if isinstance(example, Doc): return example return example.doc @@ -102,7 +102,6 @@ class Pipe(object): and `set_annotations()` methods. """ for examples in util.minibatch(stream, size=batch_size): - examples = list(examples) docs = [self._get_doc(ex) for ex in examples] predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: @@ -112,11 +111,11 @@ class Pipe(object): self.set_annotations(docs, predictions) if as_example: - examples = [] + annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - examples.append(ex) - yield from examples + annotated_examples.append(ex) + yield from annotated_examples else: yield from docs @@ -312,11 +311,11 @@ class Tensorizer(Pipe): self.set_annotations(docs, tensors) if as_example: - examples = [] + annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - examples.append(ex) - yield from examples + annotated_examples.append(ex) + yield from annotated_examples else: yield from docs @@ -434,17 +433,16 @@ class Tagger(Pipe): def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): for examples in util.minibatch(stream, size=batch_size): - examples = list(examples) docs = [self._get_doc(ex) for ex in examples] tag_ids, tokvecs = self.predict(docs) self.set_annotations(docs, tag_ids, tensors=tokvecs) if as_example: - examples = [] + annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - examples.append(ex) - yield from examples + annotated_examples.append(ex) + yield from annotated_examples else: yield from docs @@ 
-1000,17 +998,16 @@ class TextCategorizer(Pipe): def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): for examples in util.minibatch(stream, size=batch_size): - examples = list(examples) docs = [self._get_doc(ex) for ex in examples] scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) if as_example: - examples = [] + annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - examples.append(ex) - yield from examples + annotated_examples.append(ex) + yield from annotated_examples else: yield from docs @@ -1333,17 +1330,16 @@ class EntityLinker(Pipe): def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): for examples in util.minibatch(stream, size=batch_size): - examples = list(examples) docs = [self._get_doc(ex) for ex in examples] kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) if as_example: - examples = [] + annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - examples.append(ex) - yield from examples + annotated_examples.append(ex) + yield from annotated_examples else: yield from docs diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 8fec87c50..073851d8a 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -227,7 +227,8 @@ cdef class Parser: self.set_annotations([doc], states, tensors=None) return doc - def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None): + def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None, + as_example=False): """Process a stream of documents. stream: The sequence of documents to process. 
@@ -240,14 +241,21 @@ cdef class Parser: cdef Doc doc for batch in util.minibatch(docs, size=batch_size): batch_in_order = list(batch) - by_length = sorted(batch_in_order, key=lambda doc: len(doc)) + docs = [self._get_doc(ex) for ex in batch_in_order] + by_length = sorted(docs, key=lambda doc: len(doc)) for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): subbatch = list(subbatch) parse_states = self.predict(subbatch, beam_width=beam_width, beam_density=beam_density) self.set_annotations(subbatch, parse_states, tensors=None) - for doc in batch_in_order: - yield doc + if as_example: + annotated_examples = [] + for ex, doc in zip(batch_in_order, docs): + ex.doc = doc + annotated_examples.append(ex) + yield from annotated_examples + else: + yield from batch_in_order def require_model(self): """Raise an error if the component's model is not initialized.""" @@ -635,6 +643,12 @@ cdef class Parser: self.cfg.update(cfg) return sgd + def _get_doc(self, example): + """ Use this method if the `example` can be both a Doc or an Example """ + if isinstance(example, Doc): + return example + return example.doc + def to_disk(self, path, exclude=tuple(), **kwargs): serializers = { 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index c1bdfcc4d..b43eb3431 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,16 +1,40 @@ # coding: utf-8 from __future__ import unicode_literals +import spacy +from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo from spacy.gold import GoldCorpus, docs_to_json, align from spacy.lang.en import English +from spacy.syntax.nonproj import is_nonproj_tree from spacy.tokens import Doc from spacy.util import compounding, minibatch from .util import make_tempdir import pytest 
import srsly +@pytest.fixture +def doc(): + text = "Sarah's sister flew to Silicon Valley via London." + tags = ['NNP', 'POS', 'NN', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'] + # head of '.' is intentionally nonprojective for testing + heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] + deps = ['poss', 'case', 'nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'] + biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] + cats = {"TRAVEL": 1.0, "BAKING": 0.0} + nlp = English() + doc = nlp(text) + for i in range(len(tags)): + doc[i].tag_ = tags[i] + doc[i].dep_ = deps[i] + doc[i].head = doc[heads[i]] + doc.ents = spans_from_biluo_tags(doc, biluo_tags) + doc.cats = cats + doc.is_tagged = True + doc.is_parsed = True + return doc + def test_gold_biluo_U(en_vocab): words = ["I", "flew", "to", "London", "."] @@ -98,23 +122,14 @@ def test_iob_to_biluo(): iob_to_biluo(bad_iob) -def test_roundtrip_docs_to_json(): - text = "I flew to Silicon Valley via London." - tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] - heads = [1, 1, 1, 4, 2, 1, 5, 1] - deps = ["nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] - biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] - cats = {"TRAVEL": 1.0, "BAKING": 0.0} +def test_roundtrip_docs_to_json(doc): nlp = English() - doc = nlp(text) - for i in range(len(tags)): - doc[i].tag_ = tags[i] - doc[i].dep_ = deps[i] - doc[i].head = doc[heads[i]] - doc.ents = spans_from_biluo_tags(doc, biluo_tags) - doc.cats = cats - doc.is_tagged = True - doc.is_parsed = True + text = doc.text + tags = [t.tag_ for t in doc] + deps = [t.dep_ for t in doc] + heads = [t.head.i for t in doc] + biluo_tags = iob_to_biluo([t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]) + cats = doc.cats # roundtrip to JSON with make_tempdir() as tmpdir: @@ -122,7 +137,7 @@ def test_roundtrip_docs_to_json(): srsly.write_json(json_file, [docs_to_json(doc)]) goldcorpus = 
GoldCorpus(train=str(json_file), dev=str(json_file)) - reloaded_example = next(goldcorpus.train_dataset(nlp)) + reloaded_example = next(goldcorpus.dev_dataset(nlp)) goldparse = reloaded_example.gold assert len(doc) == goldcorpus.count_train() @@ -142,7 +157,7 @@ def test_roundtrip_docs_to_json(): srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - reloaded_example = next(goldcorpus.train_dataset(nlp)) + reloaded_example = next(goldcorpus.dev_dataset(nlp)) goldparse = reloaded_example.gold assert len(doc) == goldcorpus.count_train() @@ -166,7 +181,7 @@ def test_roundtrip_docs_to_json(): srsly.write_jsonl(jsonl_file, goldcorpus.train_examples) goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - reloaded_example = next(goldcorpus.train_dataset(nlp)) + reloaded_example = next(goldcorpus.dev_dataset(nlp)) goldparse = reloaded_example.gold assert len(doc) == goldcorpus.count_train() @@ -181,6 +196,83 @@ def test_roundtrip_docs_to_json(): assert cats["BAKING"] == goldparse.cats["BAKING"] +def test_projective_train_vs_nonprojective_dev(doc): + nlp = English() + text = doc.text + deps = [t.dep_ for t in doc] + heads = [t.head.i for t in doc] + + with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "test.jsonl" + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + train_goldparse = train_reloaded_example.gold + + dev_reloaded_example = next(goldcorpus.dev_dataset(nlp)) + dev_goldparse = dev_reloaded_example.gold + + assert is_nonproj_tree([t.head.i for t in doc]) is True + assert is_nonproj_tree(train_goldparse.heads) is False + assert heads[:-1] == train_goldparse.heads[:-1] + assert heads[-1] != train_goldparse.heads[-1] + assert deps[:-1] == train_goldparse.labels[:-1] + assert deps[-1] != train_goldparse.labels[-1] + + assert heads == 
dev_goldparse.heads + assert deps == dev_goldparse.labels + + +def test_ignore_misaligned(doc): + nlp = English() + text = doc.text + deps = [t.dep_ for t in doc] + heads = [t.head.i for t in doc] + + use_new_align = spacy.gold.USE_NEW_ALIGN + + spacy.gold.USE_NEW_ALIGN = False + with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "test.jsonl" + data = [docs_to_json(doc)] + data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, data) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + + spacy.gold.USE_NEW_ALIGN = True + with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "test.jsonl" + data = [docs_to_json(doc)] + data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, data) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + with pytest.raises(AlignmentError): + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + + with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "test.jsonl" + data = [docs_to_json(doc)] + data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, data) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + # doesn't raise an AlignmentError, but there is nothing to iterate over + # because the only example can't be aligned + train_reloaded_example = list(goldcorpus.train_dataset(nlp, + ignore_misaligned=True)) + assert len(train_reloaded_example) == 0 + + spacy.gold.USE_NEW_ALIGN = use_new_align + + # xfail while we have backwards-compatible alignment @pytest.mark.xfail @pytest.mark.parametrize( From 392c4880d9b52c045990d9916a7ed8081573b80b Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 25 Nov 2019 16:03:28 +0100 Subject: [PATCH 006/187] Restructure Example with merged sents as default (#4632) * Switch to 
train_dataset() function in train CLI * Fixes for pipe() methods in pipeline components * Don't clobber `examples` variable with `as_example` in pipe() methods * Remove unnecessary traversals of `examples` * Update Parser.pipe() for Examples * Add `as_example` kwarg to `pipe()` with implementation to return `Example`s * Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from `Pipe`) * Fixes to Example implementation in spacy.gold * Move `make_projective` from an attribute of Example to an argument of `Example.get_gold_parses()` * Heads of 0 are not treated as unset * Unset heads are set to self rather than `None` (which causes problems while projectivizing) * Check for `Doc` (not just not `None`) when creating GoldParses for pre-merged example * Don't clobber `examples` variable in `iter_gold_docs()` * Add/modify gold tests for handling projectivity * In JSON roundtrip compare results from `dev_dataset` rather than `train_dataset` to avoid projectivization (and other potential modifications) * Add test for projective train vs. nonprojective dev versions of the same `Doc` * Handle ignore_misaligned as arg rather than attr Move `ignore_misaligned` from an attribute of `Example` to an argument to `Example.get_gold_parses()`, which makes it parallel to `make_projective`. Add test with old and new align that checks whether `ignore_misaligned` errors are raised as expected (only for new align). * Remove unused attrs from gold.pxd Remove `ignore_misaligned` and `make_projective` from `gold.pxd` * Restructure Example with merged sents as default An `Example` now includes a single `TokenAnnotation` that includes all the information from one `Doc` (=JSON `paragraph`). If required, the individual sentences can be returned as a list of examples with `Example.split_sents()` with no raw text available.
* Input/output a single `Example.token_annotation` * Add `sent_starts` to `TokenAnnotation` to handle sentence boundaries * Replace `Example.merge_sents()` with `Example.split_sents()` * Modify components to use a single `Example.token_annotation` * Pipeline components * conllu2json converter * Rework/rename `add_token_annotation()` and `add_doc_annotation()` to `set_token_annotation()` and `set_doc_annotation()`, functions that set rather than appending/extending. * Rename `morphology` to `morphs` in `TokenAnnotation` and `GoldParse` * Add getters to `TokenAnnotation` to supply default values when a given attribute is not available * `Example.get_gold_parses()` in `spacy.gold._make_golds()` is only applied on single examples, so the `GoldParse` is saved in and returned with the provided `Example` rather than creating a new `Example` with no other internal annotation * Update tests for API changes and `merge_sents()` vs. `split_sents()` * Refer to Example.goldparse in iter_gold_docs() Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold` because a `None` `GoldParse` is generated with ignore_misaligned and generating it on-the-fly can raise an unwanted AlignmentError * Fix make_orth_variants() Fix bug in make_orth_variants() related to conversion from multiple to one TokenAnnotation per Example.
* Add basic test for make_orth_variants() * Replace try/except with conditionals * Replace default morph value with set --- spacy/cli/converters/conllu2json.py | 23 +- spacy/gold.pxd | 7 +- spacy/gold.pyx | 433 ++++++++++-------- spacy/language.py | 5 +- spacy/pipeline/pipes.pyx | 20 +- spacy/scorer.py | 2 +- spacy/syntax/arc_eager.pyx | 26 +- spacy/syntax/ner.pyx | 11 +- spacy/syntax/nonproj.pyx | 66 ++- spacy/tests/regression/test_issue1501-2000.py | 2 +- spacy/tests/test_gold.py | 107 +++-- spacy/tests/test_scorer.py | 4 +- 12 files changed, 376 insertions(+), 330 deletions(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 43216c943..ff720f4bf 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -24,17 +24,16 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): checked_for_ner = False has_ner_tags = False for i, example in enumerate(conll_data): - for token_annotation in example.token_annotations: - if not checked_for_ner: - has_ner_tags = is_ner(token_annotation.entities[0]) - checked_for_ner = True - sentences.append(generate_sentence(token_annotation, has_ner_tags)) - # Real-sized documents could be extracted using the comments on the - # conluu document - if len(sentences) % n_sents == 0: - doc = create_doc(sentences, i) - docs.append(doc) - sentences = [] + if not checked_for_ner: + has_ner_tags = is_ner(example.token_annotation.entities[0]) + checked_for_ner = True + sentences.append(generate_sentence(example.token_annotation, has_ner_tags)) + # Real-sized documents could be extracted using the comments on the + # conllu document + if len(sentences) % n_sents == 0: + doc = create_doc(sentences, i) + docs.append(doc) + sentences = [] return docs @@ -84,7 +83,7 @@ def read_conllx(input_data, use_morphology=False, n=0): print(line) raise example = Example(doc=None) - example.add_token_annotation(ids=ids, words=words, tags=tags, + 
example.set_token_annotation(ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=ents) yield example i += 1 diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 8527ba2b6..247ff8aa1 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -25,7 +25,7 @@ cdef class GoldParse: cdef public int loss cdef public list words cdef public list tags - cdef public list morphology + cdef public list morphs cdef public list heads cdef public list labels cdef public dict orths @@ -45,7 +45,8 @@ cdef class TokenAnnotation: cdef public list heads cdef public list deps cdef public list entities - cdef public list morphology + cdef public list morphs + cdef public list sent_starts cdef public list brackets @@ -56,7 +57,7 @@ cdef class DocAnnotation: cdef class Example: cdef public object doc - cdef public list token_annotations + cdef public TokenAnnotation token_annotation cdef public DocAnnotation doc_annotation cdef public object goldparse diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 39e867a33..0659ddd02 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -215,7 +215,7 @@ class GoldCorpus(object): ex_dict = example.to_dict() text = example.text srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict)) - n += len(example.token_annotations) + n += 1 if limit and n >= limit: break @@ -271,7 +271,7 @@ class GoldCorpus(object): raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported)) for example in examples: yield example - i += len(example.token_annotations) + i += 1 if limit and i >= limit: return @@ -286,15 +286,14 @@ class GoldCorpus(object): yield from self.read_examples(locs, limit=self.limit) def count_train(self): - # TODO: should this count words or sentences ? 
+ """Returns count of words in train examples""" n = 0 i = 0 for example in self.train_examples: - for token_annotation in example.token_annotations: - n += len(token_annotation.words) - if self.limit and i >= self.limit: - break - i += 1 + n += len(example.token_annotation.words) + if self.limit and i >= self.limit: + break + i += 1 return n def train_dataset(self, nlp, gold_preproc=False, max_length=None, @@ -328,18 +327,27 @@ class GoldCorpus(object): def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None, noise_level=0.0, orth_variant_level=0.0, make_projective=False, ignore_misaligned=False): - """ Setting gold_preproc will result in creating a doc per 'sentence' """ + """ Setting gold_preproc will result in creating a doc per sentence """ for example in examples: if gold_preproc: example.doc = None + split_examples = example.split_sents() + example_golds = [] + for split_example in split_examples: + split_example_docs = cls._make_docs(nlp, split_example, + gold_preproc, noise_level=noise_level, + orth_variant_level=orth_variant_level) + split_example_golds = cls._make_golds(split_example_docs, + vocab=nlp.vocab, make_projective=make_projective, + ignore_misaligned=ignore_misaligned) + example_golds.extend(split_example_golds) else: - example = example.merge_sents() - example_docs = cls._make_docs(nlp, example, - gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned) + example_docs = cls._make_docs(nlp, example, + gold_preproc, noise_level=noise_level, + orth_variant_level=orth_variant_level) + example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, + make_projective=make_projective, + ignore_misaligned=ignore_misaligned) for ex in example_golds: if ex.goldparse is not None: if (not max_length) or len(ex.doc) < max_length: @@ -353,35 +361,28 @@ class GoldCorpus(object): var_text 
= add_noise(var_example.text, noise_level) var_doc = nlp.make_doc(var_text) var_example.doc = var_doc - return [var_example] else: - doc_examples = [] - for token_annotation in var_example.token_annotations: - t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level)) - doc_example = Example(doc_annotation=example.doc_annotation, - token_annotations=[token_annotation], - doc=t_doc) - doc_examples.append(doc_example) - return doc_examples + var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level)) + var_example.doc = var_doc + return [var_example] @classmethod def _make_golds(cls, examples, vocab=None, make_projective=False, ignore_misaligned=False): - gold_examples = [] for example in examples: gold_parses = example.get_gold_parses(vocab=vocab, make_projective=make_projective, ignore_misaligned=ignore_misaligned) - for (doc, gold) in gold_parses: - ex = Example(doc=doc) - ex.goldparse = gold - gold_examples.append(ex) - return gold_examples + assert len(gold_parses) == 1 + assert gold_parses[0][0] == example.doc + example.goldparse = gold_parses[0][1] + return examples + def make_orth_variants(nlp, example, orth_variant_level=0.0): if random.random() >= orth_variant_level: return example - if not example.token_annotations: + if not example.token_annotation: return example raw = example.text if random.random() >= 0.5: @@ -392,46 +393,46 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): ndpv = nlp.Defaults.paired_orth_variants # modify words in paragraph_tuples variant_example = Example(doc=raw) - for token_annotation in example.token_annotations: - words = token_annotation.words - tags = token_annotation.tags - if not words or not tags: - # add the unmodified annotation - token_dict = token_annotation.to_dict() - variant_example.add_token_annotation(**token_dict) - else: - if lower: - words = [w.lower() for w in words] - # single variants - punct_choices = [random.choice(x["variants"]) for x in ndsv] 
- for word_idx in range(len(words)): - for punct_idx in range(len(ndsv)): - if tags[word_idx] in ndsv[punct_idx]["tags"] \ - and words[word_idx] in ndsv[punct_idx]["variants"]: - words[word_idx] = punct_choices[punct_idx] - # paired variants - punct_choices = [random.choice(x["variants"]) for x in ndpv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] \ - and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): - # backup option: random left vs. right from pair - pair_idx = random.choice([0, 1]) - # best option: rely on paired POS tags like `` / '' - if len(ndpv[punct_idx]["tags"]) == 2: - pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) - # next best option: rely on position in variants - # (may not be unambiguous, so order of variants matters) - else: - for pair in ndpv[punct_idx]["variants"]: - if words[word_idx] in pair: - pair_idx = pair.index(words[word_idx]) - words[word_idx] = punct_choices[punct_idx][pair_idx] + token_annotation = example.token_annotation + words = token_annotation.words + tags = token_annotation.tags + if not words or not tags: + # add the unmodified annotation + token_dict = token_annotation.to_dict() + variant_example.set_token_annotation(**token_dict) + else: + if lower: + words = [w.lower() for w in words] + # single variants + punct_choices = [random.choice(x["variants"]) for x in ndsv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndsv)): + if tags[word_idx] in ndsv[punct_idx]["tags"] \ + and words[word_idx] in ndsv[punct_idx]["variants"]: + words[word_idx] = punct_choices[punct_idx] + # paired variants + punct_choices = [random.choice(x["variants"]) for x in ndpv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndpv)): + if tags[word_idx] in ndpv[punct_idx]["tags"] \ + and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + # backup option: random left vs. 
right from pair + pair_idx = random.choice([0, 1]) + # best option: rely on paired POS tags like `` / '' + if len(ndpv[punct_idx]["tags"]) == 2: + pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) + # next best option: rely on position in variants + # (may not be unambiguous, so order of variants matters) + else: + for pair in ndpv[punct_idx]["variants"]: + if words[word_idx] in pair: + pair_idx = pair.index(words[word_idx]) + words[word_idx] = punct_choices[punct_idx][pair_idx] - token_dict = token_annotation.to_dict() - token_dict["words"] = words - token_dict["tags"] = tags - variant_example.add_token_annotation(**token_dict) + token_dict = token_annotation.to_dict() + token_dict["words"] = words + token_dict["tags"] = tags + variant_example.set_token_annotation(**token_dict) # modify raw to match variant_paragraph_tuples if raw is not None: variants = [] @@ -449,30 +450,29 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): variant_raw += raw[raw_idx] raw_idx += 1 - for token_annotation in variant_example.token_annotations: - for word in token_annotation.words: - match_found = False - # add identical word - if word not in variants and raw[raw_idx:].startswith(word): - variant_raw += word - raw_idx += len(word) - match_found = True - # add variant word - else: - for variant in variants: - if not match_found and \ - raw[raw_idx:].startswith(variant): - raw_idx += len(variant) - variant_raw += word - match_found = True - # something went wrong, abort - # (add a warning message?) 
- if not match_found: - return example - # add following whitespace - while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): - variant_raw += raw[raw_idx] - raw_idx += 1 + for word in variant_example.token_annotation.words: + match_found = False + # add identical word + if word not in variants and raw[raw_idx:].startswith(word): + variant_raw += word + raw_idx += len(word) + match_found = True + # add variant word + else: + for variant in variants: + if not match_found and \ + raw[raw_idx:].startswith(variant): + raw_idx += len(variant) + variant_raw += word + match_found = True + # something went wrong, abort + # (add a warning message?) + if not match_found: + return example + # add following whitespace + while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): + variant_raw += raw[raw_idx] + raw_idx += 1 variant_example.doc = variant_raw return variant_example return variant_example @@ -521,30 +521,43 @@ def json_to_examples(doc): paragraphs = [] for paragraph in doc["paragraphs"]: example = Example(doc=paragraph.get("raw", None)) + words = [] + ids = [] + tags = [] + heads = [] + labels = [] + ner = [] + morphs = [] + sent_starts = [] + brackets = [] for sent in paragraph["sentences"]: - words = [] - ids = [] - tags = [] - heads = [] - labels = [] - ner = [] + sent_start_i = len(words) for i, token in enumerate(sent["tokens"]): words.append(token["orth"]) - ids.append(i) + ids.append(token.get('id', sent_start_i + i)) tags.append(token.get('tag', "-")) - heads.append(token.get("head", 0) + i) + heads.append(token.get("head", 0) + sent_start_i + i) labels.append(token.get("dep", "")) # Ensure ROOT label is case-insensitive if labels[-1].lower() == "root": labels[-1] = "ROOT" ner.append(token.get("ner", "-")) - example.add_token_annotation(ids=ids, words=words, tags=tags, - heads=heads, deps=labels, entities=ner, - brackets=sent.get("brackets", [])) + morphs.append(token.get("morph", {})) + if i == 0: + sent_starts.append(True) + else: + 
sent_starts.append(False) + if "brackets" in sent: + brackets.extend((b["first"] + sent_start_i, + b["last"] + sent_start_i, b["label"]) + for b in sent["brackets"]) cats = {} for cat in paragraph.get("cats", {}): cats[cat["label"]] = cat["value"] - example.add_doc_annotation(cats=cats) + example.set_token_annotation(ids=ids, words=words, tags=tags, + heads=heads, deps=labels, entities=ner, morphs=morphs, + sent_starts=sent_starts, brackets=brackets) + example.set_doc_annotation(cats=cats) yield example @@ -652,15 +665,16 @@ def _consume_ent(tags): cdef class TokenAnnotation: - def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None): + def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None): self.ids = ids if ids else [] self.words = words if words else [] self.tags = tags if tags else [] self.heads = heads if heads else [] self.deps = deps if deps else [] self.entities = entities if entities else [] + self.morphs = morphs if morphs else [] + self.sent_starts = sent_starts if sent_starts else [] self.brackets = brackets if brackets else [] - self.morphology = morphology if morphology else [] @classmethod def from_dict(cls, token_dict): @@ -670,7 +684,8 @@ cdef class TokenAnnotation: heads=token_dict.get("heads", None), deps=token_dict.get("deps", None), entities=token_dict.get("entities", None), - morphology=token_dict.get("morphology", None), + morphs=token_dict.get("morphs", None), + sent_starts=token_dict.get("sent_starts", None), brackets=token_dict.get("brackets", None)) def to_dict(self): @@ -680,9 +695,34 @@ cdef class TokenAnnotation: "heads": self.heads, "deps": self.deps, "entities": self.entities, - "morphology": self.morphology, + "morphs": self.morphs, + "sent_starts": self.sent_starts, "brackets": self.brackets} + def get_id(self, i): + return self.ids[i] if i < len(self.ids) else i + + def get_word(self, 
i): + return self.words[i] if i < len(self.words) else "" + + def get_tag(self, i): + return self.tags[i] if i < len(self.tags) else "-" + + def get_head(self, i): + return self.heads[i] if i < len(self.heads) else i + + def get_dep(self, i): + return self.deps[i] if i < len(self.deps) else "" + + def get_entity(self, i): + return self.entities[i] if i < len(self.entities) else "-" + + def get_morph(self, i): + return self.morphs[i] if i < len(self.morphs) else set() + + def get_sent_start(self, i): + return self.sent_starts[i] if i < len(self.sent_starts) else None + cdef class DocAnnotation: def __init__(self, cats=None, links=None): @@ -698,33 +738,33 @@ cdef class DocAnnotation: cdef class Example: - def __init__(self, doc_annotation=None, token_annotations=None, doc=None, + def __init__(self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None): """ Doc can either be text, or an actual Doc """ self.doc = doc self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() - self.token_annotations = token_annotations if token_annotations else [] + self.token_annotation = token_annotation if token_annotation else TokenAnnotation() self.goldparse = goldparse @classmethod def from_gold(cls, goldparse, doc=None): doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links) token_annotation = goldparse.get_token_annotation() - return cls(doc_annotation, [token_annotation], doc) + return cls(doc_annotation, token_annotation, doc) @classmethod def from_dict(cls, example_dict, doc=None): - token_dicts = example_dict["token_annotations"] - token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts] + token_dict = example_dict["token_annotation"] + token_annotation = TokenAnnotation.from_dict(token_dict) doc_dict = example_dict["doc_annotation"] doc_annotation = DocAnnotation.from_dict(doc_dict) - return cls(doc_annotation, token_annotations, doc) + return cls(doc_annotation, token_annotation, doc) def to_dict(self): 
""" Note that this method does NOT export the doc, only the annotations ! """ - token_dicts = [t.to_dict() for t in self.token_annotations] + token_dict = self.token_annotation.to_dict() doc_dict = self.doc_annotation.to_dict() - return {"token_annotations": token_dicts, "doc_annotation": doc_dict} + return {"token_annotation": token_dict, "doc_annotation": doc_dict} @property def text(self): @@ -737,96 +777,108 @@ cdef class Example: @property def gold(self): if self.goldparse is None: - doc, gold = self.get_gold_parses(merge=True)[0] + doc, gold = self.get_gold_parses()[0] self.goldparse = gold return self.goldparse - def add_token_annotation(self, ids=None, words=None, tags=None, heads=None, - deps=None, entities=None, morphology=None, brackets=None): - t = TokenAnnotation(ids=ids, words=words, tags=tags, + def set_token_annotation(self, ids=None, words=None, tags=None, heads=None, + deps=None, entities=None, morphs=None, + sent_starts=None, brackets=None): + self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=entities, - morphology=morphology, brackets=brackets) - self.token_annotations.append(t) + morphs=morphs, sent_starts=sent_starts, + brackets=brackets) - def add_doc_annotation(self, cats=None, links=None): + def set_doc_annotation(self, cats=None, links=None): if cats: - self.doc_annotation.cats.update(cats) + self.doc_annotation.cats = cats if links: - self.doc_annotation.links.update(links) + self.doc_annotation.links = links - def merge_sents(self): - """ Merge the list of token annotations into one object and return this new object """ - m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation) - m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], [] - m_brackets = [] - i = 0 - for t in self.token_annotations: - m_ids.extend(id_ + i for id_ in t.ids) - m_words.extend(t.words) - m_tags.extend(t.tags) - m_heads.extend(head + i if head is not None and head >= 
0 else head_i + i for head_i, head in enumerate(t.heads)) - m_deps.extend(t.deps) - m_ents.extend(t.entities) - m_morph.extend(t.morphology) - m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) - for b in t.brackets) - i += len(t.ids) - m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags, - heads=m_heads, deps=m_deps, entities=m_ents, - morphology=m_morph, brackets=m_brackets) - return m_example + def split_sents(self): + """ Split the token annotations into multiple Examples based on + sent_starts and return a list of the new Examples""" + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_heads = [], [], [], [] + s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], [] + s_brackets = [] + sent_start_i = 0 + t = self.token_annotation + split_examples = [] + for i in range(len(t.words)): + if i > 0 and t.sent_starts[i] == True: + s_example.set_token_annotation(ids=s_ids, + words=s_words, tags=s_tags, heads=s_heads, deps=s_deps, + entities=s_ents, morphs=s_morphs, + sent_starts=s_sent_starts, brackets=s_brackets) + split_examples.append(s_example) + s_example = Example(doc=None, doc_annotation=self.doc_annotation) + s_ids, s_words, s_tags, s_heads = [], [], [], [] + s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], [] + s_brackets = [] + sent_start_i = i + s_ids.append(t.get_id(i)) + s_words.append(t.get_word(i)) + s_tags.append(t.get_tag(i)) + s_heads.append(t.get_head(i) - sent_start_i) + s_deps.append(t.get_dep(i)) + s_ents.append(t.get_entity(i)) + s_morphs.append(t.get_morph(i)) + s_sent_starts.append(t.get_sent_start(i)) + s_brackets.extend((b[0] - sent_start_i, + b[1] - sent_start_i, b[2]) + for b in t.brackets if b[0] == i) + i += 1 + s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, + heads=s_heads, deps=s_deps, entities=s_ents, + morphs=s_morphs, sent_starts=s_sent_starts, + brackets=s_brackets) + split_examples.append(s_example) + return split_examples - def 
get_gold_parses(self, merge=False, vocab=None, make_projective=False, + def get_gold_parses(self, merge=True, vocab=None, make_projective=False, ignore_misaligned=False): """Return a list of (doc, GoldParse) objects. - If merge is set to True, add all Token annotations to one big list.""" + If merge is set to True, keep all Token annotations as one big list.""" d = self.doc_annotation - # merging different sentences + # merge == do not modify Example if merge: - merged_example = self.merge_sents() - assert(len(merged_example.token_annotations)) == 1 - t = merged_example.token_annotations[0] - m_doc = merged_example.doc - if not m_doc: + t = self.token_annotation + doc = self.doc + if not self.doc: if not vocab: raise ValueError(Errors.E998) - m_doc = Doc(vocab, words=t.words) + doc = Doc(vocab, words=t.words) try: - gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective) + gp = GoldParse.from_annotation(doc, d, t, + make_projective=make_projective) except AlignmentError: if ignore_misaligned: gp = None else: raise - return [(self.doc, gp)] - # we only have one sentence and an appropriate doc - elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc): - t = self.token_annotations[0] - try: - gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - return [(self.doc, gp)] - # not merging: one GoldParse per 'sentence', defining docs with the words from each sentence + return [(doc, gp)] + # not merging: one GoldParse per sentence, defining docs with the words + # from each sentence else: parses = [] - for t in self.token_annotations: + split_examples = self.split_sents() + for split_example in split_examples: if not vocab: raise ValueError(Errors.E998) - t_doc = Doc(vocab, words=t.words) + split_doc = Doc(vocab, words=split_example.token_annotation.words) try: - gp = GoldParse.from_annotation(t_doc, d, t, 
make_projective=make_projective) + gp = GoldParse.from_annotation(split_doc, d, + split_example.token_annotation, + make_projective=make_projective) except AlignmentError: if ignore_misaligned: gp = None else: raise if gp is not None: - parses.append((t_doc, gp)) + parses.append((split_doc, gp)) return parses @classmethod @@ -881,9 +933,14 @@ cdef class GoldParse: """ @classmethod def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): - return cls(doc, words=token_annotation.words, tags=token_annotation.tags, - heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities, - morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links, + return cls(doc, words=token_annotation.words, + tags=token_annotation.tags, + heads=token_annotation.heads, + deps=token_annotation.deps, + entities=token_annotation.entities, + morphs=token_annotation.morphs, + cats=doc_annotation.cats, + links=doc_annotation.links, make_projective=make_projective) def get_token_annotation(self): @@ -893,9 +950,9 @@ cdef class GoldParse: return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, heads=self.heads, deps=self.labels, entities=self.ner, - morphology=self.morphology) + morphs=self.morphs) - def __init__(self, doc, words=None, tags=None, morphology=None, + def __init__(self, doc, words=None, tags=None, morphs=None, heads=None, deps=None, entities=None, make_projective=False, cats=None, links=None): """Create a GoldParse. The fields will not be initialized if len(doc) is zero. 
@@ -944,8 +1001,8 @@ cdef class GoldParse: heads = [None for _ in words] if not deps: deps = [None for _ in words] - if not morphology: - morphology = [None for _ in words] + if not morphs: + morphs = [None for _ in words] if entities is None: entities = ["-" for _ in words] elif len(entities) == 0: @@ -971,7 +1028,7 @@ cdef class GoldParse: self.heads = [None] * len(doc) self.labels = [None] * len(doc) self.ner = [None] * len(doc) - self.morphology = [None] * len(doc) + self.morphs = [None] * len(doc) # This needs to be done before we align the words if make_projective and heads is not None and deps is not None: @@ -990,7 +1047,7 @@ cdef class GoldParse: self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags, - heads=heads, deps=deps, entities=entities, morphology=morphology, + heads=heads, deps=deps, entities=entities, morphs=morphs, brackets=[]) for i, gold_i in enumerate(self.cand_to_gold): @@ -1000,12 +1057,12 @@ cdef class GoldParse: self.heads[i] = None self.labels[i] = None self.ner[i] = None - self.morphology[i] = set() + self.morphs[i] = set() if gold_i is None: if i in i2j_multi: self.words[i] = words[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]] - self.morphology[i] = morphology[i2j_multi[i]] + self.morphs[i] = morphs[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) is_first = i2j_multi[i] != i2j_multi.get(i-1) # Set next word in multi-token span as head, until last @@ -1044,7 +1101,7 @@ cdef class GoldParse: else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] - self.morphology[i] = morphology[gold_i] + self.morphs[i] = morphs[gold_i] if heads[gold_i] is None: self.heads[i] = None else: diff --git a/spacy/language.py b/spacy/language.py index c84f597d9..8ec602ed7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -574,9 +574,8 @@ class Language(object): # Populate vocab else: for example in get_examples(): - for token_annotation in 
example.token_annotations: - for word in token_annotation.words: - _ = self.vocab[word] # noqa: F841 + for word in example.token_annotation.words: + _ = self.vocab[word] # noqa: F841 if cfg.get("device", -1) >= 0: util.use_gpu(cfg["device"]) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 04a769b27..56a00e33b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -565,12 +565,11 @@ class Tagger(Pipe): orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for example in get_examples(): - for token_annotation in example.token_annotations: - for tag in token_annotation.tags: - if tag in orig_tag_map: - new_tag_map[tag] = orig_tag_map[tag] - else: - new_tag_map[tag] = {POS: X} + for tag in example.token_annotation.tags: + if tag in orig_tag_map: + new_tag_map[tag] = orig_tag_map[tag] + else: + new_tag_map[tag] = {POS: X} cdef Vocab vocab = self.vocab if new_tag_map: vocab.morphology = Morphology(vocab.strings, new_tag_map, @@ -750,11 +749,10 @@ class MultitaskObjective(Tagger): gold_examples = nonproj.preprocess_training_data(get_examples()) # for raw_text, doc_annot in gold_tuples: for example in gold_examples: - for token_annotation in example.token_annotations: - for i in range(len(token_annotation.ids)): - label = self.make_label(i, token_annotation) - if label is not None and label not in self.labels: - self.labels[label] = len(self.labels) + for i in range(len(example.token_annotation.ids)): + label = self.make_label(i, example.token_annotation) + if label is not None and label not in self.labels: + self.labels[label] = len(self.labels) if self.model is True: token_vector_width = util.env_opt("token_vector_width") self.model = self.Model(len(self.labels), tok2vec=tok2vec) diff --git a/spacy/scorer.py b/spacy/scorer.py index 25c6935f3..723259acd 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -237,7 +237,7 @@ class Scorer(object): if len(doc) != len(gold): doc_annotation = 
DocAnnotation(cats=gold.cats) token_annotation = gold.orig - gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation]) + gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation) orig = gold.orig gold_deps = set() gold_deps_per_dep = {} diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 0a99609a8..d358c1277 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -342,19 +342,19 @@ cdef class ArcEager(TransitionSystem): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 for example in kwargs.get('gold_parses', []): - for token_annotation in example.token_annotations: - heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps) - for child, head, label in zip(token_annotation.ids, heads, labels): - if label.upper() == 'ROOT' : - label = 'ROOT' - if head == child: - actions[BREAK][label] += 1 - elif head < child: - actions[RIGHT][label] += 1 - actions[REDUCE][''] += 1 - elif head > child: - actions[LEFT][label] += 1 - actions[SHIFT][''] += 1 + heads, labels = nonproj.projectivize(example.token_annotation.heads, + example.token_annotation.deps) + for child, head, label in zip(example.token_annotation.ids, heads, labels): + if label.upper() == 'ROOT' : + label = 'ROOT' + if head == child: + actions[BREAK][label] += 1 + elif head < child: + actions[RIGHT][label] += 1 + actions[REDUCE][''] += 1 + elif head > child: + actions[LEFT][label] += 1 + actions[SHIFT][''] += 1 if min_freq is not None: for action, label_freqs in actions.items(): for label, freq in list(label_freqs.items()): diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index d791534ee..7467aa342 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -73,12 +73,11 @@ cdef class BiluoPushDown(TransitionSystem): actions[action][entity_type] = 1 moves = ('M', 'B', 'I', 'L', 'U') for example in kwargs.get('gold_parses', []): - for token_annotation in example.token_annotations: - for i, ner_tag in 
enumerate(token_annotation.entities): - if ner_tag != 'O' and ner_tag != '-': - _, label = ner_tag.split('-', 1) - for action in (BEGIN, IN, LAST, UNIT): - actions[action][label] += 1 + for i, ner_tag in enumerate(example.token_annotation.entities): + if ner_tag != 'O' and ner_tag != '-': + _, label = ner_tag.split('-', 1) + for action in (BEGIN, IN, LAST, UNIT): + actions[action][label] += 1 return actions @property diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index c7ed25948..2ec6b61ac 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -81,15 +81,15 @@ def is_decorated(label): def count_decorated_labels(gold_data): freqs = {} for example in gold_data: - for token_annotation in example.token_annotations: - proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps) - # set the label to ROOT for each root dependent - deco_deps = ['ROOT' if head == i else deco_deps[i] - for i, head in enumerate(proj_heads)] - # count label frequencies - for label in deco_deps: - if is_decorated(label): - freqs[label] = freqs.get(label, 0) + 1 + proj_heads, deco_deps = projectivize(example.token_annotation.heads, + example.token_annotation.deps) + # set the label to ROOT for each root dependent + deco_deps = ['ROOT' if head == i else deco_deps[i] + for i, head in enumerate(proj_heads)] + # count label frequencies + for label in deco_deps: + if is_decorated(label): + freqs[label] = freqs.get(label, 0) + 1 return freqs @@ -98,21 +98,20 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30): freqs = {} for example in gold_data: new_example = Example(doc=example.doc) - for token_annotation in example.token_annotations: - proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps) - # set the label to ROOT for each root dependent - deco_deps = ['ROOT' if head == i else deco_deps[i] - for i, head in enumerate(proj_heads)] - # count label frequencies - if label_freq_cutoff > 0: - for label in 
deco_deps: - if is_decorated(label): - freqs[label] = freqs.get(label, 0) + 1 - # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ? - proj_token_dict = token_annotation.to_dict() - proj_token_dict["heads"] = proj_heads - proj_token_dict["deps"] = deco_deps - new_example.add_token_annotation(**proj_token_dict) + proj_heads, deco_deps = projectivize(example.token_annotation.heads, + example.token_annotation.deps) + # set the label to ROOT for each root dependent + deco_deps = ['ROOT' if head == i else deco_deps[i] + for i, head in enumerate(proj_heads)] + # count label frequencies + if label_freq_cutoff > 0: + for label in deco_deps: + if is_decorated(label): + freqs[label] = freqs.get(label, 0) + 1 + proj_token_dict = example.token_annotation.to_dict() + proj_token_dict["heads"] = proj_heads + proj_token_dict["deps"] = deco_deps + new_example.set_token_annotation(**proj_token_dict) preprocessed.append(new_example) if label_freq_cutoff > 0: return _filter_labels(preprocessed, label_freq_cutoff, freqs) @@ -213,15 +212,14 @@ def _filter_labels(examples, cutoff, freqs): filtered = [] for example in examples: new_example = Example(doc=example.doc) - for token_annotation in example.token_annotations: - filtered_labels = [] - for label in token_annotation.deps: - if is_decorated(label) and freqs.get(label, 0) < cutoff: - filtered_labels.append(decompose(label)[0]) - else: - filtered_labels.append(label) - filtered_token_dict = token_annotation.to_dict() - filtered_token_dict["deps"] = filtered_labels - new_example.add_token_annotation(**filtered_token_dict) + filtered_labels = [] + for label in example.token_annotation.deps: + if is_decorated(label) and freqs.get(label, 0) < cutoff: + filtered_labels.append(decompose(label)[0]) + else: + filtered_labels.append(label) + filtered_token_dict = example.token_annotation.to_dict() + filtered_token_dict["deps"] = filtered_labels + 
new_example.set_token_annotation(**filtered_token_dict) filtered.append(new_example) return filtered diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index ace25f8cc..4b27901ad 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -273,7 +273,7 @@ def test_issue1963(en_tokenizer): def test_issue1967(label): ner = EntityRecognizer(Vocab()) example = Example(doc=None) - example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]) + example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]) ner.moves.get_actions(gold_parses=[example]) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index b43eb3431..d1255c176 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -36,6 +36,16 @@ def doc(): return doc +@pytest.fixture() +def merged_dict(): + return { + "ids": [1, 2, 3, 4, 5, 6, 7], + "words": ["Hi", "there", "everyone", "It", "is", "just", "me"], + "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"], + "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0], + } + + def test_gold_biluo_U(en_vocab): words = ["I", "flew", "to", "London", "."] spaces = [True, True, True, False, True] @@ -231,7 +241,7 @@ def test_ignore_misaligned(doc): deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] - use_new_align = spacy.gold.USE_NEW_ALIGN + saved_use_new_align = spacy.gold.USE_NEW_ALIGN spacy.gold.USE_NEW_ALIGN = False with make_tempdir() as tmpdir: @@ -270,7 +280,25 @@ def test_ignore_misaligned(doc): ignore_misaligned=True)) assert len(train_reloaded_example) == 0 - spacy.gold.USE_NEW_ALIGN = use_new_align + spacy.gold.USE_NEW_ALIGN = saved_use_new_align + + +def test_make_orth_variants(doc): + nlp = English() + text = doc.text + deps = [t.dep_ for t in doc] + heads = [t.head.i for t in doc] + + with make_tempdir() as tmpdir: + 
jsonl_file = tmpdir / "test.jsonl" + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + # due to randomness, test only that this runs with no errors for now + train_reloaded_example = next(goldcorpus.train_dataset(nlp, + orth_variant_level=0.2)) + train_goldparse = train_reloaded_example.gold # xfail while we have backwards-compatible alignment @@ -386,71 +414,38 @@ def _train(train_data): nlp.update(batch, sgd=optimizer, losses=losses) -tokens_1 = { - "ids": [1, 2, 3], - "words": ["Hi", "there", "everyone"], - "tags": ["INTJ", "ADV", "PRON"], -} - -tokens_2 = { - "ids": [1, 2, 3, 4], - "words": ["It", "is", "just", "me"], - "tags": ["PRON", "AUX", "ADV", "PRON"], -} - -text0 = "Hi there everyone It is just me" - - -def test_merge_sents(): +def test_split_sents(merged_dict): nlp = English() example = Example() - example.add_token_annotation(**tokens_1) - example.add_token_annotation(**tokens_2) + example.set_token_annotation(**merged_dict) assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2 - assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 # this shouldn't change the original object + assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 - merged_example = example.merge_sents() + split_examples = example.split_sents() + assert len(split_examples) == 2 - token_annotation_1 = example.token_annotations[0] + token_annotation_1 = split_examples[0].token_annotation assert token_annotation_1.ids == [1, 2, 3] assert token_annotation_1.words == ["Hi", "there", "everyone"] assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"] + assert token_annotation_1.sent_starts == [1, 0, 0] - token_annotation_m = merged_example.token_annotations[0] - assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7] - assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"] - assert token_annotation_m.tags == 
["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"] + token_annotation_2 = split_examples[1].token_annotation + assert token_annotation_2.ids == [4, 5, 6, 7] + assert token_annotation_2.words == ["It", "is", "just", "me"] + assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"] + assert token_annotation_2.sent_starts == [1, 0, 0, 0] -def test_tuples_to_example(): +def test_tuples_to_example(merged_dict): ex = Example() - ex.add_token_annotation(**tokens_1) - ex.add_token_annotation(**tokens_2) - ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0}) + ex.set_token_annotation(**merged_dict) + cats = {"TRAVEL": 1.0, "BAKING": 0.0} + ex.set_doc_annotation(cats=cats) ex_dict = ex.to_dict() - token_dicts = [ - { - "ids": [1, 2, 3], - "words": ["Hi", "there", "everyone"], - "tags": ["INTJ", "ADV", "PRON"], - "heads": [], - "deps": [], - "entities": [], - "morphology": [], - "brackets": [], - }, - { - "ids": [1, 2, 3, 4], - "words": ["It", "is", "just", "me"], - "tags": ["PRON", "AUX", "ADV", "PRON"], - "heads": [], - "deps": [], - "entities": [], - "morphology": [], - "brackets": [], - }, - ] - doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}} - - assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict} + assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"] + assert ex_dict["token_annotation"]["words"] == merged_dict["words"] + assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] + assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"] + assert ex_dict["doc_annotation"]["cats"] == cats diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index e8d74c405..92a607e5b 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -86,7 +86,7 @@ def test_ner_per_type(en_vocab): ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], ) ex = Example(doc=doc) - ex.add_token_annotation(entities=annot["entities"]) + 
ex.set_token_annotation(entities=annot["entities"]) scorer.score(ex) results = scorer.scores @@ -107,7 +107,7 @@ def test_ner_per_type(en_vocab): ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], ) ex = Example(doc=doc) - ex.add_token_annotation(entities=annot["entities"]) + ex.set_token_annotation(entities=annot["entities"]) scorer.score(ex) results = scorer.scores From 0c9640ced3c58bca6a6838c0b2e07c3e8b115e99 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 25 Nov 2019 23:13:26 +0100 Subject: [PATCH 007/187] Replace old gold alignment with new gold alignment (#4710) Replace old gold alignment that allowed for some noise in the alignment between raw and orth with the new simpler alignment that requires that the raw and orth strings are identical except for whitespace and capitalization. * Replace old alignment with new alignment, removing `_align.pyx` and its tests * Remove all quote normalizations * Enable test for new align * Modify test case for quote normalization --- setup.py | 1 - spacy/_align.pyx | 255 -------------------------------------- spacy/gold.pyx | 47 ------- spacy/tests/test_align.py | 79 ------------ spacy/tests/test_gold.py | 20 +-- 5 files changed, 1 insertion(+), 401 deletions(-) delete mode 100644 spacy/_align.pyx delete mode 100644 spacy/tests/test_align.py diff --git a/setup.py b/setup.py index 1156e7cde..62a09aa73 100755 --- a/setup.py +++ b/setup.py @@ -31,7 +31,6 @@ PACKAGES = find_packages() MOD_NAMES = [ - "spacy._align", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", diff --git a/spacy/_align.pyx b/spacy/_align.pyx deleted file mode 100644 index 8ae7cdf4e..000000000 --- a/spacy/_align.pyx +++ /dev/null @@ -1,255 +0,0 @@ -# cython: infer_types=True -'''Do Levenshtein alignment, for evaluation of tokenized input. 
- -Random notes: - - r i n g - 0 1 2 3 4 -r 1 0 1 2 3 -a 2 1 1 2 3 -n 3 2 2 1 2 -g 4 3 3 2 1 - -0,0: (1,1)=min(0+0,1+1,1+1)=0 S -1,0: (2,1)=min(1+1,0+1,2+1)=1 D -2,0: (3,1)=min(2+1,3+1,1+1)=2 D -3,0: (4,1)=min(3+1,4+1,2+1)=3 D -0,1: (1,2)=min(1+1,2+1,0+1)=1 D -1,1: (2,2)=min(0+1,1+1,1+1)=1 S -2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I -3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I -0,2: (1,3)=min(2+1,3+1,1+1)=2 I -1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I -2,2: (3,3) -3,2: (4,3) -At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?" - -We know the costs to transition: - -S[:i] -> T[:j] (at D[i,j]) -S[:i+1] -> T[:j] (at D[i+1,j]) -S[:i] -> T[:j+1] (at D[i,j+1]) - -Further, we now we can tranform: -S[:i+1] -> S[:i] (DEL) for 1, -T[:j+1] -> T[:j] (INS) for 1. -S[i+1] -> T[j+1] (SUB) for 0 or 1 - -Therefore we have the costs: -SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->S[j]) -i.e. D[i, j] + S[i+1] != T[j+1] -INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j]) -i.e. D[i+1,j] + 1 -DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) -i.e. 
D[i,j+1] + 1 - - Source string S has length m, with index i - Target string T has length n, with index j - - Output two alignment vectors: i2j (length m) and j2i (length n) - # function LevenshteinDistance(char s[1..m], char t[1..n]): - # for all i and j, d[i,j] will hold the Levenshtein distance between - # the first i characters of s and the first j characters of t - # note that d has (m+1)*(n+1) values - # set each element in d to zero - ring rang - - r i n g - - 0 0 0 0 0 - r 0 0 0 0 0 - a 0 0 0 0 0 - n 0 0 0 0 0 - g 0 0 0 0 0 - - # source prefixes can be transformed into empty string by - # dropping all characters - # d[i, 0] := i - ring rang - - r i n g - - 0 0 0 0 0 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - - # target prefixes can be reached from empty source prefix - # by inserting every character - # d[0, j] := j - - r i n g - - 0 1 2 3 4 - r 1 0 0 0 0 - a 2 0 0 0 0 - n 3 0 0 0 0 - g 4 0 0 0 0 - -''' -from __future__ import unicode_literals -from libc.stdint cimport uint32_t -import numpy -cimport numpy as np -from .compat import unicode_ -from murmurhash.mrmr cimport hash32 - - -def align(S, T): - cdef int m = len(S) - cdef int n = len(T) - cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32') - cdef np.ndarray i2j = numpy.zeros((m,), dtype='i') - cdef np.ndarray j2i = numpy.zeros((n,), dtype='i') - - cdef np.ndarray S_arr = _convert_sequence(S) - cdef np.ndarray T_arr = _convert_sequence(T) - - fill_matrix(matrix.data, - S_arr.data, m, T_arr.data, n) - fill_i2j(i2j, matrix) - fill_j2i(j2i, matrix) - for i in range(i2j.shape[0]): - if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]): - i2j[i] = -1 - for j in range(j2i.shape[0]): - if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]): - j2i[j] = -1 - return matrix[-1,-1], i2j, j2i, matrix - - -def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths): - '''Let's say we had: - - Guess: [aa bb cc dd] - Truth: [aa bbcc dd] - i2j: [0, None, -2, 2] - j2i: [0, -2, 3] - - We want: - - 
i2j_multi: {1: 1, 2: 1} - j2i_multi: {} - ''' - i2j_miss = _get_regions(i2j, i_lengths) - j2i_miss = _get_regions(j2i, j_lengths) - - i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths) - return i2j_multi, j2i_multi - - -def _get_regions(alignment, lengths): - regions = {} - start = None - offset = 0 - for i in range(len(alignment)): - if alignment[i] < 0: - if start is None: - start = offset - regions.setdefault(start, []) - regions[start].append(i) - else: - start = None - offset += lengths[i] - return regions - - -def _get_mapping(miss1, miss2, lengths1, lengths2): - i2j = {} - j2i = {} - for start, region1 in miss1.items(): - if not region1 or start not in miss2: - continue - region2 = miss2[start] - if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2): - j = region2.pop(0) - buff = [] - # Consume tokens from region 1, until we meet the length of the - # first token in region2. If we do, align the tokens. If - # we exceed the length, break. 
- while region1: - buff.append(region1.pop(0)) - if sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - j += 1 - buff = [] - elif sum(lengths1[i] for i in buff) > lengths2[j]: - break - else: - if buff and sum(lengths1[i] for i in buff) == lengths2[j]: - for i in buff: - i2j[i] = j - j2i[j] = buff[-1] - return i2j, j2i - - -def _convert_sequence(seq): - if isinstance(seq, numpy.ndarray): - return numpy.ascontiguousarray(seq, dtype='uint32_t') - cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32') - cdef bytes item_bytes - for i, item in enumerate(seq): - if item == "``": - item = '"' - elif item == "''": - item = '"' - if isinstance(item, unicode): - item_bytes = item.encode('utf8') - else: - item_bytes = item - output[i] = hash32(item_bytes, len(item_bytes), 0) - return output - - -cdef void fill_matrix(int* D, - const int* S, int m, const int* T, int n) nogil: - m1 = m+1 - n1 = n+1 - for i in range(m1*n1): - D[i] = 0 - - for i in range(m1): - D[i*n1] = i - - for j in range(n1): - D[j] = j - - cdef int sub_cost, ins_cost, del_cost - for j in range(n): - for i in range(m): - i_j = i*n1 + j - i1_j1 = (i+1)*n1 + j+1 - i1_j = (i+1)*n1 + j - i_j1 = i*n1 + j+1 - if S[i] != T[j]: - sub_cost = D[i_j] + 1 - else: - sub_cost = D[i_j] - del_cost = D[i_j1] + 1 - ins_cost = D[i1_j] + 1 - best = min(min(sub_cost, ins_cost), del_cost) - D[i1_j1] = best - - -cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *: - j = D.shape[1]-2 - cdef int i = D.shape[0]-2 - while i >= 0: - while D[i+1, j] < D[i+1, j+1]: - j -= 1 - if D[i, j+1] < D[i+1, j+1]: - i2j[i] = -1 - else: - i2j[i] = j - j -= 1 - i -= 1 - -cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *: - i = D.shape[0]-2 - cdef int j = D.shape[1]-2 - while j >= 0: - while D[i, j+1] < D[i+1, j+1]: - i -= 1 - if D[i+1, j] < D[i+1, j+1]: - j2i[j] = -1 - else: - j2i[j] = i - i -= 1 - j -= 1 diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 0659ddd02..f2f127438 
100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -18,7 +18,6 @@ from .compat import path2str, basestring_ from . import util -USE_NEW_ALIGN = False punct_re = re.compile(r"\W") @@ -51,59 +50,15 @@ def tags_to_entities(tags): return entities -_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")] - - def _normalize_for_alignment(tokens): tokens = [w.replace(" ", "").lower() for w in tokens] output = [] for token in tokens: token = token.replace(" ", "").lower() - for before, after in _ALIGNMENT_NORM_MAP: - token = token.replace(before, after) output.append(token) return output -def _align_before_v2_2_2(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations, using the Levenshtein - algorithm. The alignment is case-insensitive. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. - * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - from . 
import _align - if tokens_a == tokens_b: - alignment = numpy.arange(len(tokens_a)) - return 0, alignment, alignment, {}, {} - tokens_a = [w.replace(" ", "").lower() for w in tokens_a] - tokens_b = [w.replace(" ", "").lower() for w in tokens_b] - cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b) - i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a], - [len(w) for w in tokens_b]) - for i, j in list(i2j_multi.items()): - if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j: - i2j[i] = j - i2j_multi.pop(i) - for j, i in list(j2i_multi.items()): - if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i: - j2i[j] = i - j2i_multi.pop(j) - return cost, i2j, j2i, i2j_multi, j2i_multi - - def align(tokens_a, tokens_b): """Calculate alignment tables between two tokenizations. @@ -122,8 +77,6 @@ def align(tokens_a, tokens_b): * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other direction. """ - if not USE_NEW_ALIGN: - return _align_before_v2_2_2(tokens_a, tokens_b) tokens_a = _normalize_for_alignment(tokens_a) tokens_b = _normalize_for_alignment(tokens_b) cost = 0 diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py deleted file mode 100644 index d6bbab04e..000000000 --- a/spacy/tests/test_align.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy._align import align, multi_align - - -@pytest.mark.parametrize( - "string1,string2,cost", - [ - ("hello", "hell", 1), - ("rat", "cat", 1), - ("rat", "rat", 0), - ("rat", "catsie", 4), - ("t", "catsie", 5), - ], -) -def test_align_costs(string1, string2, cost): - output_cost, i2j, j2i, matrix = align(string1, string2) - assert output_cost == cost - - -@pytest.mark.parametrize( - "string1,string2,i2j", - [ - ("hello", "hell", [0, 1, 2, 3, -1]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2]), - ("t", "catsie", [2]), - ], -) -def test_align_i2j(string1, 
string2, i2j): - output_cost, output_i2j, j2i, matrix = align(string1, string2) - assert list(output_i2j) == i2j - - -@pytest.mark.parametrize( - "string1,string2,j2i", - [ - ("hello", "hell", [0, 1, 2, 3]), - ("rat", "cat", [0, 1, 2]), - ("rat", "rat", [0, 1, 2]), - ("rat", "catsie", [0, 1, 2, -1, -1, -1]), - ("t", "catsie", [-1, -1, 0, -1, -1, -1]), - ], -) -def test_align_i2j_2(string1, string2, j2i): - output_cost, output_i2j, output_j2i, matrix = align(string1, string2) - assert list(output_j2i) == j2i - - -def test_align_strings(): - words1 = ["hello", "this", "is", "test!"] - words2 = ["hellothis", "is", "test", "!"] - cost, i2j, j2i, matrix = align(words1, words2) - assert cost == 4 - assert list(i2j) == [-1, -1, 1, -1] - assert list(j2i) == [-1, 2, -1, -1] - - -def test_align_many_to_one(): - words1 = ["a", "b", "c", "d", "e", "f", "g", "h"] - words2 = ["ab", "bc", "e", "fg", "h"] - cost, i2j, j2i, matrix = align(words1, words2) - assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4] - lengths1 = [len(w) for w in words1] - lengths2 = [len(w) for w in words2] - i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2) - assert i2j_multi[0] == 0 - assert i2j_multi[1] == 0 - assert i2j_multi[2] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[3] == 1 - assert i2j_multi[5] == 3 - assert i2j_multi[6] == 3 - - assert j2i_multi[0] == 1 - assert j2i_multi[1] == 3 diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index d1255c176..639d98859 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -241,20 +241,6 @@ def test_ignore_misaligned(doc): deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] - saved_use_new_align = spacy.gold.USE_NEW_ALIGN - - spacy.gold.USE_NEW_ALIGN = False - with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" - data = [docs_to_json(doc)] - data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, data) - 
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - - train_reloaded_example = next(goldcorpus.train_dataset(nlp)) - - spacy.gold.USE_NEW_ALIGN = True with make_tempdir() as tmpdir: jsonl_file = tmpdir / "test.jsonl" data = [docs_to_json(doc)] @@ -280,8 +266,6 @@ def test_ignore_misaligned(doc): ignore_misaligned=True)) assert len(train_reloaded_example) == 0 - spacy.gold.USE_NEW_ALIGN = saved_use_new_align - def test_make_orth_variants(doc): nlp = English() @@ -301,14 +285,12 @@ def test_make_orth_variants(doc): train_goldparse = train_reloaded_example.gold -# xfail while we have backwards-compatible alignment -@pytest.mark.xfail @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), ( - ["a", "b", "``", "c"], + ["a", "b", '"', "c"], ['ab"', "c"], (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), ), From 9aab0a55e1aaa2544d1d1e294bd9e87f3db626de Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 26 Nov 2019 16:05:17 +0100 Subject: [PATCH 008/187] Fix conllu2json converter to output all sentences (#4716) Make sure that the last batch of sentences is output if n_sents > 1. 
--- spacy/cli/converters/conllu2json.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index ff720f4bf..c0fd58fb0 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -34,6 +34,9 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): doc = create_doc(sentences, i) docs.append(doc) sentences = [] + if sentences: + doc = create_doc(sentences, i) + docs.append(doc) return docs From 9efd3ccbef689230ec2ae53a6432f694d59b48ae Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 26 Nov 2019 16:10:08 +0100 Subject: [PATCH 009/187] Update conllu2json MISC column handling (#4715) Update converter to handle various things in MISC column: * `SpaceAfter=No` and set raw text accordingly * plain NER tag * name=NER (for NorNE) --- spacy/cli/converters/conllu2json.py | 68 +++++++++++++++++++---------- spacy/tests/test_cli.py | 27 +++++++++++- 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index c0fd58fb0..7fa491b9d 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -18,21 +18,28 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): """ # by @dvsrepo, via #11 explosion/spacy-dev-resources # by @katarkor + # name=NER is to handle NorNE + MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?" 
docs = [] + raw = "" sentences = [] conll_data = read_conllx(input_data, use_morphology=use_morphology) checked_for_ner = False has_ner_tags = False for i, example in enumerate(conll_data): if not checked_for_ner: - has_ner_tags = is_ner(example.token_annotation.entities[0]) + has_ner_tags = is_ner(example.token_annotation.entities[0], + MISC_NER_PATTERN) checked_for_ner = True - sentences.append(generate_sentence(example.token_annotation, has_ner_tags)) + raw += example.text + sentences.append(generate_sentence(example.token_annotation, + has_ner_tags, MISC_NER_PATTERN)) # Real-sized documents could be extracted using the comments on the # conllu document if len(sentences) % n_sents == 0: - doc = create_doc(sentences, i) + doc = create_doc(raw, sentences, i) docs.append(doc) + raw = "" sentences = [] if sentences: doc = create_doc(sentences, i) @@ -40,12 +47,12 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): return docs -def is_ner(tag): +def is_ner(tag, tag_pattern): """ Check the 10th column of the first token to determine if the file contains NER tags """ - tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag) + tag_match = re.search(tag_pattern, tag) if tag_match: return True elif tag == "O": @@ -63,9 +70,10 @@ def read_conllx(input_data, use_morphology=False, n=0): while lines[0].startswith("#"): lines.pop(0) ids, words, tags, heads, deps, ents = [], [], [], [], [], [] + spaces = [] for line in lines: parts = line.split("\t") - id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts + id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts if "-" in id_ or "." 
in id_: continue try: @@ -74,18 +82,27 @@ def read_conllx(input_data, use_morphology=False, n=0): dep = "ROOT" if dep == "root" else dep tag = pos if tag == "_" else tag tag = tag + "__" + morph if use_morphology else tag - iob = iob if iob else "O" + ent = misc if misc else "O" ids.append(id_) words.append(word) tags.append(tag) heads.append(head) deps.append(dep) - ents.append(iob) + ents.append(ent) + if "SpaceAfter=No" in misc: + spaces.append(False) + else: + spaces.append(True) except: # noqa: E722 print(line) raise - example = Example(doc=None) + raw = "" + for word, space in zip(words, spaces): + raw += word + if space: + raw += " " + example = Example(doc=raw) example.set_token_annotation(ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=ents) yield example @@ -94,7 +111,7 @@ def read_conllx(input_data, use_morphology=False, n=0): break -def simplify_tags(iob): +def simplify_tags(iob, tag_pattern): """ Simplify tags obtained from the dataset in order to follow Wikipedia scheme (PER, LOC, ORG, MISC). 
'PER', 'LOC' and 'ORG' keep their tags, while @@ -103,26 +120,28 @@ def simplify_tags(iob): """ new_iob = [] for tag in iob: - tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag) + tag_match = re.search(tag_pattern, tag) + new_tag = "O" if tag_match: - prefix = tag_match.group(1) - suffix = tag_match.group(2) - if suffix == "GPE_LOC": - suffix = "LOC" - elif suffix == "GPE_ORG": - suffix = "ORG" - elif suffix != "PER" and suffix != "LOC" and suffix != "ORG": - suffix = "MISC" - tag = prefix + "-" + suffix - new_iob.append(tag) + prefix = tag_match.group(2) + suffix = tag_match.group(3) + if prefix and suffix: + if suffix == "GPE_LOC": + suffix = "LOC" + elif suffix == "GPE_ORG": + suffix = "ORG" + elif suffix != "PER" and suffix != "LOC" and suffix != "ORG": + suffix = "MISC" + new_tag = prefix + "-" + suffix + new_iob.append(new_tag) return new_iob -def generate_sentence(token_annotation, has_ner_tags): +def generate_sentence(token_annotation, has_ner_tags, tag_pattern): sentence = {} tokens = [] if has_ner_tags: - iob = simplify_tags(token_annotation.entities) + iob = simplify_tags(token_annotation.entities, tag_pattern) biluo = iob_to_biluo(iob) for i, id in enumerate(token_annotation.ids): token = {} @@ -138,11 +157,12 @@ def generate_sentence(token_annotation, has_ner_tags): return sentence -def create_doc(sentences, id): +def create_doc(raw, sentences, id): doc = {} paragraph = {} doc["id"] = id doc["paragraphs"] = [] + paragraph["raw"] = raw.strip() paragraph["sentences"] = sentences doc["paragraphs"].append(paragraph) return doc diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 6dce649a9..2ce76b9ba 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -32,6 +32,32 @@ def test_cli_converters_conllu2json(): assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] +def test_cli_converters_conllu2json(): + # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu + lines = [ + 
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", + "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", + "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", + "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", + ] + input_data = "\n".join(lines) + converted = conllu2json(input_data, n_sents=1) + assert len(converted) == 1 + assert converted[0]["id"] == 0 + assert len(converted[0]["paragraphs"]) == 1 + assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår." + assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 + sent = converted[0]["paragraphs"][0]["sentences"][0] + assert len(sent["tokens"]) == 5 + tokens = sent["tokens"] + assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår", "."] + assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] + assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] + assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] + assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"] + + def test_cli_converters_iob2json(): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", @@ -106,7 +132,6 @@ def test_cli_converters_conll_ner2json(): ] input_data = "\n".join(lines) converted = conll_ner2json(input_data, n_sents=10) - print(converted) assert len(converted) == 1 assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 From b841d3fe75f099b4bceba512c748f335211cab52 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 28 Nov 2019 11:10:07 +0100 Subject: [PATCH 010/187] Add a tagger-based SentenceRecognizer (#4713) * Add sent_starts to GoldParse * Add SentTagger pipeline component Add `SentTagger` pipeline component as a subclass of `Tagger`. 
* Model reduces default parameters from `Tagger` to be small and fast * Hard-coded set of two labels: * S (1): token at beginning of sentence * I (0): all other sentence positions * Sets `token.sent_start` values * Add sentence segmentation to Scorer Report `sent_p/r/f` for sentence boundaries, which may be provided by various pipeline components. * Add sentence segmentation to CLI evaluate * Add senttagger metrics/scoring to train CLI * Rename SentTagger to SentenceRecognizer * Add SentenceRecognizer to spacy.pipes imports * Add SentenceRecognizer serialization test * Shorten component name to sentrec * Remove duplicates from train CLI output metrics --- spacy/cli/evaluate.py | 3 + spacy/cli/train.py | 20 ++- spacy/gold.pxd | 1 + spacy/gold.pyx | 41 ++--- spacy/pipeline/__init__.py | 2 + spacy/pipeline/pipes.pyx | 165 +++++++++++++++++- spacy/scorer.py | 34 +++- .../serialize/test_serialize_pipeline.py | 9 +- 8 files changed, 245 insertions(+), 30 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index a3193a5cf..da8a714a7 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -61,6 +61,9 @@ def evaluate( "NER R": "%.2f" % scorer.ents_r, "NER F": "%.2f" % scorer.ents_f, "Textcat": "%.2f" % scorer.textcat_score, + "Sent P": "%.2f" % scorer.sent_p, + "Sent R": "%.2f" % scorer.sent_r, + "Sent F": "%.2f" % scorer.sent_f, } msg.table(results, title="Results") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 645d1e4d4..8d37254a5 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -11,6 +11,7 @@ import srsly from wasabi import msg import contextlib import random +from collections import OrderedDict from .._ml import create_default_optimizer from ..attrs import PROB, IS_OOV, CLUSTER, LANG @@ -585,11 +586,13 @@ def _find_best(experiment_dir, component): def _get_metrics(component): if component == "parser": - return ("las", "uas", "token_acc") + return ("las", "uas", "token_acc", "sent_f") elif component == "tagger": 
return ("tags_acc",) elif component == "ner": return ("ents_f", "ents_p", "ents_r") + elif component == "sentrec": + return ("sent_p", "sent_r", "sent_f",) return ("token_acc",) @@ -601,14 +604,17 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths): row_head.extend(["Tag Loss ", " Tag % "]) output_stats.extend(["tag_loss", "tags_acc"]) elif pipe == "parser": - row_head.extend(["Dep Loss ", " UAS ", " LAS "]) - output_stats.extend(["dep_loss", "uas", "las"]) + row_head.extend(["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"]) + output_stats.extend(["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"]) elif pipe == "ner": row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "]) output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"]) elif pipe == "textcat": row_head.extend(["Textcat Loss", "Textcat"]) output_stats.extend(["textcat_loss", "textcat_score"]) + elif pipe == "sentrec": + row_head.extend(["Sentrec Loss", "Sent P", "Sent R", "Sent F"]) + output_stats.extend(["sentrec_loss", "sent_p", "sent_r", "sent_f"]) row_head.extend(["Token %", "CPU WPS"]) output_stats.extend(["token_acc", "cpu_wps"]) @@ -618,7 +624,12 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths): if has_beam_widths: row_head.insert(1, "Beam W.") - return row_head, output_stats + # remove duplicates + row_head_dict = OrderedDict() + row_head_dict.update({k: 1 for k in row_head}) + output_stats_dict = OrderedDict() + output_stats_dict.update({k: 1 for k in output_stats}) + return row_head_dict.keys(), output_stats_dict.keys() def _get_progress( @@ -631,6 +642,7 @@ def _get_progress( scores["ner_loss"] = losses.get("ner", 0.0) scores["tag_loss"] = losses.get("tagger", 0.0) scores["textcat_loss"] = losses.get("textcat", 0.0) + scores["sentrec_loss"] = losses.get("sentrec", 0.0) scores["cpu_wps"] = cpu_wps scores["gpu_wps"] = gpu_wps or 0.0 scores.update(dev_scores) diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 247ff8aa1..525aa2473 
100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -26,6 +26,7 @@ cdef class GoldParse: cdef public list words cdef public list tags cdef public list morphs + cdef public list sent_starts cdef public list heads cdef public list labels cdef public dict orths diff --git a/spacy/gold.pyx b/spacy/gold.pyx index f2f127438..a7c0f1d8d 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -497,9 +497,9 @@ def json_to_examples(doc): ner.append(token.get("ner", "-")) morphs.append(token.get("morph", {})) if i == 0: - sent_starts.append(True) + sent_starts.append(1) else: - sent_starts.append(False) + sent_starts.append(0) if "brackets" in sent: brackets.extend((b["first"] + sent_start_i, b["last"] + sent_start_i, b["label"]) @@ -759,7 +759,7 @@ cdef class Example: t = self.token_annotation split_examples = [] for i in range(len(t.words)): - if i > 0 and t.sent_starts[i] == True: + if i > 0 and t.sent_starts[i] == 1: s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, heads=s_heads, deps=s_deps, entities=s_ents, morphs=s_morphs, @@ -892,6 +892,7 @@ cdef class GoldParse: deps=token_annotation.deps, entities=token_annotation.entities, morphs=token_annotation.morphs, + sent_starts=token_annotation.sent_starts, cats=doc_annotation.cats, links=doc_annotation.links, make_projective=make_projective) @@ -902,12 +903,13 @@ cdef class GoldParse: ids = list(range(len(self.words))) return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, - heads=self.heads, deps=self.labels, entities=self.ner, - morphs=self.morphs) + heads=self.heads, deps=self.labels, + entities=self.ner, morphs=self.morphs, + sent_starts=self.sent_starts) def __init__(self, doc, words=None, tags=None, morphs=None, - heads=None, deps=None, entities=None, make_projective=False, - cats=None, links=None): + heads=None, deps=None, entities=None, sent_starts=None, + make_projective=False, cats=None, links=None): """Create a GoldParse. The fields will not be initialized if len(doc) is zero. 
doc (Doc): The document the annotations refer to. @@ -920,6 +922,8 @@ cdef class GoldParse: entities (iterable): A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. + sent_starts (iterable): A sequence of sentence position tags, 1 for + the first word in a sentence, 0 for all others. cats (dict): Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the @@ -956,6 +960,8 @@ cdef class GoldParse: deps = [None for _ in words] if not morphs: morphs = [None for _ in words] + if not sent_starts: + sent_starts = [None for _ in words] if entities is None: entities = ["-" for _ in words] elif len(entities) == 0: @@ -982,6 +988,7 @@ cdef class GoldParse: self.labels = [None] * len(doc) self.ner = [None] * len(doc) self.morphs = [None] * len(doc) + self.sent_starts = [None] * len(doc) # This needs to be done before we align the words if make_projective and heads is not None and deps is not None: @@ -1000,7 +1007,7 @@ cdef class GoldParse: self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags, - heads=heads, deps=deps, entities=entities, morphs=morphs, + heads=heads, deps=deps, entities=entities, morphs=morphs, sent_starts=sent_starts, brackets=[]) for i, gold_i in enumerate(self.cand_to_gold): @@ -1011,11 +1018,13 @@ cdef class GoldParse: self.labels[i] = None self.ner[i] = None self.morphs[i] = set() + self.sent_starts[i] = 0 if gold_i is None: if i in i2j_multi: self.words[i] = words[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]] self.morphs[i] = morphs[i2j_multi[i]] + self.sent_starts[i] = sent_starts[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) is_first = i2j_multi[i] != i2j_multi.get(i-1) # Set next word in multi-token span as head, until last @@ 
-1055,6 +1064,7 @@ cdef class GoldParse: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] self.morphs[i] = morphs[gold_i] + self.sent_starts[i] = sent_starts[gold_i] if heads[gold_i] is None: self.heads[i] = None else: @@ -1091,21 +1101,6 @@ cdef class GoldParse: """ return not nonproj.is_nonproj_tree(self.heads) - property sent_starts: - def __get__(self): - return [self.c.sent_start[i] for i in range(self.length)] - - def __set__(self, sent_starts): - for gold_i, is_sent_start in enumerate(sent_starts): - i = self.gold_to_cand[gold_i] - if i is not None: - if is_sent_start in (1, True): - self.c.sent_start[i] = 1 - elif is_sent_start in (-1, False): - self.c.sent_start[i] = -1 - else: - self.c.sent_start[i] = 0 - def docs_to_json(docs, id=0): """Convert a list of Doc objects into the JSON-serializable format used by diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 2f30fbbee..de8403152 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer +from .pipes import SentenceRecognizer from .morphologizer import Morphologizer from .entityruler import EntityRuler from .hooks import SentenceSegmenter, SimilarityHook @@ -20,6 +21,7 @@ __all__ = [ "EntityRuler", "Sentencizer", "SentenceSegmenter", + "SentenceRecognizer", "SimilarityHook", "merge_entities", "merge_noun_chunks", diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 56a00e33b..110839acd 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -705,6 +705,169 @@ class Tagger(Pipe): return self +@component("sentrec", assigns=["token.is_sent_start"]) +class SentenceRecognizer(Tagger): + """Pipeline component for sentence segmentation. 
+ + DOCS: https://spacy.io/api/sentencerecognizer + """ + + def __init__(self, vocab, model=True, **cfg): + self.vocab = vocab + self.model = model + self._rehearsal_model = None + self.cfg = OrderedDict(sorted(cfg.items())) + self.cfg.setdefault("cnn_maxout_pieces", 2) + self.cfg.setdefault("subword_features", True) + self.cfg.setdefault("token_vector_width", 12) + self.cfg.setdefault("conv_depth", 1) + self.cfg.setdefault("pretrained_vectors", None) + + @property + def labels(self): + # labels are numbered by index internally, so this matches GoldParse + # and Example where the sentence-initial tag is 1 and other positions + # are 0 + return tuple(["I", "S"]) + + def set_annotations(self, docs, batch_tag_ids, **_): + if isinstance(docs, Doc): + docs = [docs] + cdef Doc doc + for i, doc in enumerate(docs): + doc_tag_ids = batch_tag_ids[i] + if hasattr(doc_tag_ids, "get"): + doc_tag_ids = doc_tag_ids.get() + for j, tag_id in enumerate(doc_tag_ids): + # Don't clobber existing sentence boundaries + if doc.c[j].sent_start == 0: + if tag_id == 1: + doc.c[j].sent_start = 1 + else: + doc.c[j].sent_start = -1 + + def update(self, examples, drop=0., sgd=None, losses=None): + self.require_model() + examples = Example.to_example_objects(examples) + if losses is not None and self.name not in losses: + losses[self.name] = 0. + + if not any(len(ex.doc) if ex.doc else 0 for ex in examples): + # Handle cases where there are no tokens in any docs. 
+ return + + tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop) + loss, d_tag_scores = self.get_loss(examples, tag_scores) + bp_tag_scores(d_tag_scores, sgd=sgd) + + if losses is not None: + losses[self.name] += loss + + def get_loss(self, examples, scores): + scores = self.model.ops.flatten(scores) + tag_index = range(len(self.labels)) + cdef int idx = 0 + correct = numpy.zeros((scores.shape[0],), dtype="i") + guesses = scores.argmax(axis=1) + known_labels = numpy.ones((scores.shape[0], 1), dtype="f") + for ex in examples: + gold = ex.gold + for sent_start in gold.sent_starts: + if sent_start is None: + correct[idx] = guesses[idx] + elif sent_start in tag_index: + correct[idx] = sent_start + else: + correct[idx] = 0 + known_labels[idx] = 0. + idx += 1 + correct = self.model.ops.xp.array(correct, dtype="i") + d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores *= self.model.ops.asarray(known_labels) + loss = (d_scores**2).sum() + docs = [ex.doc for ex in examples] + d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + return float(loss), d_scores + + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, + **kwargs): + cdef Vocab vocab = self.vocab + if self.model is True: + for hp in ["token_vector_width", "conv_depth"]: + if hp in kwargs: + self.cfg[hp] = kwargs[hp] + self.model = self.Model(len(self.labels), **self.cfg) + if sgd is None: + sgd = self.create_optimizer() + return sgd + + @classmethod + def Model(cls, n_tags, **cfg): + return build_tagger_model(n_tags, **cfg) + + def add_label(self, label, values=None): + raise NotImplementedError + + def use_params(self, params): + with self.model.use_params(params): + yield + + def to_bytes(self, exclude=tuple(), **kwargs): + serialize = OrderedDict() + if self.model not in (None, True, False): + serialize["model"] = self.model.to_bytes + serialize["vocab"] = self.vocab.to_bytes + serialize["cfg"] = 
lambda: srsly.json_dumps(self.cfg) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def load_model(b): + if self.model is True: + self.model = self.Model(len(self.labels), **self.cfg) + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = OrderedDict(( + ("vocab", lambda b: self.vocab.from_bytes(b)), + ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), + ("model", lambda b: load_model(b)), + )) + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + serialize = OrderedDict(( + ("vocab", lambda p: self.vocab.to_disk(p)), + ("model", lambda p: p.open("wb").write(self.model.to_bytes())), + ("cfg", lambda p: srsly.write_json(p, self.cfg)) + )) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple(), **kwargs): + def load_model(p): + if self.model is True: + self.model = self.Model(len(self.labels), **self.cfg) + with p.open("rb") as file_: + try: + self.model.from_bytes(file_.read()) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = OrderedDict(( + ("cfg", lambda p: self.cfg.update(_load_cfg(p))), + ("vocab", lambda p: self.vocab.from_disk(p)), + ("model", load_model), + )) + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_disk(path, deserialize, exclude) + return self + + @component("nn_labeller") class MultitaskObjective(Tagger): """Experimental: Assist training of a parser or tagger, by training a @@ -1589,4 +1752,4 @@ Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, 
**cfg) -__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"] +__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] diff --git a/spacy/scorer.py b/spacy/scorer.py index 723259acd..d2878da1a 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -84,6 +84,7 @@ class Scorer(object): self.labelled = PRFScore() self.labelled_per_dep = dict() self.tags = PRFScore() + self.sent_starts = PRFScore() self.ner = PRFScore() self.ner_per_ents = dict() self.eval_punct = eval_punct @@ -113,6 +114,27 @@ class Scorer(object): """ return self.tags.fscore * 100 + @property + def sent_p(self): + """RETURNS (float): Precision for identification of sentence starts + (i.e. `Token.is_sent_start`). + """ + return self.sent_starts.precision * 100 + + @property + def sent_r(self): + """RETURNS (float): Recall for identification of sentence starts + (i.e. `Token.is_sent_start`). + """ + return self.sent_starts.recall * 100 + + @property + def sent_f(self): + """RETURNS (float): F-score for identification of sentence starts + (i.e. `Token.is_sent_start`). 
+ """ + return self.sent_starts.fscore * 100 + @property def token_acc(self): """RETURNS (float): Tokenization accuracy.""" @@ -212,6 +234,9 @@ class Scorer(object): "ents_f": self.ents_f, "ents_per_type": self.ents_per_type, "tags_acc": self.tags_acc, + "sent_p": self.sent_p, + "sent_r": self.sent_r, + "sent_f": self.sent_f, "token_acc": self.token_acc, "textcat_score": self.textcat_score, "textcats_per_cat": self.textcats_per_cat, @@ -242,9 +267,12 @@ class Scorer(object): gold_deps = set() gold_deps_per_dep = {} gold_tags = set() + gold_sent_starts = set() gold_ents = set(tags_to_entities(orig.entities)) - for id_, tag, head, dep in zip(orig.ids, orig.tags, orig.heads, orig.deps): + for id_, tag, head, dep, sent_start in zip(orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts): gold_tags.add((id_, tag)) + if sent_start: + gold_sent_starts.add(id_) if dep not in (None, "") and dep.lower() not in punct_labels: gold_deps.add((id_, head, dep.lower())) if dep.lower() not in self.labelled_per_dep: @@ -255,6 +283,7 @@ class Scorer(object): cand_deps = set() cand_deps_per_dep = {} cand_tags = set() + cand_sent_starts = set() for token in doc: if token.orth_.isspace(): continue @@ -264,6 +293,8 @@ class Scorer(object): else: self.tokens.tp += 1 cand_tags.add((gold_i, token.tag_)) + if token.is_sent_start: + cand_sent_starts.add(gold_i) if token.dep_.lower() not in punct_labels and token.orth_.strip(): gold_head = gold.cand_to_gold[token.head.i] # None is indistinct, so we can't just add it to the set @@ -308,6 +339,7 @@ class Scorer(object): # Score for all ents self.ner.score_set(cand_ents, gold_ents) self.tags.score_set(cand_tags, gold_tags) + self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) self.labelled.score_set(cand_deps, gold_deps) for dep in self.labelled_per_dep: self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py 
b/spacy/tests/serialize/test_serialize_pipeline.py index efa7ef625..797fa95f8 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer -from spacy.pipeline import Tensorizer, TextCategorizer +from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer from ..util import make_tempdir @@ -144,3 +144,10 @@ def test_serialize_pipe_exclude(en_vocab, Parser): parser.to_bytes(cfg=False, exclude=["vocab"]) with pytest.raises(ValueError): get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]), cfg=False) + + +def test_serialize_sentencerecognizer(en_vocab): + sr = SentenceRecognizer(en_vocab) + sr_b = sr.to_bytes() + sr_d = SentenceRecognizer(en_vocab).from_bytes(sr_b) + assert sr.to_bytes() == sr_d.to_bytes() From 79ba1a3b921f76c80dadc77a8d0a01bc630b3721 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 28 Nov 2019 14:53:44 +0100 Subject: [PATCH 011/187] Add lemmas to GoldParse / Example / docs_to_json (#4726) --- spacy/gold.pxd | 2 ++ spacy/gold.pyx | 52 +++++++++++++++++++++++++++------------- spacy/tests/test_gold.py | 6 +++++ 3 files changed, 44 insertions(+), 16 deletions(-) diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 525aa2473..5f0b49c9f 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -26,6 +26,7 @@ cdef class GoldParse: cdef public list words cdef public list tags cdef public list morphs + cdef public list lemmas cdef public list sent_starts cdef public list heads cdef public list labels @@ -47,6 +48,7 @@ cdef class TokenAnnotation: cdef public list deps cdef public list entities cdef public list morphs + cdef public list lemmas cdef public list sent_starts cdef public list brackets diff --git a/spacy/gold.pyx b/spacy/gold.pyx index a7c0f1d8d..00ae7c5e8 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -481,6 +481,7 @@ def 
json_to_examples(doc): labels = [] ner = [] morphs = [] + lemmas = [] sent_starts = [] brackets = [] for sent in paragraph["sentences"]: @@ -496,6 +497,7 @@ def json_to_examples(doc): labels[-1] = "ROOT" ner.append(token.get("ner", "-")) morphs.append(token.get("morph", {})) + lemmas.append(token.get("lemma", "")) if i == 0: sent_starts.append(1) else: @@ -509,7 +511,7 @@ def json_to_examples(doc): cats[cat["label"]] = cat["value"] example.set_token_annotation(ids=ids, words=words, tags=tags, heads=heads, deps=labels, entities=ner, morphs=morphs, - sent_starts=sent_starts, brackets=brackets) + lemmas=lemmas, sent_starts=sent_starts, brackets=brackets) example.set_doc_annotation(cats=cats) yield example @@ -618,7 +620,9 @@ def _consume_ent(tags): cdef class TokenAnnotation: - def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None): + def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, + entities=None, morphs=None, lemmas=None, sent_starts=None, + brackets=None): self.ids = ids if ids else [] self.words = words if words else [] self.tags = tags if tags else [] @@ -626,6 +630,7 @@ cdef class TokenAnnotation: self.deps = deps if deps else [] self.entities = entities if entities else [] self.morphs = morphs if morphs else [] + self.lemmas = lemmas if lemmas else [] self.sent_starts = sent_starts if sent_starts else [] self.brackets = brackets if brackets else [] @@ -638,6 +643,7 @@ cdef class TokenAnnotation: deps=token_dict.get("deps", None), entities=token_dict.get("entities", None), morphs=token_dict.get("morphs", None), + lemmas=token_dict.get("lemmas", None), sent_starts=token_dict.get("sent_starts", None), brackets=token_dict.get("brackets", None)) @@ -649,6 +655,7 @@ cdef class TokenAnnotation: "deps": self.deps, "entities": self.entities, "morphs": self.morphs, + "lemmas": self.lemmas, "sent_starts": self.sent_starts, "brackets": self.brackets} @@ -673,6 +680,9 
@@ cdef class TokenAnnotation: def get_morph(self, i): return self.morphs[i] if i < len(self.morphs) else set() + def get_lemma(self, i): + return self.lemmas[i] if i < len(self.lemmas) else "" + def get_sent_start(self, i): return self.sent_starts[i] if i < len(self.sent_starts) else None @@ -735,12 +745,12 @@ cdef class Example: return self.goldparse def set_token_annotation(self, ids=None, words=None, tags=None, heads=None, - deps=None, entities=None, morphs=None, + deps=None, entities=None, morphs=None, lemmas=None, sent_starts=None, brackets=None): self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=entities, - morphs=morphs, sent_starts=sent_starts, - brackets=brackets) + morphs=morphs, lemmas=lemmas, + sent_starts=sent_starts, brackets=brackets) def set_doc_annotation(self, cats=None, links=None): if cats: @@ -753,7 +763,7 @@ cdef class Example: sent_starts and return a list of the new Examples""" s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_ids, s_words, s_tags, s_heads = [], [], [], [] - s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], [] + s_deps, s_ents, s_morphs, s_lemmas, s_sent_starts = [], [], [], [], [] s_brackets = [] sent_start_i = 0 t = self.token_annotation @@ -762,13 +772,13 @@ cdef class Example: if i > 0 and t.sent_starts[i] == 1: s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, heads=s_heads, deps=s_deps, - entities=s_ents, morphs=s_morphs, + entities=s_ents, morphs=s_morphs, lemmas=s_lemmas, sent_starts=s_sent_starts, brackets=s_brackets) split_examples.append(s_example) s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_ids, s_words, s_tags, s_heads = [], [], [], [] - s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], [] - s_brackets = [] + s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] + s_sent_starts, s_brackets = [], [] sent_start_i = i s_ids.append(t.get_id(i)) s_words.append(t.get_word(i)) @@ -777,6 +787,7 @@ 
cdef class Example: s_deps.append(t.get_dep(i)) s_ents.append(t.get_entity(i)) s_morphs.append(t.get_morph(i)) + s_lemmas.append(t.get_lemma(i)) s_sent_starts.append(t.get_sent_start(i)) s_brackets.extend((b[0] - sent_start_i, b[1] - sent_start_i, b[2]) @@ -784,7 +795,7 @@ cdef class Example: i += 1 s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, heads=s_heads, deps=s_deps, entities=s_ents, - morphs=s_morphs, sent_starts=s_sent_starts, + morphs=s_morphs, lemmas=s_lemmas, sent_starts=s_sent_starts, brackets=s_brackets) split_examples.append(s_example) return split_examples @@ -892,6 +903,7 @@ cdef class GoldParse: deps=token_annotation.deps, entities=token_annotation.entities, morphs=token_annotation.morphs, + lemmas=token_annotation.lemmas, sent_starts=token_annotation.sent_starts, cats=doc_annotation.cats, links=doc_annotation.links, @@ -905,10 +917,10 @@ cdef class GoldParse: return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, heads=self.heads, deps=self.labels, entities=self.ner, morphs=self.morphs, - sent_starts=self.sent_starts) + sent_starts=self.sent_starts, lemmas=self.lemmas) - def __init__(self, doc, words=None, tags=None, morphs=None, - heads=None, deps=None, entities=None, sent_starts=None, + def __init__(self, doc, words=None, tags=None, morphs=None, lemmas=None, + sent_starts=None, heads=None, deps=None, entities=None, make_projective=False, cats=None, links=None): """Create a GoldParse. The fields will not be initialized if len(doc) is zero. 
@@ -960,6 +972,8 @@ cdef class GoldParse: deps = [None for _ in words] if not morphs: morphs = [None for _ in words] + if not lemmas: + lemmas = [None for _ in words] if not sent_starts: sent_starts = [None for _ in words] if entities is None: @@ -988,6 +1002,7 @@ cdef class GoldParse: self.labels = [None] * len(doc) self.ner = [None] * len(doc) self.morphs = [None] * len(doc) + self.lemmas = [None] * len(doc) self.sent_starts = [None] * len(doc) # This needs to be done before we align the words @@ -1006,9 +1021,10 @@ cdef class GoldParse: self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] - self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags, - heads=heads, deps=deps, entities=entities, morphs=morphs, sent_starts=sent_starts, - brackets=[]) + self.orig = TokenAnnotation(ids=list(range(len(words))), + words=words, tags=tags, heads=heads, deps=deps, + entities=entities, morphs=morphs, lemmas=lemmas, + sent_starts=sent_starts, brackets=[]) for i, gold_i in enumerate(self.cand_to_gold): if doc[i].text.isspace(): @@ -1018,12 +1034,14 @@ cdef class GoldParse: self.labels[i] = None self.ner[i] = None self.morphs[i] = set() + self.lemmas[i] = None self.sent_starts[i] = 0 if gold_i is None: if i in i2j_multi: self.words[i] = words[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]] self.morphs[i] = morphs[i2j_multi[i]] + self.lemmas[i] = lemmas[i2j_multi[i]] self.sent_starts[i] = sent_starts[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) is_first = i2j_multi[i] != i2j_multi.get(i-1) @@ -1064,6 +1082,7 @@ cdef class GoldParse: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] self.morphs[i] = morphs[gold_i] + self.lemmas[i] = lemmas[gold_i] self.sent_starts[i] = sent_starts[gold_i] if heads[gold_i] is None: self.heads[i] = None @@ -1125,6 +1144,7 @@ def docs_to_json(docs, id=0): json_sent = {"tokens": [], "brackets": []} for token in sent: json_token = {"id": 
token.i, "orth": token.text} + json_token["lemma"] = token.lemma_ if doc.is_tagged: json_token["tag"] = token.tag_ if doc.is_parsed: diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 639d98859..9d644d062 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -21,6 +21,7 @@ def doc(): # head of '.' is intentionally nonprojective for testing heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] deps = ['poss', 'case', 'nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'] + lemmas = ['Sarah', "'s", 'sister', 'fly', 'to', 'Silicon', 'Valley', 'via', 'London', '.'] biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] cats = {"TRAVEL": 1.0, "BAKING": 0.0} nlp = English() @@ -29,6 +30,7 @@ def doc(): doc[i].tag_ = tags[i] doc[i].dep_ = deps[i] doc[i].head = doc[heads[i]] + doc[i].lemma_ = lemmas[i] doc.ents = spans_from_biluo_tags(doc, biluo_tags) doc.cats = cats doc.is_tagged = True @@ -138,6 +140,7 @@ def test_roundtrip_docs_to_json(doc): tags = [t.tag_ for t in doc] deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] + lemmas = [t.lemma_ for t in doc] biluo_tags = iob_to_biluo([t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]) cats = doc.cats @@ -155,6 +158,7 @@ def test_roundtrip_docs_to_json(doc): assert tags == goldparse.tags assert deps == goldparse.labels assert heads == goldparse.heads + assert lemmas == goldparse.lemmas assert biluo_tags == goldparse.ner assert "TRAVEL" in goldparse.cats assert "BAKING" in goldparse.cats @@ -175,6 +179,7 @@ def test_roundtrip_docs_to_json(doc): assert tags == goldparse.tags assert deps == goldparse.labels assert heads == goldparse.heads + assert lemmas == goldparse.lemmas assert biluo_tags == goldparse.ner assert "TRAVEL" in goldparse.cats assert "BAKING" in goldparse.cats @@ -199,6 +204,7 @@ def test_roundtrip_docs_to_json(doc): assert tags == goldparse.tags assert deps == goldparse.labels assert heads == goldparse.heads + assert 
lemmas == goldparse.lemmas assert biluo_tags == goldparse.ner assert "TRAVEL" in goldparse.cats assert "BAKING" in goldparse.cats From 68f711b4097d027513380421565225b73bfcf907 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 29 Nov 2019 10:22:03 +0100 Subject: [PATCH 012/187] Fix conllu2json n_sents and raw text (#4728) Update conllu2json converter to include raw text in final batch. --- spacy/cli/converters/conllu2json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 7fa491b9d..dc68efef4 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -42,7 +42,7 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): raw = "" sentences = [] if sentences: - doc = create_doc(sentences, i) + doc = create_doc(raw, sentences, i) docs.append(doc) return docs From eb9b1858c4c218a74c58a806334b6b237e144bb8 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 11 Dec 2019 18:20:49 +0100 Subject: [PATCH 013/187] Add NER map option to convert CLI (#4763) Instead of a hard-coded NER tag simplification function that was only intended for NorNE, map NER tags in CoNLL-U converter using a dict provided as JSON as a command-line option. 
Map NER entity types or new tag or to "" for 'O', e.g.: ``` {"PER": "PERSON", "BAD": ""} => B-PER -> B-PERSON B-BAD -> O ``` --- spacy/cli/convert.py | 6 ++++++ spacy/cli/converters/conllu2json.py | 30 ++++++++++++++++++----------- spacy/tests/test_cli.py | 11 +++++------ 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index fa867fa04..0cc0693a8 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl") converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), lang=("Language (if tokenizer required)", "option", "l", str), morphology=("Enable appending morphology to tags", "flag", "m", bool), + ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path), ) def convert( input_file, @@ -49,6 +50,7 @@ def convert( model=None, morphology=False, converter="auto", + ner_map_path=None, lang=None, ): """ @@ -94,6 +96,9 @@ def convert( ) if converter not in CONVERTERS: msg.fail("Can't find converter for {}".format(converter), exits=1) + ner_map = None + if ner_map_path is not None: + ner_map = srsly.read_json(ner_map_path) # Use converter function to convert data func = CONVERTERS[converter] data = func( @@ -104,6 +109,7 @@ def convert( lang=lang, model=model, no_print=no_print, + ner_map=ner_map, ) if output_dir != "-": # Export data to a file diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index dc68efef4..0699bb5c1 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -7,7 +7,8 @@ from spacy.gold import Example from ...gold import iob_to_biluo -def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): +def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, + ner_map=None, **_): """ Convert conllu files into JSON format for use with train cli. 
use_morphology parameter enables appending morphology to tags, which is @@ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): checked_for_ner = True raw += example.text sentences.append(generate_sentence(example.token_annotation, - has_ner_tags, MISC_NER_PATTERN)) + has_ner_tags, MISC_NER_PATTERN, + ner_map=ner_map)) # Real-sized documents could be extracted using the comments on the # conllu document if len(sentences) % n_sents == 0: @@ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0): break -def simplify_tags(iob, tag_pattern): +def extract_tags(iob, tag_pattern, ner_map=None): """ + Extract tag from MISC column according to `tag_pattern` and map to final + entity type with `ner_map` if mapping present. + + For NorNE: Simplify tags obtained from the dataset in order to follow Wikipedia scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while 'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to @@ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern): prefix = tag_match.group(2) suffix = tag_match.group(3) if prefix and suffix: - if suffix == "GPE_LOC": - suffix = "LOC" - elif suffix == "GPE_ORG": - suffix = "ORG" - elif suffix != "PER" and suffix != "LOC" and suffix != "ORG": - suffix = "MISC" new_tag = prefix + "-" + suffix + if ner_map: + suffix = ner_map.get(suffix, suffix) + if suffix == "": + new_tag = "O" + else: + new_tag = prefix + "-" + suffix new_iob.append(new_tag) return new_iob -def generate_sentence(token_annotation, has_ner_tags, tag_pattern): +def generate_sentence(token_annotation, has_ner_tags, tag_pattern, + ner_map=None): sentence = {} tokens = [] if has_ner_tags: - iob = simplify_tags(token_annotation.entities, tag_pattern) + iob = extract_tags(token_annotation.entities, tag_pattern, + ner_map=ner_map) biluo = iob_to_biluo(iob) for i, id in enumerate(token_annotation.ids): token = {} diff --git a/spacy/tests/test_cli.py 
b/spacy/tests/test_cli.py index 2ce76b9ba..3b75e760a 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs def test_cli_converters_conllu2json(): - # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu + # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER", @@ -32,17 +32,16 @@ def test_cli_converters_conllu2json(): assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] -def test_cli_converters_conllu2json(): - # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu +def test_cli_converters_conllu2json_name_ner_map(): lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", - "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", ] input_data = "\n".join(lines) - converted = conllu2json(input_data, n_sents=1) + converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) assert len(converted) == 1 assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 @@ -55,7 +54,7 @@ def test_cli_converters_conllu2json(): assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] - assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"] + assert [t["ner"] for 
t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] def test_cli_converters_iob2json(): From a4cacd3402848299444c477cb2f1a292425b29af Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 13 Dec 2019 10:46:18 +0100 Subject: [PATCH 014/187] Add tag_map argument to CLI debug-data and train (#4750) Add an argument for a path to a JSON-formatted tag map, which is used to update and extend the default language tag map. --- spacy/cli/debug_data.py | 10 +++++++++- spacy/cli/train.py | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index ed19703ac..c2af5bff0 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -26,6 +26,7 @@ BLANK_MODEL_THRESHOLD = 2000 lang=("model language", "positional", None, str), train_path=("location of JSON-formatted training data", "positional", None, Path), dev_path=("location of JSON-formatted development data", "positional", None, Path), + tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), base_model=("name of model to update (optional)", "option", "b", str), pipeline=( "Comma-separated names of pipeline components to train", @@ -41,6 +42,7 @@ def debug_data( lang, train_path, dev_path, + tag_map_path=None, base_model=None, pipeline="tagger,parser,ner", ignore_warnings=False, @@ -60,6 +62,10 @@ def debug_data( if not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) + # Initialize the model and pipeline pipeline = [p.strip() for p in pipeline.split(",")] if base_model: @@ -67,6 +73,8 @@ def debug_data( else: lang_cls = get_lang_class(lang) nlp = lang_cls() + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) msg.divider("Data format validation") @@ -329,7 +337,7 @@ def debug_data( if "tagger" in pipeline: msg.divider("Part-of-speech Tagging") labels = [label for label in 
gold_train_data["tags"]] - tag_map = nlp.Defaults.tag_map + tag_map = nlp.vocab.morphology.tag_map msg.info( "{} {} in data ({} {} in tag map)".format( len(labels), diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8d37254a5..cdcbed0b3 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -48,6 +48,7 @@ from .. import about textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool), textcat_arch=("Textcat model architecture", "option", "ta", str), textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), + tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), verbose=("Display more information for debug", "flag", "VV", bool), debug=("Run data diagnostics before training", "flag", "D", bool), # fmt: on @@ -78,6 +79,7 @@ def train( textcat_multilabel=False, textcat_arch="bow", textcat_positive_label=None, + tag_map_path=None, verbose=False, debug=False, ): @@ -118,6 +120,9 @@ def train( if not output_path.exists(): output_path.mkdir() + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. 
# Batch size starts at 1 and grows, so that we make updates quickly @@ -209,6 +214,9 @@ def train( pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) + if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) From d17e7dca9ee36a33c6d23bc8786b3c76ca9dc061 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 21 Dec 2019 19:57:41 +0100 Subject: [PATCH 015/187] Fix problems caused by merge conflict --- spacy/scorer.py | 2 +- spacy/tokenizer.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 7fee4865a..6238b6ead 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -313,7 +313,7 @@ class Scorer(object): cand_deps_per_dep[token.dep_.lower()].add( (gold_i, gold_head, token.dep_.lower()) ) - if "-" not in [token[-1] for token in gold.orig_annot]: + if "-" not in [token[-1] for token in orig.entities]: # Find all NER labels in gold and doc ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents]) # Set up all labels for per type scoring and prepare gold per type diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index c1ac3dd06..f0120c708 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -116,10 +116,10 @@ cdef class Tokenizer: def __set__(self, rules): self._rules = {} self._reset_cache([key for key in self._cache]) - self._reset_specials() + self._flush_specials() self._cache = PreshMap() self._specials = PreshMap() - self._load_special_tokenization(rules) + self._load_special_cases(rules) def __reduce__(self): args = (self.vocab, From 21b6d6e0a8287e425b3fffd08309596d5dd1e6ca Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 21 Dec 2019 21:17:31 +0100 Subject: [PATCH 016/187] Fix typo --- spacy/tests/regression/test_issue4674.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py index 8d0c32eaa..8fa4f9259 100644 --- a/spacy/tests/regression/test_issue4674.py +++ b/spacy/tests/regression/test_issue4674.py @@ -6,7 +6,7 @@ from spacy.kb import KnowledgeBase from spacy.util import ensure_path from spacy.lang.en import English -from ..tests.util import make_tempdir +from ..util import make_tempdir def test_issue4674(): From db55577c452cbb0e9c984dcc2bce7ecaf99ad3c8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 22 Dec 2019 01:53:56 +0100 Subject: [PATCH 017/187] Drop Python 2.7 and 3.5 (#4828) * Remove unicode declarations * Remove Python 3.5 and 2.7 from CI * Don't require pathlib * Replace compat helpers * Remove OrderedDict * Use f-strings * Set Cython compiler language level * Fix typo * Re-add OrderedDict for Table * Update setup.cfg * Revert CONTRIBUTING.md * Revert lookups.md * Revert top-level.md * Small adjustments and docs [ci skip] --- .travis.yml | 23 -- CONTRIBUTING.md | 18 +- README.md | 7 +- azure-pipelines.yml | 6 - bin/cythonize.py | 6 +- fabfile.py | 3 - requirements.txt | 1 - setup.cfg | 6 +- setup.py | 1 - spacy/__init__.py | 2 - spacy/__main__.py | 10 +- spacy/_ml.py | 3 - spacy/analysis.py | 19 +- spacy/attrs.pyx | 3 - spacy/cli/_schemas.py | 3 - spacy/cli/convert.py | 27 +- spacy/cli/converters/conll_ner2json.py | 13 +- spacy/cli/converters/conllu2json.py | 3 - spacy/cli/converters/iob2json.py | 3 - spacy/cli/converters/jsonl2json.py | 3 - spacy/cli/debug_data.py | 283 ++++++------------ spacy/cli/download.py | 29 +- spacy/cli/evaluate.py | 5 +- spacy/cli/info.py | 20 +- spacy/cli/init_model.py | 10 +- spacy/cli/link.py | 25 +- spacy/cli/package.py | 21 +- spacy/cli/pretrain.py | 15 +- spacy/cli/profile.py | 11 +- spacy/cli/train.py | 83 +++-- spacy/cli/validate.py | 26 +- spacy/compat.py | 102 +------ spacy/displacy/__init__.py | 14 +- spacy/displacy/render.py | 5 +- spacy/displacy/templates.py | 3 - spacy/errors.py | 14 +- 
spacy/glossary.py | 3 - spacy/gold.pyx | 18 +- spacy/kb.pyx | 13 +- spacy/lang/af/__init__.py | 3 - spacy/lang/af/stop_words.py | 3 - spacy/lang/ar/__init__.py | 3 - spacy/lang/ar/examples.py | 3 - spacy/lang/ar/lex_attrs.py | 2 - spacy/lang/ar/punctuation.py | 3 - spacy/lang/ar/stop_words.py | 3 - spacy/lang/ar/tokenizer_exceptions.py | 3 - spacy/lang/bg/__init__.py | 3 - spacy/lang/bg/examples.py | 3 - spacy/lang/bg/stop_words.py | 3 - spacy/lang/bn/__init__.py | 3 - spacy/lang/bn/examples.py | 3 - spacy/lang/bn/morph_rules.py | 3 - spacy/lang/bn/punctuation.py | 3 - spacy/lang/bn/stop_words.py | 3 - spacy/lang/bn/tag_map.py | 3 - spacy/lang/bn/tokenizer_exceptions.py | 3 - spacy/lang/ca/__init__.py | 3 - spacy/lang/ca/examples.py | 3 - spacy/lang/ca/lex_attrs.py | 3 - spacy/lang/ca/punctuation.py | 3 - spacy/lang/ca/stop_words.py | 4 - spacy/lang/ca/tag_map.py | 3 - spacy/lang/ca/tokenizer_exceptions.py | 3 - spacy/lang/char_classes.py | 3 - spacy/lang/cs/__init__.py | 3 - spacy/lang/cs/stop_words.py | 3 - spacy/lang/da/__init__.py | 3 - spacy/lang/da/examples.py | 3 - spacy/lang/da/lex_attrs.py | 3 - spacy/lang/da/morph_rules.py | 3 - spacy/lang/da/norm_exceptions.py | 3 - spacy/lang/da/punctuation.py | 3 - spacy/lang/da/stop_words.py | 3 - spacy/lang/da/tokenizer_exceptions.py | 4 - spacy/lang/de/__init__.py | 3 - spacy/lang/de/examples.py | 3 - spacy/lang/de/norm_exceptions.py | 3 - spacy/lang/de/punctuation.py | 3 - spacy/lang/de/stop_words.py | 3 - spacy/lang/de/syntax_iterators.py | 3 - spacy/lang/de/tag_map.py | 3 - spacy/lang/de/tokenizer_exceptions.py | 3 - spacy/lang/el/__init__.py | 4 - spacy/lang/el/examples.py | 4 - spacy/lang/el/get_pos_from_wiktionary.py | 3 - spacy/lang/el/lemmatizer.py | 3 - spacy/lang/el/lex_attrs.py | 4 - spacy/lang/el/norm_exceptions.py | 3 - spacy/lang/el/punctuation.py | 4 - spacy/lang/el/stop_words.py | 3 - spacy/lang/el/syntax_iterators.py | 3 - spacy/lang/el/tag_map.py | 3 - spacy/lang/el/tag_map_general.py | 3 - 
spacy/lang/el/tokenizer_exceptions.py | 3 - spacy/lang/en/__init__.py | 3 - spacy/lang/en/examples.py | 3 - spacy/lang/en/lex_attrs.py | 3 - spacy/lang/en/morph_rules.py | 3 - spacy/lang/en/norm_exceptions.py | 3 - spacy/lang/en/stop_words.py | 3 - spacy/lang/en/syntax_iterators.py | 3 - spacy/lang/en/tag_map.py | 3 - spacy/lang/en/tokenizer_exceptions.py | 3 - spacy/lang/es/__init__.py | 3 - spacy/lang/es/examples.py | 3 - spacy/lang/es/lex_attrs.py | 3 - spacy/lang/es/stop_words.py | 3 - spacy/lang/es/syntax_iterators.py | 3 - spacy/lang/es/tag_map.py | 3 - spacy/lang/es/tokenizer_exceptions.py | 3 - spacy/lang/et/__init__.py | 3 - spacy/lang/et/stop_words.py | 3 - spacy/lang/fa/__init__.py | 3 - spacy/lang/fa/examples.py | 3 - spacy/lang/fa/generate_verbs_exc.py | 3 - spacy/lang/fa/lex_attrs.py | 2 - spacy/lang/fa/punctuation.py | 3 - spacy/lang/fa/stop_words.py | 3 - spacy/lang/fa/syntax_iterators.py | 3 - spacy/lang/fa/tag_map.py | 3 - spacy/lang/fa/tokenizer_exceptions.py | 3 - spacy/lang/fi/__init__.py | 3 - spacy/lang/fi/examples.py | 3 - spacy/lang/fi/lex_attrs.py | 3 - spacy/lang/fi/punctuation.py | 3 - spacy/lang/fi/stop_words.py | 3 - spacy/lang/fi/tokenizer_exceptions.py | 3 - spacy/lang/fr/__init__.py | 3 - spacy/lang/fr/_tokenizer_exceptions_list.py | 3 - spacy/lang/fr/examples.py | 3 - spacy/lang/fr/lemmatizer.py | 3 - spacy/lang/fr/lex_attrs.py | 3 - spacy/lang/fr/punctuation.py | 3 - spacy/lang/fr/stop_words.py | 3 - spacy/lang/fr/syntax_iterators.py | 3 - spacy/lang/fr/tag_map.py | 3 - spacy/lang/fr/tokenizer_exceptions.py | 7 +- spacy/lang/ga/__init__.py | 3 - spacy/lang/ga/irish_morphology_helpers.py | 3 - spacy/lang/ga/stop_words.py | 4 - spacy/lang/ga/tag_map.py | 3 - spacy/lang/ga/tokenizer_exceptions.py | 3 - spacy/lang/he/__init__.py | 3 - spacy/lang/he/examples.py | 3 - spacy/lang/he/stop_words.py | 4 - spacy/lang/hi/__init__.py | 3 - spacy/lang/hi/examples.py | 3 - spacy/lang/hi/lex_attrs.py | 3 - spacy/lang/hi/stop_words.py | 3 - 
spacy/lang/hr/__init__.py | 3 - spacy/lang/hr/examples.py | 3 - spacy/lang/hr/stop_words.py | 4 - spacy/lang/hu/__init__.py | 3 - spacy/lang/hu/examples.py | 3 - spacy/lang/hu/punctuation.py | 3 - spacy/lang/hu/stop_words.py | 3 - spacy/lang/hu/tokenizer_exceptions.py | 3 - spacy/lang/id/__init__.py | 3 - spacy/lang/id/_tokenizer_exceptions_list.py | 3 - spacy/lang/id/examples.py | 3 - spacy/lang/id/lex_attrs.py | 3 - spacy/lang/id/norm_exceptions.py | 3 - spacy/lang/id/punctuation.py | 3 - spacy/lang/id/stop_words.py | 3 - spacy/lang/id/syntax_iterators.py | 3 - spacy/lang/id/tag_map.py | 3 - spacy/lang/id/tokenizer_exceptions.py | 3 - spacy/lang/is/__init__.py | 3 - spacy/lang/is/stop_words.py | 3 - spacy/lang/it/__init__.py | 3 - spacy/lang/it/examples.py | 3 - spacy/lang/it/punctuation.py | 3 - spacy/lang/it/stop_words.py | 3 - spacy/lang/it/tag_map.py | 3 - spacy/lang/it/tokenizer_exceptions.py | 2 - spacy/lang/ja/__init__.py | 3 - spacy/lang/ja/examples.py | 3 - spacy/lang/ja/stop_words.py | 3 - spacy/lang/ja/tag_map.py | 3 - spacy/lang/kn/__init__.py | 3 - spacy/lang/kn/stop_words.py | 3 - spacy/lang/ko/__init__.py | 3 - spacy/lang/ko/examples.py | 3 - spacy/lang/ko/lex_attrs.py | 3 - spacy/lang/ko/stop_words.py | 3 - spacy/lang/ko/tag_map.py | 3 - spacy/lang/lb/__init__.py | 3 - spacy/lang/lb/examples.py | 3 - spacy/lang/lb/lex_attrs.py | 3 - spacy/lang/lb/norm_exceptions.py | 3 - spacy/lang/lb/punctuation.py | 3 - spacy/lang/lb/stop_words.py | 3 - spacy/lang/lb/tag_map.py | 3 - spacy/lang/lb/tokenizer_exceptions.py | 3 - spacy/lang/lex_attrs.py | 3 - spacy/lang/lt/__init__.py | 3 - spacy/lang/lt/examples.py | 3 - spacy/lang/lt/lex_attrs.py | 3 - spacy/lang/lt/morph_rules.py | 3 - spacy/lang/lt/stop_words.py | 3 - spacy/lang/lt/tag_map.py | 3 - spacy/lang/lt/tokenizer_exceptions.py | 3 - spacy/lang/lv/__init__.py | 3 - spacy/lang/lv/stop_words.py | 3 - spacy/lang/mr/__init__.py | 3 - spacy/lang/mr/stop_words.py | 3 - spacy/lang/nb/__init__.py | 3 - 
spacy/lang/nb/examples.py | 3 - spacy/lang/nb/morph_rules.py | 3 - spacy/lang/nb/punctuation.py | 3 - spacy/lang/nb/stop_words.py | 4 - spacy/lang/nb/syntax_iterators.py | 3 - spacy/lang/nb/tag_map.py | 3 - spacy/lang/nb/tokenizer_exceptions.py | 3 - spacy/lang/nl/__init__.py | 3 - spacy/lang/nl/examples.py | 3 - spacy/lang/nl/lemmatizer.py | 3 - spacy/lang/nl/lex_attrs.py | 3 - spacy/lang/nl/punctuation.py | 3 - spacy/lang/nl/stop_words.py | 3 - spacy/lang/nl/tag_map.py | 3 - spacy/lang/nl/tokenizer_exceptions.py | 3 - spacy/lang/norm_exceptions.py | 3 - spacy/lang/pl/__init__.py | 3 - spacy/lang/pl/_tokenizer_exceptions_list.py | 4 - spacy/lang/pl/examples.py | 3 - spacy/lang/pl/lex_attrs.py | 3 - spacy/lang/pl/punctuation.py | 3 - spacy/lang/pl/stop_words.py | 4 - spacy/lang/pl/tag_map.py | 3 - spacy/lang/pl/tokenizer_exceptions.py | 3 - spacy/lang/pt/__init__.py | 3 - spacy/lang/pt/examples.py | 3 - spacy/lang/pt/lex_attrs.py | 3 - spacy/lang/pt/norm_exceptions.py | 3 - spacy/lang/pt/punctuation.py | 3 - spacy/lang/pt/stop_words.py | 3 - spacy/lang/pt/tag_map.py | 3 - spacy/lang/pt/tokenizer_exceptions.py | 3 - spacy/lang/punctuation.py | 3 - spacy/lang/ro/__init__.py | 3 - spacy/lang/ro/examples.py | 3 - spacy/lang/ro/lex_attrs.py | 3 - spacy/lang/ro/stop_words.py | 4 - spacy/lang/ro/tag_map.py | 2 - spacy/lang/ro/tokenizer_exceptions.py | 3 - spacy/lang/ru/__init__.py | 3 - spacy/lang/ru/examples.py | 3 - spacy/lang/ru/lemmatizer.py | 6 +- spacy/lang/ru/lex_attrs.py | 3 - spacy/lang/ru/norm_exceptions.py | 3 - spacy/lang/ru/stop_words.py | 4 - spacy/lang/ru/tag_map.py | 3 - spacy/lang/ru/tokenizer_exceptions.py | 3 - spacy/lang/si/__init__.py | 3 - spacy/lang/si/examples.py | 3 - spacy/lang/si/lex_attrs.py | 3 - spacy/lang/si/stop_words.py | 3 - spacy/lang/sk/__init__.py | 3 - spacy/lang/sk/stop_words.py | 3 - spacy/lang/sl/__init__.py | 3 - spacy/lang/sl/stop_words.py | 3 - spacy/lang/sq/__init__.py | 3 - spacy/lang/sq/examples.py | 3 - 
spacy/lang/sq/stop_words.py | 3 - spacy/lang/sr/__init__.py | 3 - spacy/lang/sr/examples.py | 3 - spacy/lang/sr/lex_attrs.py | 3 - spacy/lang/sr/norm_exceptions.py | 3 - spacy/lang/sr/stop_words.py | 3 - spacy/lang/sr/tokenizer_exceptions.py | 3 - spacy/lang/sv/__init__.py | 3 - spacy/lang/sv/examples.py | 3 - spacy/lang/sv/morph_rules.py | 3 - spacy/lang/sv/stop_words.py | 3 - spacy/lang/sv/syntax_iterators.py | 3 - spacy/lang/sv/tag_map.py | 3 - spacy/lang/sv/tokenizer_exceptions.py | 3 - spacy/lang/ta/__init__.py | 3 - spacy/lang/ta/examples.py | 3 - spacy/lang/ta/lex_attrs.py | 3 - spacy/lang/ta/norm_exceptions.py | 3 - spacy/lang/ta/stop_words.py | 3 - spacy/lang/tag_map.py | 3 - spacy/lang/te/__init__.py | 3 - spacy/lang/te/examples.py | 3 - spacy/lang/te/lex_attrs.py | 3 - spacy/lang/te/stop_words.py | 3 - spacy/lang/th/__init__.py | 3 - spacy/lang/th/lex_attrs.py | 3 - spacy/lang/th/norm_exceptions.py | 3 - spacy/lang/th/tag_map.py | 3 - spacy/lang/th/tokenizer_exceptions.py | 3 - spacy/lang/tl/__init__.py | 3 - spacy/lang/tl/lex_attrs.py | 3 - spacy/lang/tl/stop_words.py | 3 - spacy/lang/tl/tokenizer_exceptions.py | 3 - spacy/lang/tokenizer_exceptions.py | 3 - spacy/lang/tr/__init__.py | 3 - spacy/lang/tr/examples.py | 3 - spacy/lang/tr/lex_attrs.py | 3 - spacy/lang/tr/stop_words.py | 4 - spacy/lang/tr/tokenizer_exceptions.py | 3 - spacy/lang/tt/__init__.py | 3 - spacy/lang/tt/examples.py | 3 - spacy/lang/tt/lex_attrs.py | 3 - spacy/lang/tt/punctuation.py | 3 - spacy/lang/tt/stop_words.py | 3 - spacy/lang/tt/tokenizer_exceptions.py | 3 - spacy/lang/uk/__init__.py | 3 - spacy/lang/uk/examples.py | 3 - spacy/lang/uk/lemmatizer.py | 1 - spacy/lang/uk/lex_attrs.py | 3 - spacy/lang/uk/stop_words.py | 4 - spacy/lang/uk/tag_map.py | 3 - spacy/lang/uk/tokenizer_exceptions.py | 3 - spacy/lang/ur/__init__.py | 3 - spacy/lang/ur/examples.py | 3 - spacy/lang/ur/lex_attrs.py | 3 - spacy/lang/ur/punctuation.py | 3 - spacy/lang/ur/stop_words.py | 3 - 
spacy/lang/ur/tag_map.py | 3 - spacy/lang/vi/__init__.py | 3 - spacy/lang/vi/lex_attrs.py | 3 - spacy/lang/vi/stop_words.py | 3 - spacy/lang/vi/tag_map.py | 3 - spacy/lang/xx/__init__.py | 3 - spacy/lang/xx/examples.py | 3 - spacy/lang/yo/__init__.py | 3 - spacy/lang/yo/examples.py | 3 - spacy/lang/yo/lex_attrs.py | 3 - spacy/lang/yo/stop_words.py | 3 - spacy/lang/zh/__init__.py | 3 - spacy/lang/zh/examples.py | 3 - spacy/lang/zh/lex_attrs.py | 4 +- spacy/lang/zh/stop_words.py | 4 - spacy/lang/zh/tag_map.py | 3 - spacy/language.py | 38 +-- spacy/lemmatizer.py | 7 +- spacy/lexeme.pyx | 3 - spacy/lookups.py | 21 +- spacy/matcher/__init__.py | 3 - spacy/matcher/_schemas.py | 3 - spacy/matcher/dependencymatcher.pyx | 2 - spacy/matcher/matcher.pyx | 2 - spacy/matcher/phrasematcher.pyx | 2 - spacy/ml/__init__.py | 3 - spacy/ml/_legacy_tok2vec.py | 2 - spacy/ml/_wire.py | 1 - spacy/ml/common.py | 2 - spacy/ml/tok2vec.py | 2 - spacy/morphology.pyx | 6 +- spacy/parts_of_speech.pyx | 3 - spacy/pipeline/__init__.py | 3 - spacy/pipeline/entityruler.py | 27 +- spacy/pipeline/functions.py | 3 - spacy/pipeline/hooks.py | 3 - spacy/pipeline/morphologizer.pyx | 6 +- spacy/pipeline/pipes.pyx | 101 +++---- spacy/scorer.py | 3 - spacy/strings.pyx | 8 +- spacy/symbols.pyx | 6 +- spacy/syntax/_parser_model.pyx | 8 +- spacy/syntax/arc_eager.pyx | 7 +- spacy/syntax/ner.pyx | 5 +- spacy/syntax/nn_parser.pyx | 28 +- spacy/syntax/nonproj.pyx | 3 - spacy/syntax/stateclass.pyx | 3 - spacy/syntax/transition_system.pyx | 5 +- spacy/tests/conftest.py | 3 - spacy/tests/doc/test_add_entities.py | 3 - spacy/tests/doc/test_array.py | 3 - spacy/tests/doc/test_creation.py | 3 - spacy/tests/doc/test_doc_api.py | 3 - spacy/tests/doc/test_morphanalysis.py | 3 - spacy/tests/doc/test_pickle_doc.py | 9 +- spacy/tests/doc/test_retokenize_merge.py | 3 - spacy/tests/doc/test_retokenize_split.py | 3 - spacy/tests/doc/test_span.py | 3 - spacy/tests/doc/test_to_json.py | 3 - spacy/tests/doc/test_token_api.py | 3 - 
spacy/tests/doc/test_underscore.py | 3 - spacy/tests/lang/ar/test_exceptions.py | 3 - spacy/tests/lang/ar/test_text.py | 3 - spacy/tests/lang/bn/test_tokenizer.py | 3 - spacy/tests/lang/ca/test_exception.py | 4 - .../tests/lang/ca/test_prefix_suffix_infix.py | 3 - spacy/tests/lang/ca/test_text.py | 6 - spacy/tests/lang/da/test_exceptions.py | 3 - .../tests/lang/da/test_prefix_suffix_infix.py | 3 - spacy/tests/lang/da/test_text.py | 3 - spacy/tests/lang/de/test_exceptions.py | 3 - spacy/tests/lang/de/test_parser.py | 3 - .../tests/lang/de/test_prefix_suffix_infix.py | 3 - spacy/tests/lang/de/test_text.py | 3 - spacy/tests/lang/el/test_exception.py | 3 - spacy/tests/lang/el/test_text.py | 3 - .../lang/en/test_customized_tokenizer.py | 3 - spacy/tests/lang/en/test_exceptions.py | 3 - spacy/tests/lang/en/test_indices.py | 3 - spacy/tests/lang/en/test_noun_chunks.py | 3 - spacy/tests/lang/en/test_parser.py | 3 - .../tests/lang/en/test_prefix_suffix_infix.py | 3 - spacy/tests/lang/en/test_punct.py | 3 - spacy/tests/lang/en/test_sbd.py | 3 - spacy/tests/lang/en/test_tagger.py | 3 - spacy/tests/lang/en/test_text.py | 3 - spacy/tests/lang/es/test_exception.py | 3 - spacy/tests/lang/es/test_text.py | 3 - spacy/tests/lang/fi/test_text.py | 3 - spacy/tests/lang/fi/test_tokenizer.py | 3 - spacy/tests/lang/fr/test_exceptions.py | 3 - .../tests/lang/fr/test_prefix_suffix_infix.py | 3 - spacy/tests/lang/fr/test_text.py | 3 - spacy/tests/lang/ga/test_tokenizer.py | 3 - spacy/tests/lang/he/test_tokenizer.py | 3 - spacy/tests/lang/hu/test_tokenizer.py | 3 - .../tests/lang/id/test_prefix_suffix_infix.py | 3 - spacy/tests/lang/id/test_text.py | 3 - .../tests/lang/it/test_prefix_suffix_infix.py | 3 - spacy/tests/lang/ja/test_lemmatization.py | 3 - spacy/tests/lang/ja/test_tokenizer.py | 3 - spacy/tests/lang/ko/test_lemmatization.py | 3 - spacy/tests/lang/ko/test_tokenizer.py | 3 - spacy/tests/lang/lb/test_exceptions.py | 3 - .../tests/lang/lb/test_prefix_suffix_infix.py | 3 - 
spacy/tests/lang/lb/test_text.py | 3 - spacy/tests/lang/lt/test_text.py | 3 - spacy/tests/lang/nb/test_tokenizer.py | 3 - spacy/tests/lang/nl/test_text.py | 3 - spacy/tests/lang/pl/test_text.py | 5 - spacy/tests/lang/pl/test_tokenizer.py | 3 - spacy/tests/lang/pt/test_text.py | 3 - spacy/tests/lang/ro/test_tokenizer.py | 3 - spacy/tests/lang/ru/test_exceptions.py | 3 - spacy/tests/lang/ru/test_lemmatizer.py | 3 - spacy/tests/lang/ru/test_text.py | 3 - spacy/tests/lang/ru/test_tokenizer.py | 3 - spacy/tests/lang/sr/test_exceptions.py | 3 - spacy/tests/lang/sr/test_tokenizer.py | 3 - spacy/tests/lang/sv/test_exceptions.py | 3 - spacy/tests/lang/sv/test_noun_chunks.py | 3 - .../tests/lang/sv/test_prefix_suffix_infix.py | 3 - spacy/tests/lang/sv/test_text.py | 3 - spacy/tests/lang/sv/test_tokenizer.py | 3 - spacy/tests/lang/test_attrs.py | 3 - spacy/tests/lang/test_initialize.py | 3 - spacy/tests/lang/th/test_tokenizer.py | 3 - spacy/tests/lang/tt/test_tokenizer.py | 3 - spacy/tests/lang/uk/test_tokenizer.py | 3 - spacy/tests/lang/uk/test_tokenizer_exc.py | 3 - .../tests/lang/ur/test_prefix_suffix_infix.py | 3 - spacy/tests/lang/ur/test_text.py | 3 - spacy/tests/lang/yo/test_text.py | 3 - spacy/tests/lang/zh/test_text.py | 3 - spacy/tests/lang/zh/test_tokenizer.py | 3 - spacy/tests/matcher/test_matcher_api.py | 3 - spacy/tests/matcher/test_matcher_logic.py | 3 - .../tests/matcher/test_pattern_validation.py | 3 - spacy/tests/matcher/test_phrase_matcher.py | 3 - spacy/tests/morphology/test_morph_features.py | 3 - spacy/tests/parser/test_add_label.py | 3 - spacy/tests/parser/test_arc_eager_oracle.py | 3 - spacy/tests/parser/test_ner.py | 3 - spacy/tests/parser/test_neural_parser.py | 3 - spacy/tests/parser/test_nn_beam.py | 3 - spacy/tests/parser/test_nonproj.py | 3 - spacy/tests/parser/test_parse.py | 3 - spacy/tests/parser/test_parse_navigate.py | 3 - spacy/tests/parser/test_preset_sbd.py | 3 - spacy/tests/parser/test_space_attachment.py | 3 - 
spacy/tests/pipeline/test_analysis.py | 20 +- spacy/tests/pipeline/test_entity_linker.py | 3 - spacy/tests/pipeline/test_entity_ruler.py | 3 - spacy/tests/pipeline/test_factories.py | 3 - spacy/tests/pipeline/test_functions.py | 3 - spacy/tests/pipeline/test_pipe_methods.py | 3 - spacy/tests/pipeline/test_sentencizer.py | 3 - spacy/tests/pipeline/test_tagger.py | 3 - spacy/tests/pipeline/test_textcat.py | 3 - spacy/tests/regression/test_issue1-1000.py | 3 - spacy/tests/regression/test_issue1001-1500.py | 3 - spacy/tests/regression/test_issue1501-2000.py | 3 - spacy/tests/regression/test_issue2001-2500.py | 3 - spacy/tests/regression/test_issue2501-3000.py | 3 - spacy/tests/regression/test_issue3001-3500.py | 73 +---- spacy/tests/regression/test_issue3521.py | 3 - spacy/tests/regression/test_issue3526.py | 3 - spacy/tests/regression/test_issue3531.py | 3 - spacy/tests/regression/test_issue3540.py | 3 - spacy/tests/regression/test_issue3549.py | 3 - spacy/tests/regression/test_issue3555.py | 3 - spacy/tests/regression/test_issue3611.py | 3 - spacy/tests/regression/test_issue3625.py | 3 - spacy/tests/regression/test_issue3803.py | 3 - spacy/tests/regression/test_issue3839.py | 3 - spacy/tests/regression/test_issue3869.py | 3 - spacy/tests/regression/test_issue3879.py | 3 - spacy/tests/regression/test_issue3880.py | 3 - spacy/tests/regression/test_issue3882.py | 3 - spacy/tests/regression/test_issue3951.py | 3 - spacy/tests/regression/test_issue3959.py | 3 - spacy/tests/regression/test_issue3962.py | 3 - spacy/tests/regression/test_issue3972.py | 3 - spacy/tests/regression/test_issue4002.py | 3 - spacy/tests/regression/test_issue4030.py | 3 - spacy/tests/regression/test_issue4042.py | 3 - spacy/tests/regression/test_issue4054.py | 3 - spacy/tests/regression/test_issue4120.py | 3 - spacy/tests/regression/test_issue4133.py | 3 - spacy/tests/regression/test_issue4190.py | 3 - spacy/tests/regression/test_issue4267.py | 3 - spacy/tests/regression/test_issue4272.py | 3 - 
spacy/tests/regression/test_issue4278.py | 3 - spacy/tests/regression/test_issue4313.py | 3 - spacy/tests/regression/test_issue4348.py | 3 - spacy/tests/regression/test_issue4367.py | 3 - spacy/tests/regression/test_issue4373.py | 3 - spacy/tests/regression/test_issue4402.py | 3 - spacy/tests/regression/test_issue4528.py | 3 - spacy/tests/regression/test_issue4529.py | 3 - spacy/tests/regression/test_issue4590.py | 3 - spacy/tests/regression/test_issue4651.py | 3 - spacy/tests/regression/test_issue4674.py | 3 - spacy/tests/regression/test_issue4707.py | 3 - spacy/tests/serialize/test_serialize_doc.py | 8 +- .../test_serialize_extension_attrs.py | 9 +- spacy/tests/serialize/test_serialize_kb.py | 3 - .../serialize/test_serialize_language.py | 3 - .../serialize/test_serialize_pipeline.py | 3 - .../serialize/test_serialize_tokenizer.py | 3 - .../serialize/test_serialize_vocab_strings.py | 3 - spacy/tests/test_architectures.py | 3 - spacy/tests/test_cli.py | 3 - spacy/tests/test_displacy.py | 7 +- spacy/tests/test_gold.py | 62 ++-- spacy/tests/test_json_schemas.py | 3 - spacy/tests/test_language.py | 8 - spacy/tests/test_lemmatizer.py | 3 - spacy/tests/test_misc.py | 9 +- spacy/tests/test_pickles.py | 3 - spacy/tests/test_scorer.py | 3 - spacy/tests/test_tok2vec.py | 6 +- spacy/tests/tokenizer/test_exceptions.py | 3 - spacy/tests/tokenizer/test_explain.py | 5 +- spacy/tests/tokenizer/test_naughty_strings.py | 3 - spacy/tests/tokenizer/test_tokenizer.py | 3 - spacy/tests/tokenizer/test_urls.py | 3 - spacy/tests/tokenizer/test_whitespace.py | 3 - spacy/tests/util.py | 6 +- spacy/tests/vocab_vectors/test_lexeme.py | 3 - spacy/tests/vocab_vectors/test_lookups.py | 3 - spacy/tests/vocab_vectors/test_similarity.py | 3 - spacy/tests/vocab_vectors/test_stringstore.py | 3 - spacy/tests/vocab_vectors/test_vectors.py | 3 - spacy/tests/vocab_vectors/test_vocab_api.py | 3 - spacy/tokenizer.pyx | 44 ++- spacy/tokens/__init__.py | 3 - spacy/tokens/_retokenize.pyx | 3 - 
spacy/tokens/_serialize.py | 3 - spacy/tokens/doc.pyx | 14 +- spacy/tokens/span.pyx | 14 +- spacy/tokens/token.pyx | 8 +- spacy/tokens/underscore.py | 3 - spacy/util.py | 42 ++- spacy/vectors.pyx | 47 ++- spacy/vocab.pyx | 43 ++- website/docs/api/top-level.md | 47 --- website/docs/usage/index.md | 15 +- website/docs/usage/processing-pipelines.md | 4 +- website/docs/usage/spacy-101.md | 13 +- 572 files changed, 526 insertions(+), 2625 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e3ce53024..000000000 --- a/.travis.yml +++ /dev/null @@ -1,23 +0,0 @@ -language: python -sudo: false -cache: pip -dist: trusty -group: edge -python: - - "2.7" -os: - - linux -install: - - "pip install -r requirements.txt" - - "python setup.py build_ext --inplace" - - "pip install -e ." -script: - - "cat /proc/cpuinfo | grep flags | head -n 1" - - "python -m pytest --tb=native spacy" -branches: - except: - - spacy.io -notifications: - slack: - secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ= - email: false diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3c2b56cd3..6b7881dd2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -280,23 +280,7 @@ except: # noqa: E722 ### Python conventions -All Python code must be written in an **intersection of Python 2 and Python 3**. -This is easy in Cython, but somewhat ugly in Python. Logic that deals with -Python or platform compatibility should only live in -[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin -functions, replacement functions are suffixed with an underscore, for example -`unicode_`. If you need to access the user's version or platform information, -for example to show more specific error messages, you can use the `is_config()` -helper function. 
- -```python -from .compat import unicode_, is_config - -compatible_unicode = unicode_('hello world') -if is_config(windows=True, python2=True): - print("You are using Python 2 on Windows.") -``` - +All Python code must be written **compatible with Python 3.6+**. Code that interacts with the file-system should accept objects that follow the `pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`. If the function is user-facing and takes a path as an argument, it should check diff --git a/README.md b/README.md index 74d2d2166..500431b9f 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license. [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines]()](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) -[![Travis Build Status]()](https://travis-ci.org/explosion/spaCy) [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases) [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/) [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy) @@ -98,7 +97,7 @@ For detailed installation instructions, see the - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual Studio) -- **Python version**: Python 2.7, 3.5+ (only 64 bit) +- **Python version**: Python 3.6+ (only 64 bit) - **Package managers**: [pip] · [conda] (via `conda-forge`) [pip]: https://pypi.org/project/spacy/ @@ -269,9 +268,7 @@ and git preinstalled. 
Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that -matches the version that was used to compile your Python interpreter. For -official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and -VS 2015 (Python 3.5). +matches the version that was used to compile your Python interpreter. ## Run tests diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 054365336..d34da39f7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -35,12 +35,6 @@ jobs: dependsOn: 'Validate' strategy: matrix: - Python35Linux: - imageName: 'ubuntu-16.04' - python.version: '3.5' - Python35Windows: - imageName: 'vs2017-win2016' - python.version: '3.5' Python36Linux: imageName: 'ubuntu-16.04' python.version: '3.6' diff --git a/bin/cythonize.py b/bin/cythonize.py index 4814f8df0..554252294 100755 --- a/bin/cythonize.py +++ b/bin/cythonize.py @@ -38,14 +38,14 @@ import argparse HASH_FILE = "cythonize.json" -def process_pyx(fromfile, tofile, language_level="-2"): +def process_pyx(fromfile, tofile, language_level="-3"): print("Processing %s" % fromfile) try: from Cython.Compiler.Version import version as cython_version from distutils.version import LooseVersion - if LooseVersion(cython_version) < LooseVersion("0.19"): - raise Exception("Require Cython >= 0.19") + if LooseVersion(cython_version) < LooseVersion("0.25"): + raise Exception("Require Cython >= 0.25") except ImportError: pass diff --git a/fabfile.py b/fabfile.py index fcab493f5..460471747 100644 --- a/fabfile.py +++ b/fabfile.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals, print_function - import contextlib from pathlib import Path from fabric.api import local, lcd, env, settings, prefix diff --git a/requirements.txt b/requirements.txt index 1786ee186..188459c67 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,6 @@ 
catalogue>=0.0.7,<1.1.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 -pathlib==1.0.1; python_version < "3.4" tqdm>=4.38.0,<5.0.0 # Optional dependencies jsonschema>=2.6.0,<3.1.0 diff --git a/setup.cfg b/setup.cfg index a0103c5a2..28259c989 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,10 +16,7 @@ classifiers = Operating System :: MacOS :: MacOS X Operating System :: Microsoft :: Windows Programming Language :: Cython - Programming Language :: Python :: 2 - Programming Language :: Python :: 2.7 Programming Language :: Python :: 3 - Programming Language :: Python :: 3.5 Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 @@ -30,7 +27,7 @@ zip_safe = false include_package_data = true scripts = bin/spacy -python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* +python_requires = >=3.6 setup_requires = wheel cython>=0.25 @@ -54,7 +51,6 @@ install_requires = numpy>=1.15.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 - pathlib==1.0.1; python_version < "3.4" [options.extras_require] lookups = diff --git a/setup.py b/setup.py index 62a09aa73..1afdc7ae4 100755 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import io import os import subprocess diff --git a/spacy/__init__.py b/spacy/__init__.py index 4a0d16a49..49db0e3b5 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals import warnings import sys diff --git a/spacy/__main__.py b/spacy/__main__.py index 2c285095e..06ba5704d 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,9 +1,3 @@ -# coding: utf8 -from __future__ import print_function - -# NB! This breaks in plac on Python 2!! 
-# from __future__ import unicode_literals - if __name__ == "__main__": import plac import sys @@ -32,5 +26,5 @@ if __name__ == "__main__": if command in commands: plac.call(commands[command], sys.argv[1:]) else: - available = "Available: {}".format(", ".join(commands)) - msg.fail("Unknown command: {}".format(command), available, exits=1) + available = f"Available: {', '.join(commands)}" + msg.fail(f"Unknown command: {command}", available, exits=1) diff --git a/spacy/_ml.py b/spacy/_ml.py index 8695a88cc..a1d2b6b77 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import numpy from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu from thinc.t2t import ExtractWindow, ParametricAttention diff --git a/spacy/analysis.py b/spacy/analysis.py index 761be3de9..ed6d6b18e 100644 --- a/spacy/analysis.py +++ b/spacy/analysis.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - -from collections import OrderedDict from wasabi import Printer from .tokens import Doc, Token, Span @@ -23,7 +19,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True): assert pipeline[index][0] == name prev_pipes = pipeline[:index] pipe_requires = getattr(pipe, "requires", []) - requires = OrderedDict([(annot, False) for annot in pipe_requires]) + requires = {annot: False for annot in pipe_requires} if requires: for prev_name, prev_pipe in prev_pipes: prev_assigns = getattr(prev_pipe, "assigns", []) @@ -98,15 +94,15 @@ def validate_attrs(values): for ext_attr, ext_value in value.items(): # We don't check whether the attribute actually exists if ext_value is not True: # attr is something like doc._.x.y - good = "{}._.{}".format(obj_key, ext_attr) - bad = "{}.{}".format(good, ".".join(ext_value)) + good = f"{obj_key}._.{ext_attr}" + bad = f"{good}.{'.'.join(ext_value)}" raise ValueError(Errors.E183.format(attr=bad, solution=good)) continue # we can't validate those further if attr.endswith("_"): # 
attr is something like "token.pos_" raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1])) if value is not True: # attr is something like doc.x.y - good = "{}.{}".format(obj_key, attr) - bad = "{}.{}".format(good, ".".join(value)) + good = f"{obj_key}.{attr}" + bad = f"{good}.{'.'.join(value)}" raise ValueError(Errors.E183.format(attr=bad, solution=good)) obj = objs[obj_key] if not hasattr(obj, attr): @@ -168,11 +164,10 @@ def print_summary(nlp, pretty=True, no_print=False): msg.table(overview, header=header, divider=True, multiline=True) n_problems = sum(len(p) for p in problems.values()) if any(p for p in problems.values()): - msg.divider("Problems ({})".format(n_problems)) + msg.divider(f"Problems ({n_problems})") for name, problem in problems.items(): if problem: - problem = ", ".join(problem) - msg.warn("'{}' requirements not met: {}".format(name, problem)) + msg.warn(f"'{name}' requirements not met: {', '.join(problem)}") else: msg.good("No problems found.") if no_print: diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 6d1c18eb9..a601a7a66 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - IDS = { "": NULL_ATTR, diff --git a/spacy/cli/_schemas.py b/spacy/cli/_schemas.py index 3fb2c8979..42e5e04dd 100644 --- a/spacy/cli/_schemas.py +++ b/spacy/cli/_schemas.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - # NB: This schema describes the new format of the training data, see #2928 TRAINING_SCHEMA = { diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 0cc0693a8..d8c8a7a18 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import plac from pathlib import Path from wasabi import Printer @@ -30,16 +27,18 @@ FILE_TYPES_STDOUT = ("json", "jsonl") @plac.annotations( + # fmt: off input_file=("Input file", "positional", None, str), output_dir=("Output 
directory. '-' for stdout.", "positional", None, str), - file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str), + file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES), n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int), seg_sents=("Segment sentences (for -c ner)", "flag", "s"), model=("Model for sentence segmentation (for -s)", "option", "b", str), - converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), + converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str), lang=("Language (if tokenizer required)", "option", "l", str), morphology=("Enable appending morphology to tags", "flag", "m", bool), - ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path), + ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,), + # fmt: on ) def convert( input_file, @@ -62,16 +61,10 @@ def convert( no_print = output_dir == "-" msg = Printer(no_print=no_print) input_path = Path(input_file) - if file_type not in FILE_TYPES: - msg.fail( - "Unknown file type: '{}'".format(file_type), - "Supported file types: '{}'".format(", ".join(FILE_TYPES)), - exits=1, - ) if file_type not in FILE_TYPES_STDOUT and output_dir == "-": # TODO: support msgpack via stdout in srsly? msg.fail( - "Can't write .{} data to stdout.".format(file_type), + f"Can't write .{file_type} data to stdout", "Please specify an output directory.", exits=1, ) @@ -95,7 +88,7 @@ def convert( "Can't automatically detect NER format. Conversion may not succeed. 
See https://spacy.io/api/cli#convert" ) if converter not in CONVERTERS: - msg.fail("Can't find converter for {}".format(converter), exits=1) + msg.fail(f"Can't find converter for {converter}", exits=1) ner_map = None if ner_map_path is not None: ner_map = srsly.read_json(ner_map_path) @@ -113,7 +106,7 @@ def convert( ) if output_dir != "-": # Export data to a file - suffix = ".{}".format(file_type) + suffix = f".{file_type}" output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) if file_type == "json": srsly.write_json(output_file, data) @@ -121,9 +114,7 @@ def convert( srsly.write_jsonl(output_file, data) elif file_type == "msg": srsly.write_msgpack(output_file, data) - msg.good( - "Generated output file ({} documents): {}".format(len(data), output_file) - ) + msg.good(f"Generated output file ({len(data)} documents): {output_file}") else: # Print to stdout if file_type == "json": diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py index 46489ad7c..b607d5913 100644 --- a/spacy/cli/converters/conll_ner2json.py +++ b/spacy/cli/converters/conll_ner2json.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from wasabi import Printer from ...gold import iob_to_biluo @@ -64,9 +61,9 @@ def conll_ner2json( # sentence segmentation required for document segmentation if n_sents > 0 and not seg_sents: msg.warn( - "No sentence boundaries found to use with option `-n {}`. " - "Use `-s` to automatically segment sentences or `-n 0` " - "to disable.".format(n_sents) + f"No sentence boundaries found to use with option `-n {n_sents}`. " + f"Use `-s` to automatically segment sentences or `-n 0` " + f"to disable." 
) else: n_sents_info(msg, n_sents) @@ -129,7 +126,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None): if model: nlp = load_model(model) if "parser" in nlp.pipe_names: - msg.info("Segmenting sentences with parser from model '{}'.".format(model)) + msg.info(f"Segmenting sentences with parser from model '{model}'.") sentencizer = nlp.get_pipe("parser") if not sentencizer: msg.info( @@ -166,7 +163,7 @@ def segment_docs(input_data, n_sents, doc_delimiter): def n_sents_info(msg, n_sents): - msg.info("Grouping every {} sentences into a document.".format(n_sents)) + msg.info(f"Grouping every {n_sents} sentences into a document.") if n_sents == 1: msg.warn( "To generate better training data, you may want to group " diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 3febd07d1..12b1103d4 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import re from spacy.gold import Example diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 61c398f8d..b6ac234fc 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from wasabi import Printer from ...gold import iob_to_biluo diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py index 1c1bc45c7..525063b22 100644 --- a/spacy/cli/converters/jsonl2json.py +++ b/spacy/cli/converters/jsonl2json.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import srsly from ...gold import docs_to_json diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index c2af5bff0..2e780f53c 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals, print_function - from pathlib import Path from 
collections import Counter import plac @@ -23,20 +20,17 @@ BLANK_MODEL_THRESHOLD = 2000 @plac.annotations( + # fmt: off lang=("model language", "positional", None, str), train_path=("location of JSON-formatted training data", "positional", None, Path), dev_path=("location of JSON-formatted development data", "positional", None, Path), tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), base_model=("name of model to update (optional)", "option", "b", str), - pipeline=( - "Comma-separated names of pipeline components to train", - "option", - "p", - str, - ), + pipeline=("Comma-separated names of pipeline components to train", "option", "p", str), ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), verbose=("Print additional information and explanations", "flag", "V", bool), no_format=("Don't pretty-print the results", "flag", "NF", bool), + # fmt: on ) def debug_data( lang, @@ -93,15 +87,11 @@ def debug_data( corpus.train_dataset_without_preprocessing(nlp) ) except ValueError as e: - loading_train_error_message = "Training data cannot be loaded: {}".format( - str(e) - ) + loading_train_error_message = f"Training data cannot be loaded: {e}" try: dev_dataset = list(corpus.dev_dataset(nlp)) except ValueError as e: - loading_dev_error_message = "Development data cannot be loaded: {}".format( - str(e) - ) + loading_dev_error_message = f"Development data cannot be loaded: {e}" if loading_train_error_message or loading_dev_error_message: if loading_train_error_message: msg.fail(loading_train_error_message) @@ -112,78 +102,66 @@ def debug_data( # Create all gold data here to avoid iterating over the train_dataset constantly gold_train_data = _compile_gold(train_dataset, pipeline) - gold_train_unpreprocessed_data = _compile_gold(train_dataset_unpreprocessed, pipeline) + gold_train_unpreprocessed_data = _compile_gold( + train_dataset_unpreprocessed, pipeline + ) gold_dev_data = _compile_gold(dev_dataset, pipeline) 
train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] msg.divider("Training stats") - msg.text("Training pipeline: {}".format(", ".join(pipeline))) + msg.text(f"Training pipeline: {', '.join(pipeline)}") for pipe in [p for p in pipeline if p not in nlp.factories]: - msg.fail("Pipeline component '{}' not available in factories".format(pipe)) + msg.fail(f"Pipeline component '{pipe}' not available in factories") if base_model: - msg.text("Starting with base model '{}'".format(base_model)) + msg.text(f"Starting with base model '{base_model}'") else: - msg.text("Starting with blank model '{}'".format(lang)) - msg.text("{} training docs".format(len(train_dataset))) - msg.text("{} evaluation docs".format(len(gold_dev_data))) + msg.text(f"Starting with blank model '{lang}'") + msg.text(f"{len(train_dataset)} training docs") + msg.text(f"{len(gold_dev_data)} evaluation docs") if not len(gold_dev_data): msg.fail("No evaluation docs") overlap = len(train_texts.intersection(dev_texts)) if overlap: - msg.warn("{} training examples also in evaluation data".format(overlap)) + msg.warn(f"{overlap} training examples also in evaluation data") else: msg.good("No overlap between training and evaluation data") if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD: - text = "Low number of examples to train from a blank model ({})".format( - len(train_dataset) + text = ( + f"Low number of examples to train from a blank model ({len(train_dataset)})" ) if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD: msg.fail(text) else: msg.warn(text) msg.text( - "It's recommended to use at least {} examples (minimum {})".format( - BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD - ), + f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples " + f"(minimum {BLANK_MODEL_MIN_THRESHOLD})", show=verbose, ) msg.divider("Vocab & Vectors") n_words = gold_train_data["n_words"] msg.info( - "{} total {} in the data ({} unique)".format( - n_words, "word" if n_words == 1 
else "words", len(gold_train_data["words"]) - ) + f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)" ) if gold_train_data["n_misaligned_words"] > 0: - msg.warn( - "{} misaligned tokens in the training data".format( - gold_train_data["n_misaligned_words"] - ) - ) + n_misaligned = gold_train_data["n_misaligned_words"] + msg.warn(f"{n_misaligned} misaligned tokens in the training data") if gold_dev_data["n_misaligned_words"] > 0: - msg.warn( - "{} misaligned tokens in the dev data".format( - gold_dev_data["n_misaligned_words"] - ) - ) + n_misaligned = gold_dev_data["n_misaligned_words"] + msg.warn(f"{n_misaligned} misaligned tokens in the dev data") most_common_words = gold_train_data["words"].most_common(10) msg.text( - "10 most common words: {}".format( - _format_labels(most_common_words, counts=True) - ), + f"10 most common words: {_format_labels(most_common_words, counts=True)}", show=verbose, ) if len(nlp.vocab.vectors): msg.info( - "{} vectors ({} unique keys, {} dimensions)".format( - len(nlp.vocab.vectors), - nlp.vocab.vectors.n_keys, - nlp.vocab.vectors_length, - ) + f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} " + f"unique keys, {nlp.vocab.vectors_length} dimensions)" ) else: msg.info("No word vectors present in the model") @@ -203,19 +181,10 @@ def debug_data( msg.divider("Named Entity Recognition") msg.info( - "{} new {}, {} existing {}".format( - len(new_labels), - "label" if len(new_labels) == 1 else "labels", - len(existing_labels), - "label" if len(existing_labels) == 1 else "labels", - ) + f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)" ) missing_values = label_counts["-"] - msg.text( - "{} missing {} (tokens with '-' label)".format( - missing_values, "value" if missing_values == 1 else "values" - ) - ) + msg.text(f"{missing_values} missing value(s) (tokens with '-' label)") for label in new_labels: if len(label) == 0: msg.fail("Empty label found in new labels") @@ -226,33 
+195,24 @@ def debug_data( if label != "-" ] labels_with_counts = _format_labels(labels_with_counts, counts=True) - msg.text("New: {}".format(labels_with_counts), show=verbose) + msg.text(f"New: {labels_with_counts}", show=verbose) if existing_labels: - msg.text( - "Existing: {}".format(_format_labels(existing_labels)), show=verbose - ) - + msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose) if gold_train_data["ws_ents"]: - msg.fail( - "{} invalid whitespace entity spans".format(gold_train_data["ws_ents"]) - ) + msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans") has_ws_ents_error = True for label in new_labels: if label_counts[label] <= NEW_LABEL_THRESHOLD: msg.warn( - "Low number of examples for new label '{}' ({})".format( - label, label_counts[label] - ) + f"Low number of examples for new label '{label}' ({label_counts[label]})" ) has_low_data_warning = True with msg.loading("Analyzing label distribution..."): neg_docs = _get_examples_without_label(train_dataset, label) if neg_docs == 0: - msg.warn( - "No examples for texts WITHOUT new label '{}'".format(label) - ) + msg.warn(f"No examples for texts WITHOUT new label '{label}'") has_no_neg_warning = True if not has_low_data_warning: @@ -264,8 +224,8 @@ def debug_data( if has_low_data_warning: msg.text( - "To train a new entity type, your data should include at " - "least {} instances of the new label".format(NEW_LABEL_THRESHOLD), + f"To train a new entity type, your data should include at " + f"least {NEW_LABEL_THRESHOLD} instances of the new label", show=verbose, ) if has_no_neg_warning: @@ -288,27 +248,21 @@ def debug_data( new_labels = [l for l in labels if l not in model_labels] existing_labels = [l for l in labels if l in model_labels] msg.info( - "Text Classification: {} new label(s), {} existing label(s)".format( - len(new_labels), len(existing_labels) - ) + f"Text Classification: {len(new_labels)} new label(s), " + f"{len(existing_labels)} existing label(s)" ) if 
new_labels: labels_with_counts = _format_labels( gold_train_data["cats"].most_common(), counts=True ) - msg.text("New: {}".format(labels_with_counts), show=verbose) + msg.text(f"New: {labels_with_counts}", show=verbose) if existing_labels: - msg.text( - "Existing: {}".format(_format_labels(existing_labels)), show=verbose - ) + msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose) if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]): msg.fail( - "The train and dev labels are not the same. " - "Train labels: {}. " - "Dev labels: {}.".format( - _format_labels(gold_train_data["cats"]), - _format_labels(gold_dev_data["cats"]), - ) + f"The train and dev labels are not the same. " + f"Train labels: {_format_labels(gold_train_data['cats'])}. " + f"Dev labels: {_format_labels(gold_dev_data['cats'])}." ) if gold_train_data["n_cats_multilabel"] > 0: msg.info( @@ -338,27 +292,16 @@ def debug_data( msg.divider("Part-of-speech Tagging") labels = [label for label in gold_train_data["tags"]] tag_map = nlp.vocab.morphology.tag_map - msg.info( - "{} {} in data ({} {} in tag map)".format( - len(labels), - "label" if len(labels) == 1 else "labels", - len(tag_map), - "label" if len(tag_map) == 1 else "labels", - ) - ) + msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)") labels_with_counts = _format_labels( gold_train_data["tags"].most_common(), counts=True ) msg.text(labels_with_counts, show=verbose) non_tagmap = [l for l in labels if l not in tag_map] if not non_tagmap: - msg.good("All labels present in tag map for language '{}'".format(nlp.lang)) + msg.good(f"All labels present in tag map for language '{nlp.lang}'") for label in non_tagmap: - msg.fail( - "Label '{}' not found in tag map for language '{}'".format( - label, nlp.lang - ) - ) + msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'") if "parser" in pipeline: has_low_data_warning = False @@ -366,21 +309,18 @@ def debug_data( # profile sentence 
length msg.info( - "Found {} sentence{} with an average length of {:.1f} words.".format( - gold_train_data["n_sents"], - "s" if len(train_dataset) > 1 else "", - gold_train_data["n_words"] / gold_train_data["n_sents"], - ) + f"Found {gold_train_data['n_sents']} sentence(s) with an average " + f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words." ) # check for documents with multiple sentences sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"]) if sents_per_doc < 1.1: msg.warn( - "The training data contains {:.2f} sentences per " - "document. When there are very few documents containing more " - "than one sentence, the parser will not learn how to segment " - "longer texts into sentences.".format(sents_per_doc) + f"The training data contains {sents_per_doc:.2f} sentences per " + f"document. When there are very few documents containing more " + f"than one sentence, the parser will not learn how to segment " + f"longer texts into sentences." 
) # profile labels @@ -391,32 +331,13 @@ def debug_data( labels_dev = [label for label in gold_dev_data["deps"]] if gold_train_unpreprocessed_data["n_nonproj"] > 0: - msg.info( - "Found {} nonprojective train sentence{}".format( - gold_train_unpreprocessed_data["n_nonproj"], - "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "", - ) - ) + n_nonproj = gold_train_unpreprocessed_data["n_nonproj"] + msg.info(f"Found {n_nonproj} nonprojective train sentence(s)") if gold_dev_data["n_nonproj"] > 0: - msg.info( - "Found {} nonprojective dev sentence{}".format( - gold_dev_data["n_nonproj"], - "s" if gold_dev_data["n_nonproj"] > 1 else "", - ) - ) - - msg.info( - "{} {} in train data".format( - len(labels_train_unpreprocessed), - "label" if len(labels_train) == 1 else "labels", - ) - ) - msg.info( - "{} {} in projectivized train data".format( - len(labels_train), "label" if len(labels_train) == 1 else "labels" - ) - ) - + n_nonproj = gold_dev_data["n_nonproj"] + msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)") + msg.info(f"{labels_train_unpreprocessed} label(s) in train data") + msg.info(f"{len(labels_train)} label(s) in projectivized train data") labels_with_counts = _format_labels( gold_train_unpreprocessed_data["deps"].most_common(), counts=True ) @@ -426,9 +347,8 @@ def debug_data( for label in gold_train_unpreprocessed_data["deps"]: if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD: msg.warn( - "Low number of examples for label '{}' ({})".format( - label, gold_train_unpreprocessed_data["deps"][label] - ) + f"Low number of examples for label '{label}' " + f"({gold_train_unpreprocessed_data['deps'][label]})" ) has_low_data_warning = True @@ -437,22 +357,19 @@ def debug_data( for label in gold_train_data["deps"]: if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label: rare_projectivized_labels.append( - "{}: {}".format(label, str(gold_train_data["deps"][label])) + f"{label}: {gold_train_data['deps'][label]}" 
) if len(rare_projectivized_labels) > 0: msg.warn( - "Low number of examples for {} label{} in the " - "projectivized dependency trees used for training. You may " - "want to projectivize labels such as punct before " - "training in order to improve parser performance.".format( - len(rare_projectivized_labels), - "s" if len(rare_projectivized_labels) > 1 else "", - ) + f"Low number of examples for {len(rare_projectivized_labels)} " + "label(s) in the projectivized dependency trees used for " + "training. You may want to projectivize labels such as punct " + "before training in order to improve parser performance." ) msg.warn( - "Projectivized labels with low numbers of examples: " - "{}".format("\n".join(rare_projectivized_labels)), + f"Projectivized labels with low numbers of examples: ", + ", ".join(rare_projectivized_labels), show=verbose, ) has_low_data_warning = True @@ -460,50 +377,44 @@ def debug_data( # labels only in train if set(labels_train) - set(labels_dev): msg.warn( - "The following labels were found only in the train data: " - "{}".format(", ".join(set(labels_train) - set(labels_dev))), + "The following labels were found only in the train data:", + ", ".join(set(labels_train) - set(labels_dev)), show=verbose, ) # labels only in dev if set(labels_dev) - set(labels_train): msg.warn( - "The following labels were found only in the dev data: " - + ", ".join(set(labels_dev) - set(labels_train)), + "The following labels were found only in the dev data:", + ", ".join(set(labels_dev) - set(labels_train)), show=verbose, ) if has_low_data_warning: msg.text( - "To train a parser, your data should include at " - "least {} instances of each label.".format(DEP_LABEL_THRESHOLD), + f"To train a parser, your data should include at " + f"least {DEP_LABEL_THRESHOLD} instances of each label.", show=verbose, ) # multiple root labels if len(gold_train_unpreprocessed_data["roots"]) > 1: msg.warn( - "Multiple root labels ({}) ".format( - ", 
".join(gold_train_unpreprocessed_data["roots"]) - ) - + "found in training data. spaCy's parser uses a single root " - "label ROOT so this distinction will not be available." + f"Multiple root labels " + f"({', '.join(gold_train_unpreprocessed_data['roots'])}) " + f"found in training data. spaCy's parser uses a single root " + f"label ROOT so this distinction will not be available." ) # these should not happen, but just in case if gold_train_data["n_nonproj"] > 0: msg.fail( - "Found {} nonprojective projectivized train sentence{}".format( - gold_train_data["n_nonproj"], - "s" if gold_train_data["n_nonproj"] > 1 else "", - ) + f"Found {gold_train_data['n_nonproj']} nonprojective " + f"projectivized train sentence(s)" ) if gold_train_data["n_cycles"] > 0: msg.fail( - "Found {} projectivized train sentence{} with cycles".format( - gold_train_data["n_cycles"], - "s" if gold_train_data["n_cycles"] > 1 else "", - ) + f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles" ) msg.divider("Summary") @@ -511,36 +422,28 @@ def debug_data( warn_counts = msg.counts[MESSAGES.WARN] fail_counts = msg.counts[MESSAGES.FAIL] if good_counts: - msg.good( - "{} {} passed".format( - good_counts, "check" if good_counts == 1 else "checks" - ) - ) + msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed") if warn_counts: - msg.warn( - "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings") - ) - if fail_counts: - msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors")) - + msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}") if fail_counts: + msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}") sys.exit(1) def _load_file(file_path, msg): file_name = file_path.parts[-1] if file_path.suffix == ".json": - with msg.loading("Loading {}...".format(file_name)): + with msg.loading(f"Loading {file_name}..."): data = srsly.read_json(file_path) - msg.good("Loaded 
{}".format(file_name)) + msg.good(f"Loaded {file_name}") return data elif file_path.suffix == ".jsonl": - with msg.loading("Loading {}...".format(file_name)): + with msg.loading(f"Loading {file_name}..."): data = srsly.read_jsonl(file_path) - msg.good("Loaded {}".format(file_name)) + msg.good(f"Loaded {file_name}") return data msg.fail( - "Can't load file extension {}".format(file_path.suffix), + f"Can't load file extension {file_path.suffix}", "Expected .json or .jsonl", exits=1, ) @@ -604,14 +507,18 @@ def _compile_gold(examples, pipeline): def _format_labels(labels, counts=False): if counts: - return ", ".join(["'{}' ({})".format(l, c) for l, c in labels]) - return ", ".join(["'{}'".format(l) for l in labels]) + return ", ".join([f"'{l}' ({c})" for l, c in labels]) + return ", ".join([f"'{l}'" for l in labels]) def _get_examples_without_label(data, label): count = 0 for ex in data: - labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)] + labels = [ + label.split("-")[1] + for label in ex.gold.ner + if label not in ("O", "-", None) + ] if label not in labels: count += 1 return count diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 19f3e7860..7c87a582a 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import plac import requests import os @@ -50,7 +47,7 @@ def download(model, direct=False, *pip_args): sys.exit(dl) msg.good( "Download and installation successful", - "You can now load the model via spacy.load('{}')".format(model_name), + f"You can now load the model via spacy.load('{model_name}')", ) # Only create symlink if the model is installed via a shortcut like 'en'. # There's no real advantage over an additional symlink for en_core_web_sm @@ -69,10 +66,10 @@ def download(model, direct=False, *pip_args): # message and loading instructions, even if linking fails. 
msg.warn( "Download successful but linking failed", - "Creating a shortcut link for '{}' didn't work (maybe you " - "don't have admin permissions?), but you can still load " - "the model via its full package name: " - "nlp = spacy.load('{}')".format(model, model_name), + f"Creating a shortcut link for '{model}' didn't work (maybe you " + f"don't have admin permissions?), but you can still load " + f"the model via its full package name: " + f"nlp = spacy.load('{model_name}')", ) # If a model is downloaded and then loaded within the same process, our # is_package check currently fails, because pkg_resources.working_set @@ -95,11 +92,11 @@ def get_json(url, desc): r = requests.get(url) if r.status_code != 200: msg.fail( - "Server error ({})".format(r.status_code), - "Couldn't fetch {}. Please find a model for your spaCy " - "installation (v{}), and download it manually. For more " - "details, see the documentation: " - "https://spacy.io/usage/models".format(desc, about.__version__), + f"Server error ({r.status_code})", + f"Couldn't fetch {desc}. Please find a model for your spaCy " + f"installation (v{about.__version__}), and download it manually. 
" + f"For more details, see the documentation: " + f"https://spacy.io/usage/models", exits=1, ) return r.json() @@ -111,7 +108,7 @@ def get_compatibility(): comp_table = get_json(about.__compatibility__, "compatibility table") comp = comp_table["spacy"] if version not in comp: - msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1) + msg.fail(f"No compatible models found for v{version} of spaCy", exits=1) return comp[version] @@ -119,8 +116,8 @@ def get_version(model, comp): model = model.rsplit(".dev", 1)[0] if model not in comp: msg.fail( - "No compatible model found for '{}' " - "(spaCy v{}).".format(model, about.__version__), + f"No compatible model found for '{model}' " + f"(spaCy v{about.__version__}).", exits=1, ) return comp[model][0] diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index da8a714a7..de2cb4d09 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals, division, print_function - import plac from timeit import default_timer as timer from wasabi import msg @@ -79,7 +76,7 @@ def evaluate( deps=render_deps, ents=render_ents, ) - msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path) + msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) if return_scores: return scorer.scores diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 080d0dc77..060a38e78 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,13 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals - import plac import platform from pathlib import Path from wasabi import msg import srsly -from ..compat import path2str, basestring_, unicode_ from .. import util from .. 
import about @@ -33,12 +29,12 @@ def info(model=None, markdown=False, silent=False): msg.fail("Can't find model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) if model_path.resolve() != model_path: - meta["link"] = path2str(model_path) - meta["source"] = path2str(model_path.resolve()) + meta["link"] = str(model_path) + meta["source"] = str(model_path.resolve()) else: - meta["source"] = path2str(model_path) + meta["source"] = str(model_path) if not silent: - title = "Info about model '{}'".format(model) + title = f"Info about model '{model}'" model_meta = { k: v for k, v in meta.items() if k not in ("accuracy", "speed") } @@ -49,7 +45,7 @@ def info(model=None, markdown=False, silent=False): return meta data = { "spaCy version": about.__version__, - "Location": path2str(Path(__file__).parent.parent), + "Location": str(Path(__file__).parent.parent), "Platform": platform.platform(), "Python version": platform.python_version(), "Models": list_models(), @@ -84,9 +80,9 @@ def print_markdown(data, title=None): """ markdown = [] for key, value in data.items(): - if isinstance(value, basestring_) and Path(value).exists(): + if isinstance(value, str) and Path(value).exists(): continue - markdown.append("* **{}:** {}".format(key, unicode_(value))) + markdown.append(f"* **{key}:** {value}") if title: - print("\n## {}".format(title)) + print(f"\n## {title}") print("\n{}\n".format("\n".join(markdown))) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 3fa0cc890..c3ef5267c 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import plac import math from tqdm import tqdm @@ -91,8 +88,7 @@ def init_model( vec_added = len(nlp.vocab.vectors) lex_added = len(nlp.vocab) msg.good( - "Sucessfully compiled vocab", - "{} entries, {} vectors".format(lex_added, vec_added), + "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors", ) if not 
output_dir.exists(): output_dir.mkdir() @@ -177,9 +173,9 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None): nlp.vocab.vectors.add(lex.orth, row=lex.rank) else: if vectors_loc: - with msg.loading("Reading vectors from {}".format(vectors_loc)): + with msg.loading(f"Reading vectors from {vectors_loc}"): vectors_data, vector_keys = read_vectors(vectors_loc) - msg.good("Loaded vectors from {}".format(vectors_loc)) + msg.good(f"Loaded vectors from {vectors_loc}") else: vectors_data, vector_keys = (None, None) if vector_keys is not None: diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 8117829b5..df24adc23 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -1,11 +1,8 @@ -# coding: utf8 -from __future__ import unicode_literals - import plac from pathlib import Path from wasabi import msg -from ..compat import symlink_to, path2str +from ..compat import symlink_to from .. import util @@ -27,23 +24,23 @@ def link(origin, link_name, force=False, model_path=None): if not model_path.exists(): msg.fail( "Can't locate model data", - "The data should be located in {}".format(path2str(model_path)), + f"The data should be located in {model_path}", exits=1, ) data_path = util.get_data_path() if not data_path or not data_path.exists(): spacy_loc = Path(__file__).parent.parent msg.fail( - "Can't find the spaCy data path to create model symlink", - "Make sure a directory `/data` exists within your spaCy " - "installation and try again. The data directory should be located " - "here:".format(path=spacy_loc), + f"Can't find the spaCy data path to create model symlink", + f"Make sure a directory `/data` exists within your spaCy " + f"installation and try again. 
The data directory should be located " + f"here: {spacy_loc}", exits=1, ) link_path = util.get_data_path() / link_name if link_path.is_symlink() and not force: msg.fail( - "Link '{}' already exists".format(link_name), + f"Link '{link_name}' already exists", "To overwrite an existing link, use the --force flag", exits=1, ) @@ -54,18 +51,18 @@ def link(origin, link_name, force=False, model_path=None): elif link_path.exists(): # does it exist otherwise? # NB: Check this last because valid symlinks also "exist". msg.fail( - "Can't overwrite symlink '{}'".format(link_name), + f"Can't overwrite symlink '{link_name}'", "This can happen if your data directory contains a directory or " "file of the same name.", exits=1, ) - details = "%s --> %s" % (path2str(model_path), path2str(link_path)) + details = f"{model_path} --> {link_path}" try: symlink_to(link_path, model_path) except: # noqa: E722 # This is quite dirty, but just making sure other errors are caught. msg.fail( - "Couldn't link model to '{}'".format(link_name), + f"Couldn't link model to '{link_name}'", "Creating a symlink in spacy/data failed. Make sure you have the " "required permissions and try re-running the command as admin, or " "use a virtualenv. You can still import the model as a module and " @@ -74,4 +71,4 @@ def link(origin, link_name, force=False, model_path=None): msg.text(details) raise msg.good("Linking successful", details) - msg.text("You can now load the model via spacy.load('{}')".format(link_name)) + msg.text(f"You can now load the model via spacy.load('{link_name}')") diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8ed92259c..8830a0ca2 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,13 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals - import plac import shutil from pathlib import Path from wasabi import msg, get_raw_input import srsly -from ..compat import path2str from .. import util from .. 
import about @@ -47,7 +43,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals for key in ("lang", "name", "version"): if key not in meta or meta[key] == "": msg.fail( - "No '{}' setting found in meta.json".format(key), + f"No '{key}' setting found in meta.json", "This setting is required to build your package.", exits=1, ) @@ -58,22 +54,21 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals if package_path.exists(): if force: - shutil.rmtree(path2str(package_path)) + shutil.rmtree(str(package_path)) else: msg.fail( "Package directory already exists", "Please delete the directory and try again, or use the " - "`--force` flag to overwrite existing " - "directories.".format(path=path2str(package_path)), + "`--force` flag to overwrite existing directories.", exits=1, ) Path.mkdir(package_path, parents=True) - shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) + shutil.copytree(str(input_path), str(package_path / model_name_v)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(package_path / "__init__.py", TEMPLATE_INIT) - msg.good("Successfully created package '{}'".format(model_name_v), main_path) + msg.good(f"Successfully created package '{model_name_v}'", main_path) msg.text("To build the package, run `python setup.py sdist` in this directory.") @@ -118,9 +113,6 @@ def generate_meta(model_path, existing_meta, msg): TEMPLATE_SETUP = """ #!/usr/bin/env python -# coding: utf8 -from __future__ import unicode_literals - import io import json from os import path, walk @@ -190,9 +182,6 @@ include meta.json TEMPLATE_INIT = """ -# coding: utf8 -from __future__ import unicode_literals - from pathlib import Path from spacy.util import load_model_from_init_py, get_model_meta diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 
68038bc5c..75840923e 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import print_function, unicode_literals - import plac import random import numpy @@ -154,9 +151,9 @@ def pretrain( msg.text("Reading input text from stdin...") texts = srsly.read_jsonl("-") - with msg.loading("Loading model '{}'...".format(vectors_model)): + with msg.loading(f"Loading model '{vectors_model}'..."): nlp = util.load_model(vectors_model) - msg.good("Loaded model '{}'".format(vectors_model)) + msg.good(f"Loaded model '{vectors_model}'") pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name model = create_pretraining_model( nlp, @@ -173,7 +170,7 @@ def pretrain( # Load in pretrained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text("Loaded pretrained tok2vec for: {}".format(components)) + msg.text(f"Loaded pretrained tok2vec for: {components}") # Parse the epoch number from the given weight file model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) if model_name: @@ -221,7 +218,9 @@ def pretrain( skip_counter = 0 for epoch in range(epoch_start, n_iter + epoch_start): for batch_id, batch in enumerate( - util.minibatch_by_words((Example(doc=text) for text in texts), size=batch_size) + util.minibatch_by_words( + (Example(doc=text) for text in texts), size=batch_size + ) ): docs, count = make_docs( nlp, @@ -246,7 +245,7 @@ def pretrain( # Reshuffle the texts if texts were loaded from a file random.shuffle(texts) if skip_counter > 0: - msg.warn("Skipped {count} empty values".format(count=str(skip_counter))) + msg.warn(f"Skipped {skip_counter} empty values") msg.good("Successfully finished pretrain") diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 4ee72fc23..f3df0817d 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals, division, print_function - import 
plac import tqdm from pathlib import Path @@ -34,11 +31,11 @@ def profile(model, inputs=None, n_texts=10000): with msg.loading("Loading IMDB dataset via Thinc..."): imdb_train, _ = thinc.extra.datasets.imdb() inputs, _ = zip(*imdb_train) - msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs)) + msg.info(f"Loaded IMDB dataset and using {n_inputs} examples") inputs = inputs[:n_inputs] - with msg.loading("Loading model '{}'...".format(model)): + with msg.loading(f"Loading model '{model}'..."): nlp = load_model(model) - msg.good("Loaded model '{}'".format(model)) + msg.good(f"Loaded model '{model}'") texts = list(itertools.islice(inputs, n_texts)) cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") s = pstats.Stats("Profile.prof") @@ -60,7 +57,7 @@ def _read_inputs(loc, msg): input_path = Path(loc) if not input_path.exists() or not input_path.is_file(): msg.fail("Not a valid input data file", loc, exits=1) - msg.info("Using data from {}".format(input_path.parts[-1])) + msg.info(f"Using data from {input_path.parts[-1]}") file_ = input_path.open() for line in file_: data = srsly.json_loads(line) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index daa90f022..e8662a101 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals, division, print_function - import plac import os import tqdm @@ -12,12 +9,10 @@ import srsly from wasabi import msg import contextlib import random -from collections import OrderedDict from .._ml import create_default_optimizer from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import GoldCorpus -from ..compat import path2str from .. import util from .. import about @@ -148,14 +143,14 @@ def train( # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. 
pipeline = [p.strip() for p in pipeline.split(",")] - msg.text("Training pipeline: {}".format(pipeline)) + msg.text(f"Training pipeline: {pipeline}") if base_model: - msg.text("Starting with base model '{}'".format(base_model)) + msg.text(f"Starting with base model '{base_model}'") nlp = util.load_model(base_model) if nlp.lang != lang: msg.fail( - "Model language ('{}') doesn't match language specified as " - "`lang` argument ('{}') ".format(nlp.lang, lang), + f"Model language ('{nlp.lang}') doesn't match language " + f"specified as `lang` argument ('{lang}') ", exits=1, ) nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline]) @@ -187,15 +182,13 @@ def train( } if base_cfg != pipe_cfg: msg.fail( - "The base textcat model configuration does" - "not match the provided training options. " - "Existing cfg: {}, provided cfg: {}".format( - base_cfg, pipe_cfg - ), + f"The base textcat model configuration does" + f"not match the provided training options. " + f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}", exits=1, ) else: - msg.text("Starting with blank model '{}'".format(lang)) + msg.text(f"Starting with blank model '{lang}'") lang_cls = util.get_lang_class(lang) nlp = lang_cls() for pipe in pipeline: @@ -215,7 +208,7 @@ def train( nlp.vocab.morphology.tag_map.update(tag_map) if vectors: - msg.text("Loading vector from model '{}'".format(vectors)) + msg.text(f"Loading vector from model '{vectors}'") _load_vectors(nlp, vectors) # Multitask objectives @@ -224,15 +217,15 @@ def train( if multitasks: if pipe_name not in pipeline: msg.fail( - "Can't use multitask objective without '{}' in the " - "pipeline".format(pipe_name) + f"Can't use multitask objective without '{pipe_name}' in " + f"the pipeline" ) pipe = nlp.get_pipe(pipe_name) for objective in multitasks.split(","): pipe.add_multitask_objective(objective) # Prepare training corpus - msg.text("Counting training words (limit={})".format(n_examples)) + msg.text(f"Counting training words 
(limit={n_examples})") corpus = GoldCorpus(train_path, dev_path, limit=n_examples) n_train_words = corpus.count_train() @@ -248,22 +241,22 @@ def train( # Load in pretrained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text("Loaded pretrained tok2vec for: {}".format(components)) + msg.text(f"Loaded pretrained tok2vec for: {components}") # Verify textcat config if "textcat" in pipeline: textcat_labels = nlp.get_pipe("textcat").cfg["labels"] if textcat_positive_label and textcat_positive_label not in textcat_labels: msg.fail( - "The textcat_positive_label (tpl) '{}' does not match any " - "label in the training data.".format(textcat_positive_label), + f"The textcat_positive_label (tpl) '{textcat_positive_label}' " + f"does not match any label in the training data.", exits=1, ) if textcat_positive_label and len(textcat_labels) != 2: msg.fail( - "A textcat_positive_label (tpl) '{}' was provided for training " - "data that does not appear to be a binary classification " - "problem with two labels.".format(textcat_positive_label), + "A textcat_positive_label (tpl) '{textcat_positive_label}' was " + "provided for training data that does not appear to be a " + "binary classification problem with two labels.", exits=1, ) train_data = corpus.train_data( @@ -302,20 +295,20 @@ def train( break if base_model and set(textcat_labels) != train_labels: msg.fail( - "Cannot extend textcat model using data with different " - "labels. Base model labels: {}, training data labels: " - "{}.".format(textcat_labels, list(train_labels)), + f"Cannot extend textcat model using data with different " + f"labels. 
Base model labels: {textcat_labels}, training data " + f"labels: {list(train_labels)}", exits=1, ) if textcat_multilabel: msg.text( - "Textcat evaluation score: ROC AUC score macro-averaged across " - "the labels '{}'".format(", ".join(textcat_labels)) + f"Textcat evaluation score: ROC AUC score macro-averaged across " + f"the labels '{', '.join(textcat_labels)}'" ) elif textcat_positive_label and len(textcat_labels) == 2: msg.text( - "Textcat evaluation score: F1-score for the " - "label '{}'".format(textcat_positive_label) + f"Textcat evaluation score: F1-score for the " + f"label '{textcat_positive_label}'" ) elif len(textcat_labels) > 1: if len(textcat_labels) == 2: @@ -325,8 +318,8 @@ def train( "an evaluation on the positive class." ) msg.text( - "Textcat evaluation score: F1-score macro-averaged across " - "the labels '{}'".format(", ".join(textcat_labels)) + f"Textcat evaluation score: F1-score macro-averaged across " + f"the labels '{', '.join(textcat_labels)}'" ) else: msg.fail( @@ -471,8 +464,8 @@ def train( for cat, cat_score in textcats_per_cat.items(): if cat_score.get("roc_auc_score", 0) < 0: msg.warn( - "Textcat ROC AUC score is undefined due to " - "only one value in label '{}'.".format(cat) + f"Textcat ROC AUC score is undefined due to " + f"only one value in label '{cat}'." 
) msg.row(progress, **row_settings) # Early stopping @@ -485,12 +478,10 @@ def train( best_score = current_score if iter_since_best >= n_early_stopping: msg.text( - "Early stopping, best iteration " - "is: {}".format(i - iter_since_best) + f"Early stopping, best iteration is: {i - iter_since_best}" ) msg.text( - "Best score = {}; Final iteration " - "score = {}".format(best_score, current_score) + f"Best score = {best_score}; Final iteration score = {current_score}" ) break finally: @@ -560,11 +551,11 @@ def _collate_best_model(meta, output_path, components): for component in components: bests[component] = _find_best(output_path, component) best_dest = output_path / "model-best" - shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest)) + shutil.copytree(str(output_path / "model-final"), str(best_dest)) for component, best_component_src in bests.items(): - shutil.rmtree(path2str(best_dest / component)) + shutil.rmtree(str(best_dest / component)) shutil.copytree( - path2str(best_component_src / component), path2str(best_dest / component) + str(best_component_src / component), str(best_dest / component) ) accs = srsly.read_json(best_component_src / "accuracy.json") for metric in _get_metrics(component): @@ -627,10 +618,8 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths): if has_beam_widths: row_head.insert(1, "Beam W.") # remove duplicates - row_head_dict = OrderedDict() - row_head_dict.update({k: 1 for k in row_head}) - output_stats_dict = OrderedDict() - output_stats_dict.update({k: 1 for k in output_stats}) + row_head_dict = {k: 1 for k in row_head} + output_stats_dict = {k: 1 for k in output_stats} return row_head_dict.keys(), output_stats_dict.keys() diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 93abad6f6..b4d217f2f 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -1,13 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals, print_function - from pathlib import Path import sys 
import requests import srsly from wasabi import msg -from ..compat import path2str from ..util import get_data_path from .. import about @@ -21,7 +17,7 @@ def validate(): r = requests.get(about.__compatibility__) if r.status_code != 200: msg.fail( - "Server error ({})".format(r.status_code), + f"Server error ({r.status_code})", "Couldn't fetch compatibility table.", exits=1, ) @@ -32,7 +28,7 @@ def validate(): current_compat = compat.get(version) if not current_compat: msg.fail( - "Can't find spaCy v{} in compatibility table".format(version), + f"Can't find spaCy v{version} in compatibility table", about.__compatibility__, exits=1, ) @@ -52,8 +48,8 @@ def validate(): update_models = [m for m in incompat_models if m in current_compat] spacy_dir = Path(__file__).parent.parent - msg.divider("Installed models (spaCy v{})".format(about.__version__)) - msg.info("spaCy installation: {}".format(path2str(spacy_dir))) + msg.divider(f"Installed models (spaCy v{about.__version__})") + msg.info(f"spaCy installation: {spacy_dir}") if model_links or model_pkgs: header = ("TYPE", "NAME", "MODEL", "VERSION", "") @@ -72,15 +68,15 @@ def validate(): print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") if na_models: msg.text( - "The following models are not available for spaCy " - "v{}: {}".format(about.__version__, ", ".join(na_models)) + f"The following models are not available for spaCy " + f"v{about.__version__}: {', '.join(na_models)}" ) if incompat_links: msg.text( - "You may also want to overwrite the incompatible links using the " - "`python -m spacy link` command with `--force`, or remove them " - "from the data directory. " - "Data path: {path}".format(path=path2str(get_data_path())) + f"You may also want to overwrite the incompatible links using the " + f"`python -m spacy link` command with `--force`, or remove them " + f"from the data directory. 
" + f"Data path: {get_data_path()}" ) if incompat_models or incompat_links: sys.exit(1) @@ -128,7 +124,7 @@ def get_model_row(compat, name, data, msg, model_type="package"): version = msg.text(data["version"], color="green", no_print=True) else: version = msg.text(data["version"], color="red", no_print=True) - comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0]) + comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" return (model_type, name, data["name"], version, comp) diff --git a/spacy/compat.py b/spacy/compat.py index 0ea31c6b3..8cb08ae09 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -1,4 +1,3 @@ -# coding: utf8 """ Helpers for Python and platform compatibility. To distinguish them from the builtin functions, replacement functions are suffixed with an underscore, @@ -6,13 +5,8 @@ e.g. `unicode_`. DOCS: https://spacy.io/api/top-level#compat """ -from __future__ import unicode_literals - import os import sys -import itertools -import ast -import types from thinc.neural.util import copy_array @@ -46,45 +40,11 @@ copy_reg = copy_reg CudaStream = CudaStream cupy = cupy copy_array = copy_array -izip = getattr(itertools, "izip", zip) is_windows = sys.platform.startswith("win") is_linux = sys.platform.startswith("linux") is_osx = sys.platform == "darwin" -# See: https://github.com/benjaminp/six/blob/master/six.py -is_python2 = sys.version_info[0] == 2 -is_python3 = sys.version_info[0] == 3 -is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5) - -if is_python2: - bytes_ = str - unicode_ = unicode # noqa: F821 - basestring_ = basestring # noqa: F821 - input_ = raw_input # noqa: F821 - path2str = lambda path: str(path).decode("utf8") - class_types = (type, types.ClassType) - -elif is_python3: - bytes_ = bytes - unicode_ = str - basestring_ = str - input_ = input - path2str = lambda path: str(path) - class_types = (type, types.ClassType) if is_python_pre_3_5 else type - - -def b_to_str(b_str): - """Convert a bytes object to a string. 
- - b_str (bytes): The object to convert. - RETURNS (unicode): The converted string. - """ - if is_python2: - return b_str - # Important: if no encoding is set, string becomes "b'...'" - return str(b_str, encoding="utf8") - def symlink_to(orig, dest): """Create a symlink. Used for model shortcut links. @@ -95,9 +55,7 @@ def symlink_to(orig, dest): if is_windows: import subprocess - subprocess.check_call( - ["mklink", "/d", path2str(orig), path2str(dest)], shell=True - ) + subprocess.check_call(["mklink", "/d", str(orig), str(dest)], shell=True) else: orig.symlink_to(dest) @@ -108,19 +66,17 @@ def symlink_remove(link): link (unicode / Path): The path to the symlink. """ # https://stackoverflow.com/q/26554135/6400719 - if os.path.isdir(path2str(link)) and is_windows: + if os.path.isdir(str(link)) and is_windows: # this should only be on Py2.7 and windows - os.rmdir(path2str(link)) + os.rmdir(str(link)) else: - os.unlink(path2str(link)) + os.unlink(str(link)) -def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): +def is_config(windows=None, linux=None, osx=None, **kwargs): """Check if a specific configuration of Python version and operating system matches the user's setup. Mostly used to display targeted error messages. - python2 (bool): spaCy is executed with Python 2.x. - python3 (bool): spaCy is executed with Python 3.x. windows (bool): spaCy is executed on Windows. linux (bool): spaCy is executed on Linux. osx (bool): spaCy is executed on OS X or macOS. @@ -129,53 +85,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): DOCS: https://spacy.io/api/top-level#compat.is_config """ return ( - python2 in (None, is_python2) - and python3 in (None, is_python3) - and windows in (None, is_windows) + windows in (None, is_windows) and linux in (None, is_linux) and osx in (None, is_osx) ) - - -def import_file(name, loc): - """Import module from a file. Used to load models from a directory. 
- - name (unicode): Name of module to load. - loc (unicode / Path): Path to the file. - RETURNS: The loaded module. - """ - loc = path2str(loc) - if is_python_pre_3_5: - import imp - - return imp.load_source(name, loc) - else: - import importlib.util - - spec = importlib.util.spec_from_file_location(name, str(loc)) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def unescape_unicode(string): - """Python2.7's re module chokes when compiling patterns that have ranges - between escaped unicode codepoints if the two codepoints are unrecognised - in the unicode database. For instance: - - re.compile('[\\uAA77-\\uAA79]').findall("hello") - - Ends up matching every character (on Python 2). This problem doesn't occur - if we're dealing with unicode literals. - """ - if string is None: - return string - # We only want to unescape the unicode, so we first must protect the other - # backslashes. - string = string.replace("\\", "\\\\") - # Now we remove that protection for the unicode. - string = string.replace("\\\\u", "\\u") - string = string.replace("\\\\U", "\\U") - # Now we unescape by evaling the string with the AST. This can't execute - # code -- it only does the representational level. - return ast.literal_eval("u'''" + string + "'''") diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index d2ef21dbd..d804757ef 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -1,15 +1,11 @@ -# coding: utf8 """ spaCy's built in visualization suite for dependencies and named entities. 
DOCS: https://spacy.io/api/top-level#displacy USAGE: https://spacy.io/usage/visualizers """ -from __future__ import unicode_literals - from .render import DependencyRenderer, EntityRenderer from ..tokens import Doc, Span -from ..compat import b_to_str from ..errors import Errors, Warnings, user_warning from ..util import is_in_jupyter @@ -92,20 +88,20 @@ def serve( render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server(host, port, app) - print("\nUsing the '{}' visualizer".format(style)) - print("Serving on http://{}:{} ...\n".format(host, port)) + print(f"\nUsing the '{style}' visualizer") + print(f"Serving on http://{host}:{port} ...\n") try: httpd.serve_forever() except KeyboardInterrupt: - print("Shutting down server on port {}.".format(port)) + print(f"Shutting down server on port {port}.") finally: httpd.server_close() def app(environ, start_response): # Headers and status need to be bytes in Python 2, see #1227 - headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))] - start_response(b_to_str(b"200 OK"), headers) + headers = [("Content-type", "text/html; charset=utf-8")] + start_response("200 OK", headers) res = _html["parsed"].encode(encoding="utf-8") return [res] diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index d6e33437b..7ca1eebb7 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import uuid from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS @@ -55,7 +52,7 @@ class DependencyRenderer(object): settings = p.get("settings", {}) self.direction = settings.get("direction", DEFAULT_DIR) self.lang = settings.get("lang", DEFAULT_LANG) - render_id = "{}-{}".format(id_prefix, i) + render_id = f"{id_prefix}-{i}" svg = self.render_svg(render_id, p["words"], p["arcs"]) rendered.append(svg) if page: diff --git a/spacy/displacy/templates.py 
b/spacy/displacy/templates.py index ade75d1d6..d6970aa2f 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Setting explicit height and max-width: none on the SVG is required for # Jupyter to render it properly in a cell diff --git a/spacy/errors.py b/spacy/errors.py index 3dab4e1fb..81747b33b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import os import warnings import inspect @@ -12,7 +9,7 @@ def add_codes(err_cls): class ErrorsWithCodes(object): def __getattribute__(self, code): msg = getattr(err_cls, code) - return "[{code}] {msg}".format(code=code, msg=msg) + return f"[{code}] {msg}" return ErrorsWithCodes() @@ -98,8 +95,6 @@ class Warnings(object): "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. " "If this is surprising, make sure you have the spacy-lookups-data " "package installed.") - W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " - "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " @@ -550,6 +545,7 @@ class Errors(object): E999 = ("Encountered an unexpected format for the dictionary holding " "gold annotations: {gold_dict}") + @add_codes class TempErrors(object): T003 = ("Resizing pretrained Tagger models is not currently supported.") @@ -573,10 +569,10 @@ class MatchPatternError(ValueError): errors (dict): Validation errors (sequence of strings) mapped to pattern ID, i.e. the index of the added pattern. 
""" - msg = "Invalid token patterns for matcher rule '{}'\n".format(key) + msg = f"Invalid token patterns for matcher rule '{key}'\n" for pattern_idx, error_msgs in errors.items(): - pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs]) - msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors) + pattern_errors = "\n".join([f"- {e}" for e in error_msgs]) + msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n" ValueError.__init__(self, msg) diff --git a/spacy/glossary.py b/spacy/glossary.py index 44a8277da..5e7e531a9 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - def explain(term): """Get a description for a given POS tag, dependency label or entity type. diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 0374825dc..e3af40d4d 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,7 +1,4 @@ # cython: profile=True -# coding: utf8 -from __future__ import unicode_literals, print_function - import re import random import numpy @@ -14,7 +11,6 @@ import srsly from .syntax import nonproj from .tokens import Doc, Span from .errors import Errors, AlignmentError, user_warning, Warnings -from .compat import path2str, basestring_ from . 
import util @@ -157,7 +153,7 @@ class GoldCorpus(object): self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) def __del__(self): - shutil.rmtree(path2str(self.tmp_dir)) + shutil.rmtree(self.tmp_dir) @staticmethod def write_msgpack(directory, examples, limit=0): @@ -167,7 +163,7 @@ class GoldCorpus(object): for i, example in enumerate(examples): ex_dict = example.to_dict() text = example.text - srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict)) + srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) n += 1 if limit and n >= limit: break @@ -221,7 +217,7 @@ class GoldCorpus(object): examples = [Example.from_dict(ex_dict, doc=text)] else: supported = ("json", "jsonl", "msg") - raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported)) + raise ValueError(Errors.E124.format(path=loc, formats=supported)) for example in examples: yield example i += 1 @@ -862,7 +858,7 @@ cdef class Example: converted_examples = [] for ex in examples: # convert string to Doc to Example - if isinstance(ex, basestring_): + if isinstance(ex, str): if keep_raw_text: converted_examples.append(Example(doc=ex)) else: @@ -876,7 +872,7 @@ cdef class Example: doc, gold = ex gold_dict = {} # convert string to Doc - if isinstance(doc, basestring_) and not keep_raw_text: + if isinstance(doc, str) and not keep_raw_text: doc = make_doc(doc) # convert dict to GoldParse if isinstance(gold, dict): @@ -988,7 +984,7 @@ cdef class GoldParse: # Translate the None values to '-', to make processing easier. # See Issue #2603 entities = [(ent if ent is not None else "-") for ent in entities] - if not isinstance(entities[0], basestring_): + if not isinstance(entities[0], str): # Assume we have entities specified by character offset. 
entities = biluo_tags_from_offsets(doc, entities) @@ -1107,7 +1103,7 @@ cdef class GoldParse: cycle = nonproj.contains_cycle(self.heads) if cycle is not None: raise ValueError(Errors.E069.format(cycle=cycle, - cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), + cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]), doc_tokens=" ".join(words[:50]))) def __len__(self): diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 63eb41b42..1129fa860 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,22 +1,17 @@ # cython: infer_types=True # cython: profile=True -# coding: utf8 -from spacy.errors import Errors, Warnings, user_warning - from pathlib import Path from cymem.cymem cimport Pool from preshed.maps cimport PreshMap - from cpython.exc cimport PyErr_SetFromErrno - from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t - -from .typedefs cimport hash_t - from os import path from libcpp.vector cimport vector +from .typedefs cimport hash_t +from .errors import Errors, Warnings, user_warning + cdef class Candidate: """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved @@ -584,5 +579,3 @@ cdef class Reader: cdef int _read(self, void* value, size_t size) except -1: status = fread(value, size, 1, self._fp) return status - - diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py index 90ea324f0..0da123419 100644 --- a/spacy/lang/af/__init__.py +++ b/spacy/lang/af/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py index 2b3bcc019..dfd144de9 100644 --- a/spacy/lang/af/stop_words.py +++ b/spacy/lang/af/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: 
https://github.com/stopwords-iso/stopwords-af diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py index c120703f6..6a1a8af3a 100644 --- a/spacy/lang/ar/__init__.py +++ b/spacy/lang/ar/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/ar/examples.py b/spacy/lang/ar/examples.py index 2a10f4fcc..a51bb9ded 100644 --- a/spacy/lang/ar/examples.py +++ b/spacy/lang/ar/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ar/lex_attrs.py b/spacy/lang/ar/lex_attrs.py index 19e7aef8a..54ad7a8c3 100644 --- a/spacy/lang/ar/lex_attrs.py +++ b/spacy/lang/ar/lex_attrs.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from ...attrs import LIKE_NUM _num_words = set( diff --git a/spacy/lang/ar/punctuation.py b/spacy/lang/ar/punctuation.py index 6625c5475..f30204c02 100644 --- a/spacy/lang/ar/punctuation.py +++ b/spacy/lang/ar/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER diff --git a/spacy/lang/ar/stop_words.py b/spacy/lang/ar/stop_words.py index de2fc7443..f4da54dda 100644 --- a/spacy/lang/ar/stop_words.py +++ b/spacy/lang/ar/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ من diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py index 030daecd5..a11f3b43a 100644 --- a/spacy/lang/ar/tokenizer_exceptions.py +++ b/spacy/lang/ar/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/bg/__init__.py 
b/spacy/lang/bg/__init__.py index 9b4c647e3..437feb9ed 100644 --- a/spacy/lang/bg/__init__.py +++ b/spacy/lang/bg/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/bg/examples.py b/spacy/lang/bg/examples.py index b08b8926d..a6d40da1a 100644 --- a/spacy/lang/bg/examples.py +++ b/spacy/lang/bg/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py index e7c65cbc2..45a252bc9 100644 --- a/spacy/lang/bg/stop_words.py +++ b/spacy/lang/bg/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/Alir3z4/stop-words diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index e70232552..901676554 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .tag_map import TAG_MAP diff --git a/spacy/lang/bn/examples.py b/spacy/lang/bn/examples.py index 2d5bdb238..051e59d84 100644 --- a/spacy/lang/bn/examples.py +++ b/spacy/lang/bn/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/bn/morph_rules.py b/spacy/lang/bn/morph_rules.py index 21a76c7e6..44d6108e9 100644 --- a/spacy/lang/bn/morph_rules.py +++ b/spacy/lang/bn/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py index f624b4ba4..becfe8d2a 100644 --- a/spacy/lang/bn/punctuation.py +++ b/spacy/lang/bn/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py index 6c9967df8..6bcd06b37 100644 --- a/spacy/lang/bn/stop_words.py +++ b/spacy/lang/bn/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/bn/tag_map.py b/spacy/lang/bn/tag_map.py index 1efb35858..36d69ccf9 100644 --- a/spacy/lang/bn/tag_map.py +++ b/spacy/lang/bn/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py index 32acb1730..18e313a25 100644 --- a/spacy/lang/bn/tokenizer_exceptions.py +++ b/spacy/lang/bn/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding=utf-8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 6d4c00a6b..a1ff2f2df 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words 
import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/ca/examples.py b/spacy/lang/ca/examples.py index 3020ee707..3fbf1fb0a 100644 --- a/spacy/lang/ca/examples.py +++ b/spacy/lang/ca/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ca/lex_attrs.py b/spacy/lang/ca/lex_attrs.py index 6314efa92..be8b7a6ea 100644 --- a/spacy/lang/ca/lex_attrs.py +++ b/spacy/lang/ca/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py index 4439376c8..d50b75589 100644 --- a/spacy/lang/ca/punctuation.py +++ b/spacy/lang/ca/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_INFIXES from ..char_classes import ALPHA diff --git a/spacy/lang/ca/stop_words.py b/spacy/lang/ca/stop_words.py index a803db2a5..1a87b2f9d 100644 --- a/spacy/lang/ca/stop_words.py +++ b/spacy/lang/ca/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò diff --git a/spacy/lang/ca/tag_map.py b/spacy/lang/ca/tag_map.py index 472e772ef..1ecbddc49 100644 --- a/spacy/lang/ca/tag_map.py +++ b/spacy/lang/ca/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index d95e5e626..5a9d9055a 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - 
from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 2c8823867..73f48e49a 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - split_chars = lambda char: list(char.strip().split(" ")) merge_chars = lambda char: char.strip().replace(" ", "|") group_chars = lambda char: char.strip().replace(" ", "") diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py index 5b1397ba2..a27e3339d 100644 --- a/spacy/lang/cs/__init__.py +++ b/spacy/lang/cs/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py index 59d3c102e..e8171a7e5 100644 --- a/spacy/lang/cs/stop_words.py +++ b/spacy/lang/cs/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/Alir3z4/stop-words diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index ac8c04954..2828c014b 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py index b535191a1..e5c6448f0 100644 --- a/spacy/lang/da/examples.py +++ b/spacy/lang/da/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/da/lex_attrs.py b/spacy/lang/da/lex_attrs.py index 9fefc1eba..403af686c 100644 --- a/spacy/lang/da/lex_attrs.py +++ b/spacy/lang/da/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/da/morph_rules.py b/spacy/lang/da/morph_rules.py index 7ffe2ac6f..06704f482 100644 --- a/spacy/lang/da/morph_rules.py +++ b/spacy/lang/da/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA # Source: Danish Universal Dependencies and http://fjern-uv.dk/pronom.php diff --git a/spacy/lang/da/norm_exceptions.py b/spacy/lang/da/norm_exceptions.py index dbffdb88b..c689500f4 100644 --- a/spacy/lang/da/norm_exceptions.py +++ b/spacy/lang/da/norm_exceptions.py @@ -1,10 +1,7 @@ -# coding: utf8 """ Special-case rules for normalizing tokens to improve the model's predictions. For example 'mysterium' vs 'mysterie' and similar. """ -from __future__ import unicode_literals - # Sources: # 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/ diff --git a/spacy/lang/da/punctuation.py b/spacy/lang/da/punctuation.py index b6b852c55..e050ab7aa 100644 --- a/spacy/lang/da/punctuation.py +++ b/spacy/lang/da/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/da/stop_words.py b/spacy/lang/da/stop_words.py index 48de0c7ca..05b2084dd 100644 --- a/spacy/lang/da/stop_words.py +++ b/spacy/lang/da/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Source: Handpicked by Jens Dahl Møllerhøj. 
STOP_WORDS = set( diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index d669fb981..64eba819f 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -1,11 +1,7 @@ -# encoding: utf8 """ Tokenizer Exceptions. Source: https://forkortelse.dk/ and various others. """ - -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1412f033a..8478b6f23 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py index 0c64a693a..530ece629 100644 --- a/spacy/lang/de/examples.py +++ b/spacy/lang/de/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/de/norm_exceptions.py b/spacy/lang/de/norm_exceptions.py index 3dbd4c7e3..6ad5b62a7 100644 --- a/spacy/lang/de/norm_exceptions.py +++ b/spacy/lang/de/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Here we only want to include the absolute most common words. Otherwise, # this list would get impossibly long for German – especially considering the # old vs. new spelling rules, and all possible cases. 
diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index 7dfa61bd4..72f7e1022 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py index cf3204d5e..df708e22e 100644 --- a/spacy/lang/de/stop_words.py +++ b/spacy/lang/de/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index 89d784a0c..410d2f0b4 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py index c169501a9..ca7ec61f1 100644 --- a/spacy/lang/de/tag_map.py +++ b/spacy/lang/de/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, VERB diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 5b09a0b89..3dd8507bc 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 16863e6d7..1ef7c503f 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 
from .tag_map_general import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/el/examples.py b/spacy/lang/el/examples.py index 521e7b30d..62515c07a 100644 --- a/spacy/lang/el/examples.py +++ b/spacy/lang/el/examples.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.el.examples import sentences diff --git a/spacy/lang/el/get_pos_from_wiktionary.py b/spacy/lang/el/get_pos_from_wiktionary.py index f41833974..01deb23a2 100644 --- a/spacy/lang/el/get_pos_from_wiktionary.py +++ b/spacy/lang/el/get_pos_from_wiktionary.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - def get_pos_from_wiktionary(): import re diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py index 6f5b3999b..cf3a7fe97 100644 --- a/spacy/lang/el/lemmatizer.py +++ b/spacy/lang/el/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer diff --git a/spacy/lang/el/lex_attrs.py b/spacy/lang/el/lex_attrs.py index cf32fe12c..5c8f96848 100644 --- a/spacy/lang/el/lex_attrs.py +++ b/spacy/lang/el/lex_attrs.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/el/norm_exceptions.py b/spacy/lang/el/norm_exceptions.py index d4384ff3c..d540aae2c 100644 --- a/spacy/lang/el/norm_exceptions.py +++ b/spacy/lang/el/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # These exceptions are used to add NORM values based on a token's ORTH value. # Norms are only set if no alternative is provided in the tokenizer exceptions. 
diff --git a/spacy/lang/el/punctuation.py b/spacy/lang/el/punctuation.py index fbf773f4d..2d5690407 100644 --- a/spacy/lang/el/punctuation.py +++ b/spacy/lang/el/punctuation.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS from ..char_classes import CONCAT_QUOTES, CURRENCY diff --git a/spacy/lang/el/stop_words.py b/spacy/lang/el/stop_words.py index f13c47ec2..8484826d1 100644 --- a/spacy/lang/el/stop_words.py +++ b/spacy/lang/el/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Stop words # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0 diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 5dfd44f07..988a36c80 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/el/tag_map.py b/spacy/lang/el/tag_map.py index b346299bc..adfacd025 100644 --- a/spacy/lang/el/tag_map.py +++ b/spacy/lang/el/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, PRON, AUX diff --git a/spacy/lang/el/tag_map_general.py b/spacy/lang/el/tag_map_general.py index 42e64a013..d7e89d43a 100644 --- a/spacy/lang/el/tag_map_general.py +++ b/spacy/lang/el/tag_map_general.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ...symbols import PUNCT, NUM, AUX, X, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/el/tokenizer_exceptions.py 
b/spacy/lang/el/tokenizer_exceptions.py index a3c36542e..27ae1fe3a 100644 --- a/spacy/lang/el/tokenizer_exceptions.py +++ b/spacy/lang/el/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index fca4e01e7..fa01e2b60 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .tag_map import TAG_MAP diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py index 946289c7c..0363a45e7 100644 --- a/spacy/lang/en/examples.py +++ b/spacy/lang/en/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/en/lex_attrs.py b/spacy/lang/en/lex_attrs.py index f92d41139..96fb4c9fa 100644 --- a/spacy/lang/en/lex_attrs.py +++ b/spacy/lang/en/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index 5ed4eac59..aa3e6ce57 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA # Several entries here look pretty suspicious. 
These will get the POS SCONJ diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py index a2cf58b8a..431d9c049 100644 --- a/spacy/lang/en/norm_exceptions.py +++ b/spacy/lang/en/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Slang and abbreviations diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index 3505b13bf..4573c9411 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Stop words STOP_WORDS = set( diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index ed665ef29..86695cf6f 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index ecb3103cc..2078798f7 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index c45197771..776948c28 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 80cc1727c..060bd8fc6 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import 
STOP_WORDS diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py index 0e31b56af..1c1ad631b 100644 --- a/spacy/lang/es/examples.py +++ b/spacy/lang/es/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py index 03ada1f43..d2a3c891a 100644 --- a/spacy/lang/es/lex_attrs.py +++ b/spacy/lang/es/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py index 20e929b48..3d46a88cb 100644 --- a/spacy/lang/es/stop_words.py +++ b/spacy/lang/es/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 6a78d86f7..e998cd1d6 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON, VERB, AUX diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py index 7a7c9d549..1748162c0 100644 --- a/spacy/lang/es/tag_map.py +++ b/spacy/lang/es/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 9109d658b..1cd5941be 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py index 
d84c081ef..e0b0a8a87 100644 --- a/spacy/lang/et/__init__.py +++ b/spacy/lang/et/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py index 15070db5f..3b600a158 100644 --- a/spacy/lang/et/stop_words.py +++ b/spacy/lang/et/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-et diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 9d85f814a..aa02855e9 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...language import Language from ...attrs import LANG, NORM from ...util import update_exc, add_lookups diff --git a/spacy/lang/fa/examples.py b/spacy/lang/fa/examples.py index 3f65a366d..d89feb6c8 100644 --- a/spacy/lang/fa/examples.py +++ b/spacy/lang/fa/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py index 5d0ff944d..61586dc3f 100644 --- a/spacy/lang/fa/generate_verbs_exc.py +++ b/spacy/lang/fa/generate_verbs_exc.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - verb_roots = """ #هست diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py index dbea66b68..99b8e2787 100644 --- a/spacy/lang/fa/lex_attrs.py +++ b/spacy/lang/fa/lex_attrs.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from ...attrs import LIKE_NUM diff --git a/spacy/lang/fa/punctuation.py b/spacy/lang/fa/punctuation.py index 33aa46ae2..4b258c13d 100644 --- a/spacy/lang/fa/punctuation.py +++ b/spacy/lang/fa/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER diff --git a/spacy/lang/fa/stop_words.py b/spacy/lang/fa/stop_words.py index 682fb7a71..372422b67 100644 --- a/spacy/lang/fa/stop_words.py +++ b/spacy/lang/fa/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Stop words from HAZM package STOP_WORDS = set( diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index ed665ef29..86695cf6f 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/fa/tag_map.py b/spacy/lang/fa/tag_map.py index b9043adf0..f1f106915 100644 --- a/spacy/lang/fa/tag_map.py +++ b/spacy/lang/fa/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import PRON, NOUN, PART, INTJ, AUX diff --git a/spacy/lang/fa/tokenizer_exceptions.py 
b/spacy/lang/fa/tokenizer_exceptions.py index b3f8dcbf5..db9e3f6fc 100644 --- a/spacy/lang/fa/tokenizer_exceptions.py +++ b/spacy/lang/fa/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, TAG, NORM diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 45d2f886f..db58ad3ba 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/fi/examples.py b/spacy/lang/fi/examples.py index 88be248a6..930fac273 100644 --- a/spacy/lang/fi/examples.py +++ b/spacy/lang/fi/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.fi.examples import sentences diff --git a/spacy/lang/fi/lex_attrs.py b/spacy/lang/fi/lex_attrs.py index e960b55eb..4d500cead 100644 --- a/spacy/lang/fi/lex_attrs.py +++ b/spacy/lang/fi/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py index 02eb1b200..878c8e250 100644 --- a/spacy/lang/fi/punctuation.py +++ b/spacy/lang/fi/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/fi/stop_words.py b/spacy/lang/fi/stop_words.py index e8e39ec6f..642cfc369 100644 --- a/spacy/lang/fi/stop_words.py +++ b/spacy/lang/fi/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source 
https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt # Reformatted with some minor corrections diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index d74deb22b..44360e969 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index f56c8688a..dc45e538c 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .tag_map import TAG_MAP diff --git a/spacy/lang/fr/_tokenizer_exceptions_list.py b/spacy/lang/fr/_tokenizer_exceptions_list.py index c9fcfff2d..7f908dac8 100644 --- a/spacy/lang/fr/_tokenizer_exceptions_list.py +++ b/spacy/lang/fr/_tokenizer_exceptions_list.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - FR_BASE_EXCEPTIONS = [ "(+)-amphétamine", "(5R,6S)-7,8-didehydro-4,5-époxy-3-méthoxy-N-méthylmorphinan-6-ol", diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py index a874c22fc..57d57f4a6 100644 --- a/spacy/lang/fr/examples.py +++ b/spacy/lang/fr/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index 79f4dd28d..84e55d509 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP from ...symbols import SCONJ, CCONJ diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py index e3ccd9fdd..da98c6e37 100644 --- a/spacy/lang/fr/lex_attrs.py +++ b/spacy/lang/fr/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index 1422b4194..5f42e7f25 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_INFIXES from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index ae8432043..9c12e49a3 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 4712d34d9..96636b0b7 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/fr/tag_map.py b/spacy/lang/fr/tag_map.py index 93b43c2ec..2b1b20c52 100644 --- a/spacy/lang/fr/tag_map.py +++ b/spacy/lang/fr/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, 
DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 4b3b2c908..b1c0a53af 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import re from .punctuation import ELISION, HYPHENS @@ -70,7 +67,7 @@ for verb, verb_lemma in [ ]: for orth in [verb, verb.title()]: for pronoun in ["elle", "il", "on"]: - token = "{}-t-{}".format(orth, pronoun) + token = f"{orth}-t-{pronoun}" _exc[token] = [ {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, {LEMMA: "t", ORTH: "-t"}, @@ -79,7 +76,7 @@ for verb, verb_lemma in [ for verb, verb_lemma in [("est", "être")]: for orth in [verb, verb.title()]: - token = "{}-ce".format(orth) + token = f"{orth}-ce" _exc[token] = [ {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, {LEMMA: "ce", ORTH: "-ce"}, diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 42b4d0d18..cea7c0e94 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py index 2133f0d22..c8cd36835 100644 --- a/spacy/lang/ga/irish_morphology_helpers.py +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # fmt: off consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"] diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py index d8f705b59..4ef052ca5 100644 --- a/spacy/lang/ga/stop_words.py +++ b/spacy/lang/ga/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - 
STOP_WORDS = set( """ a ach ag agus an aon ar arna as diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py index 1d8284014..baf64c1b8 100644 --- a/spacy/lang/ga/tag_map.py +++ b/spacy/lang/ga/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # fmt: off TAG_MAP = { "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index c0e53f522..0c587c67e 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index 411cdf107..0d324f64c 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py index 34cd157ae..29075c7d4 100644 --- a/spacy/lang/he/examples.py +++ b/spacy/lang/he/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/he/stop_words.py b/spacy/lang/he/stop_words.py index a01ec4246..2745460a7 100644 --- a/spacy/lang/he/stop_words.py +++ b/spacy/lang/he/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ אני diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py index b0d45ddf3..9a96de95c 100644 --- a/spacy/lang/hi/__init__.py +++ b/spacy/lang/hi/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/hi/examples.py b/spacy/lang/hi/examples.py index 1dd182532..7639ff940 100644 --- a/spacy/lang/hi/examples.py +++ b/spacy/lang/hi/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py index 12666d96a..20a8c2975 100644 --- a/spacy/lang/hi/lex_attrs.py +++ b/spacy/lang/hi/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..norm_exceptions import BASE_NORMS from ...attrs import NORM, LIKE_NUM diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py index efad18c84..142fc6f47 100644 --- a/spacy/lang/hi/stop_words.py +++ b/spacy/lang/hi/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6 diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py index 539b164d7..fbc66ece0 100644 --- a/spacy/lang/hr/__init__.py +++ b/spacy/lang/hr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS diff --git a/spacy/lang/hr/examples.py 
b/spacy/lang/hr/examples.py index dc52ce4f0..b28fb63c2 100644 --- a/spacy/lang/hr/examples.py +++ b/spacy/lang/hr/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py index 408b802c5..dd10f792d 100644 --- a/spacy/lang/hr/stop_words.py +++ b/spacy/lang/hr/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-hr STOP_WORDS = set( """ diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index a331adc5b..df3fe4a44 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS diff --git a/spacy/lang/hu/examples.py b/spacy/lang/hu/examples.py index 3267887fe..b60f752ec 100644 --- a/spacy/lang/hu/examples.py +++ b/spacy/lang/hu/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index bc043486f..1fea6d510 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/hu/stop_words.py b/spacy/lang/hu/stop_words.py index c9a217dd6..024af68f4 100644 --- a/spacy/lang/hu/stop_words.py +++ b/spacy/lang/hu/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index c18a2cec2..cc5eede17 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import re from ..punctuation import ALPHA_LOWER, CURRENCY diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index ea8e355ac..89f874abe 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS diff --git a/spacy/lang/id/_tokenizer_exceptions_list.py b/spacy/lang/id/_tokenizer_exceptions_list.py index fec878d5a..a0b35fa1a 100644 --- a/spacy/lang/id/_tokenizer_exceptions_list.py +++ b/spacy/lang/id/_tokenizer_exceptions_list.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - ID_BASE_EXCEPTIONS = set( """ aba-aba diff --git a/spacy/lang/id/examples.py b/spacy/lang/id/examples.py index 56ac9165e..2ce46ce5a 100644 --- a/spacy/lang/id/examples.py +++ b/spacy/lang/id/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from 
__future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py index 1d4584ae3..3167f4659 100644 --- a/spacy/lang/id/lex_attrs.py +++ b/spacy/lang/id/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import unicodedata from .punctuation import LIST_CURRENCY diff --git a/spacy/lang/id/norm_exceptions.py b/spacy/lang/id/norm_exceptions.py index 09ac6a6d3..63d2081e9 100644 --- a/spacy/lang/id/norm_exceptions.py +++ b/spacy/lang/id/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Daftar kosakata yang sering salah dieja # https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja _exc = { diff --git a/spacy/lang/id/punctuation.py b/spacy/lang/id/punctuation.py index e4794d42b..f6c2387d8 100644 --- a/spacy/lang/id/punctuation.py +++ b/spacy/lang/id/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units diff --git a/spacy/lang/id/stop_words.py b/spacy/lang/id/stop_words.py index 0a9f91947..b1bfaea79 100644 --- a/spacy/lang/id/stop_words.py +++ b/spacy/lang/id/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 4712d34d9..96636b0b7 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py index 16391a840..3bd08e96a 100644 --- 
a/spacy/lang/id/tag_map.py +++ b/spacy/lang/id/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PRON, AUX, SCONJ, INTJ, PART, PROPN diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py index 86fe611bf..5259bddf8 100644 --- a/spacy/lang/id/tokenizer_exceptions.py +++ b/spacy/lang/id/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py index 18e41432d..cdcfd6e71 100644 --- a/spacy/lang/is/__init__.py +++ b/spacy/lang/is/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py index e4ae0498b..5b3ff2f5a 100644 --- a/spacy/lang/is/stop_words.py +++ b/spacy/lang/is/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/Xangis/extra-stopwords diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 90763eda5..4b223582b 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py index af66b7eca..30327bd14 100644 --- a/spacy/lang/it/examples.py +++ b/spacy/lang/it/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py index 4fa931fde..0b8405cc0 100644 --- a/spacy/lang/it/punctuation.py +++ b/spacy/lang/it/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_INFIXES from ..char_classes import ALPHA diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index 84233d381..5cd1af137 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/it/tag_map.py b/spacy/lang/it/tag_map.py index 798c45d80..ce0e1d9ee 100644 --- a/spacy/lang/it/tag_map.py +++ b/spacy/lang/it/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 62f568c5c..f1cfba2c0 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from ...symbols import ORTH, LEMMA _exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]} diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 22590043f..d1ce651d7 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - import re from collections import namedtuple diff --git a/spacy/lang/ja/examples.py b/spacy/lang/ja/examples.py index e00001ed5..1d532ad77 100644 --- a/spacy/lang/ja/examples.py +++ b/spacy/lang/ja/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/ja/stop_words.py b/spacy/lang/ja/stop_words.py index bb232a2d2..98560d7e2 100644 --- a/spacy/lang/ja/stop_words.py +++ b/spacy/lang/ja/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # This list was created by taking the top 2000 words from a Wikipedia dump and # filtering out everything that wasn't hiragana. ー (one) was also added. # Considered keeping some non-hiragana words but too many place names were diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py index 4ff0a35ee..d922cd22b 100644 --- a/spacy/lang/ja/tag_map.py +++ b/spacy/lang/ja/tag_map.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py index c86354248..ef3b10f81 100644 --- a/spacy/lang/kn/__init__.py +++ b/spacy/lang/kn/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py index 652341e73..cfeb0e69d 100644 --- a/spacy/lang/kn/stop_words.py +++ b/spacy/lang/kn/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index ec79a95ab..4ecdfbc58 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from ...attrs import LANG diff --git a/spacy/lang/ko/examples.py b/spacy/lang/ko/examples.py index 7885ad801..cc0a66c0a 100644 --- a/spacy/lang/ko/examples.py +++ b/spacy/lang/ko/examples.py @@ -1,6 +1,3 @@ -# 
coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ko/lex_attrs.py b/spacy/lang/ko/lex_attrs.py index 1904a0ece..ac5bc7e48 100644 --- a/spacy/lang/ko/lex_attrs.py +++ b/spacy/lang/ko/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py index 676dca1b4..3eba9fc82 100644 --- a/spacy/lang/ko/stop_words.py +++ b/spacy/lang/ko/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ 이 diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py index 57317c969..26a8c56b9 100644 --- a/spacy/lang/ko/tag_map.py +++ b/spacy/lang/ko/tag_map.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON from ...symbols import VERB, ADV, PROPN, NUM, DET diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index 4fcfaddb4..afcf77f33 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES diff --git a/spacy/lang/lb/examples.py b/spacy/lang/lb/examples.py index 3cbba31d9..a7a10489c 100644 --- a/spacy/lang/lb/examples.py +++ b/spacy/lang/lb/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py index e38c74974..d2d50d9dc 100644 --- a/spacy/lang/lb/lex_attrs.py +++ b/spacy/lang/lb/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/lb/norm_exceptions.py b/spacy/lang/lb/norm_exceptions.py index 7063e6863..afc384228 100644 --- a/spacy/lang/lb/norm_exceptions.py +++ b/spacy/lang/lb/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # TODO # norm execptions: find a possibility to deal with the zillions of spelling # variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.) diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py index 1571e13d7..4886b316c 100644 --- a/spacy/lang/lb/punctuation.py +++ b/spacy/lang/lb/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER ELISION = " ' ’ ".strip().replace(" ", "") diff --git a/spacy/lang/lb/stop_words.py b/spacy/lang/lb/stop_words.py index 41e6f79d2..8f22ea6e6 100644 --- a/spacy/lang/lb/stop_words.py +++ b/spacy/lang/lb/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ a diff --git a/spacy/lang/lb/tag_map.py b/spacy/lang/lb/tag_map.py index 424a83bb4..cd2e8b93c 100644 --- a/spacy/lang/lb/tag_map.py +++ b/spacy/lang/lb/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PART, SPACE, AUX diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py index b32daa58c..ebf624281 100644 --- a/spacy/lang/lb/tokenizer_exceptions.py +++ b/spacy/lang/lb/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from 
...symbols import ORTH, LEMMA, NORM # TODO diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py index 7c0ed8a04..339290d4a 100644 --- a/spacy/lang/lex_attrs.py +++ b/spacy/lang/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import unicodedata import re diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index 7919a4858..0f096a5b7 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/lt/examples.py b/spacy/lang/lt/examples.py index 99dbe9d4d..b2889114c 100644 --- a/spacy/lang/lt/examples.py +++ b/spacy/lang/lt/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/lt/lex_attrs.py b/spacy/lang/lt/lex_attrs.py index 81879948f..28894a59b 100644 --- a/spacy/lang/lt/lex_attrs.py +++ b/spacy/lang/lt/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = { diff --git a/spacy/lang/lt/morph_rules.py b/spacy/lang/lt/morph_rules.py index 3bf26d9d8..f7bfd3cc6 100644 --- a/spacy/lang/lt/morph_rules.py +++ b/spacy/lang/lt/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA diff --git a/spacy/lang/lt/stop_words.py b/spacy/lang/lt/stop_words.py index fed05d80d..8c11b3f7b 100644 --- a/spacy/lang/lt/stop_words.py +++ b/spacy/lang/lt/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = { "a", "abejais", diff --git a/spacy/lang/lt/tag_map.py b/spacy/lang/lt/tag_map.py index 6ea4f8ae0..f08db535f 100644 --- a/spacy/lang/lt/tag_map.py +++ b/spacy/lang/lt/tag_map.py @@ -1,6 
+1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, ADJ, ADP, ADV, CONJ, INTJ, NOUN, NUM, PART from ...symbols import PRON, PROPN, PUNCT, SYM, VERB, X diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py index fcf807278..e4b53e5b7 100644 --- a/spacy/lang/lt/tokenizer_exceptions.py +++ b/spacy/lang/lt/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH _exc = {} diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py index bb8c0763b..dd8919b73 100644 --- a/spacy/lang/lv/__init__.py +++ b/spacy/lang/lv/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py index 075ad6347..a9612f949 100644 --- a/spacy/lang/lv/stop_words.py +++ b/spacy/lang/lv/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-lv diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py index fd95f9354..eb52a3935 100644 --- a/spacy/lang/mr/__init__.py +++ b/spacy/lang/mr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py index 0b0cd035d..0d7501461 100644 --- a/spacy/lang/mr/stop_words.py +++ b/spacy/lang/mr/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json STOP_WORDS = set( diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 086761f82..3120951a2 
100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py index c15426ded..89e265951 100644 --- a/spacy/lang/nb/examples.py +++ b/spacy/lang/nb/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/nb/morph_rules.py b/spacy/lang/nb/morph_rules.py index e20814535..b1799fca8 100644 --- a/spacy/lang/nb/morph_rules.py +++ b/spacy/lang/nb/morph_rules.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA # This dict includes all the PRON and DET tag combinations found in the diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index b49aa9838..5d5800ae3 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index caa2012e7..fd65dd788 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ alle allerede alt and andre annen annet at av diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 4712d34d9..96636b0b7 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON 
diff --git a/spacy/lang/nb/tag_map.py b/spacy/lang/nb/tag_map.py index ca0ece265..a67586ed9 100644 --- a/spacy/lang/nb/tag_map.py +++ b/spacy/lang/nb/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X from ...symbols import VERB, NOUN, PROPN, PART, INTJ, PRON, AUX diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index 92ac09841..ef6dcf264 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 074fd9133..c12b08d77 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP diff --git a/spacy/lang/nl/examples.py b/spacy/lang/nl/examples.py index a459760f4..fcefa9d62 100644 --- a/spacy/lang/nl/examples.py +++ b/spacy/lang/nl/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index 9a92bee44..e7501ec52 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py index 69343b589..f1acaefeb 100644 --- a/spacy/lang/nl/lex_attrs.py +++ b/spacy/lang/nl/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/nl/punctuation.py b/spacy/lang/nl/punctuation.py index a48ecc044..3f3be61f8 100644 --- a/spacy/lang/nl/punctuation.py +++ b/spacy/lang/nl/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/nl/stop_words.py b/spacy/lang/nl/stop_words.py index 44551f2d4..a2c6198e7 100644 --- a/spacy/lang/nl/stop_words.py +++ b/spacy/lang/nl/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # The original stop words list (added in f46ffe3) was taken from # http://www.damienvanholten.com/downloads/dutch-stop-words.txt # and consisted of about 100 tokens. 
diff --git a/spacy/lang/nl/tag_map.py b/spacy/lang/nl/tag_map.py index 4fde5d39f..5bd7747c6 100644 --- a/spacy/lang/nl/tag_map.py +++ b/spacy/lang/nl/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, SPACE, PRON, CONJ diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py index dbdd104f3..12ab8aef5 100644 --- a/spacy/lang/nl/tokenizer_exceptions.py +++ b/spacy/lang/nl/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH # Extensive list of both common and uncommon dutch abbreviations copied from diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py index 341967a78..c194f05c7 100644 --- a/spacy/lang/norm_exceptions.py +++ b/spacy/lang/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # These exceptions are used to add NORM values based on a token's ORTH value. 
# Individual languages can also add their own exceptions and overwrite them - diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 702a19063..a03ead1ff 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP diff --git a/spacy/lang/pl/_tokenizer_exceptions_list.py b/spacy/lang/pl/_tokenizer_exceptions_list.py index 839eccb83..965318442 100644 --- a/spacy/lang/pl/_tokenizer_exceptions_list.py +++ b/spacy/lang/pl/_tokenizer_exceptions_list.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - # The following list consists of: # - exceptions generated from polish_srx_rules [1] # (https://github.com/milekpl/polish_srx_rules) diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py index 14b6c7030..6eabe1843 100644 --- a/spacy/lang/pl/examples.py +++ b/spacy/lang/pl/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/pl/lex_attrs.py b/spacy/lang/pl/lex_attrs.py index f1379aa50..ce56e28a8 100644 --- a/spacy/lang/pl/lex_attrs.py +++ b/spacy/lang/pl/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py index 4e69a3912..eea28de11 100644 --- a/spacy/lang/pl/punctuation.py +++ b/spacy/lang/pl/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/pl/stop_words.py b/spacy/lang/pl/stop_words.py index 11df67328..075aec391 100644 --- a/spacy/lang/pl/stop_words.py +++ b/spacy/lang/pl/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 - -from __future__ import unicode_literals - # sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl STOP_WORDS = set( diff --git a/spacy/lang/pl/tag_map.py b/spacy/lang/pl/tag_map.py index 5356c26cb..b83ee4d4c 100644 --- a/spacy/lang/pl/tag_map.py +++ b/spacy/lang/pl/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ( POS, ADJ, diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index 9e4814b0f..39f3017ed 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index f786d6542..0557e8b31 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions 
import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py index b7206ffd7..7427f8b25 100644 --- a/spacy/lang/pt/examples.py +++ b/spacy/lang/pt/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/pt/lex_attrs.py b/spacy/lang/pt/lex_attrs.py index 4ad0eeecb..3c6979ab4 100644 --- a/spacy/lang/pt/lex_attrs.py +++ b/spacy/lang/pt/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/pt/norm_exceptions.py b/spacy/lang/pt/norm_exceptions.py index ea650cb31..e115b0385 100644 --- a/spacy/lang/pt/norm_exceptions.py +++ b/spacy/lang/pt/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # These exceptions are used to add NORM values based on a token's ORTH value. # Individual languages can also add their own exceptions and overwrite them - # for example, British vs. American spelling in English. 
diff --git a/spacy/lang/pt/punctuation.py b/spacy/lang/pt/punctuation.py index 370e6aaad..08e31f9d0 100644 --- a/spacy/lang/pt/punctuation.py +++ b/spacy/lang/pt/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py index 774b06809..8065fcda7 100644 --- a/spacy/lang/pt/stop_words.py +++ b/spacy/lang/pt/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/pt/tag_map.py b/spacy/lang/pt/tag_map.py index cdc7de57e..dc65998a4 100644 --- a/spacy/lang/pt/tag_map.py +++ b/spacy/lang/pt/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, CCONJ from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py index 5169780e6..2089ea8fa 100644 --- a/spacy/lang/pt/tokenizer_exceptions.py +++ b/spacy/lang/pt/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, NORM diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index ccb72de28..bf7357e48 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index 
6c325b74d..e32ae19cb 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS diff --git a/spacy/lang/ro/examples.py b/spacy/lang/ro/examples.py index a372d7cb2..d472f0d6d 100644 --- a/spacy/lang/ro/examples.py +++ b/spacy/lang/ro/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py index bb8391ad1..0f86f53cd 100644 --- a/spacy/lang/ro/lex_attrs.py +++ b/spacy/lang/ro/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py index b5ba73458..1d90be85d 100644 --- a/spacy/lang/ro/stop_words.py +++ b/spacy/lang/ro/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-ro STOP_WORDS = set( """ diff --git a/spacy/lang/ro/tag_map.py b/spacy/lang/ro/tag_map.py index cb5239809..d6820b4f2 100644 --- a/spacy/lang/ro/tag_map.py +++ b/spacy/lang/ro/tag_map.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from ...symbols import POS, ADJ, ADP, ADV, INTJ, NOUN, NUM, PART from ...symbols import PRON, PROPN, PUNCT, SYM, VERB, X, CCONJ, SCONJ, DET, AUX diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py index a7fb38453..8408ef987 100644 --- a/spacy/lang/ro/tokenizer_exceptions.py +++ b/spacy/lang/ro/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index f34fc5435..d25e8048b 100644 --- a/spacy/lang/ru/__init__.py +++ 
b/spacy/lang/ru/__init__.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS diff --git a/spacy/lang/ru/examples.py b/spacy/lang/ru/examples.py index 2db621dac..34cf5a1eb 100644 --- a/spacy/lang/ru/examples.py +++ b/spacy/lang/ru/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 96d32f59c..ed0e858f5 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,9 +1,5 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS from ...lemmatizer import Lemmatizer -from ...compat import unicode_ class RussianLemmatizer(Lemmatizer): @@ -85,7 +81,7 @@ class RussianLemmatizer(Lemmatizer): @staticmethod def normalize_univ_pos(univ_pos): - if isinstance(univ_pos, unicode_): + if isinstance(univ_pos, str): return univ_pos.upper() symbols_to_str = { diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py index 448c5b285..7979c7ea6 100644 --- a/spacy/lang/ru/lex_attrs.py +++ b/spacy/lang/ru/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py index 43e08948c..c5d725031 100644 --- a/spacy/lang/ru/norm_exceptions.py +++ b/spacy/lang/ru/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Slang diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py index 89069b3cf..16cb55ef9 100644 --- a/spacy/lang/ru/stop_words.py +++ b/spacy/lang/ru/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import 
unicode_literals - - STOP_WORDS = set( """ а diff --git a/spacy/lang/ru/tag_map.py b/spacy/lang/ru/tag_map.py index baf065588..294919811 100644 --- a/spacy/lang/ru/tag_map.py +++ b/spacy/lang/ru/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py index ea7b5b20d..df3169baf 100644 --- a/spacy/lang/ru/tokenizer_exceptions.py +++ b/spacy/lang/ru/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py index a58a63f03..3b065860c 100644 --- a/spacy/lang/si/__init__.py +++ b/spacy/lang/si/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/si/examples.py b/spacy/lang/si/examples.py index 842dfdd7e..0ff00e76e 100644 --- a/spacy/lang/si/examples.py +++ b/spacy/lang/si/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/si/lex_attrs.py b/spacy/lang/si/lex_attrs.py index 5d5f06187..aa061852d 100644 --- a/spacy/lang/si/lex_attrs.py +++ b/spacy/lang/si/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py index 8bbdec6b7..49723c860 100644 --- a/spacy/lang/si/stop_words.py +++ b/spacy/lang/si/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py index e7704196a..77a07e504 100644 --- a/spacy/lang/sk/__init__.py +++ b/spacy/lang/sk/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py index f6994d33f..bd39b22f2 100644 --- a/spacy/lang/sk/stop_words.py +++ b/spacy/lang/sk/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-sk diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py index 2d4977bdf..ce46e92dc 100644 --- a/spacy/lang/sl/__init__.py +++ b/spacy/lang/sl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index 187e95876..c8596ad0b 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-sl # TODO: probably needs to be tidied up – the list seems to have month names in diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py index 
6f33b37c2..034604838 100644 --- a/spacy/lang/sq/__init__.py +++ b/spacy/lang/sq/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/sq/examples.py b/spacy/lang/sq/examples.py index c51a0da39..e1075f70a 100644 --- a/spacy/lang/sq/examples.py +++ b/spacy/lang/sq/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py index f91861ca1..58ee87d05 100644 --- a/spacy/lang/sq/stop_words.py +++ b/spacy/lang/sq/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/andrixh/index-albanian diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index f27b87102..151cc231c 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS diff --git a/spacy/lang/sr/examples.py b/spacy/lang/sr/examples.py index d636220c3..1ac867f4c 100644 --- a/spacy/lang/sr/examples.py +++ b/spacy/lang/sr/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py index c90dc0da7..dc48909bc 100644 --- a/spacy/lang/sr/lex_attrs.py +++ b/spacy/lang/sr/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py index 69f2c3173..add8350a0 100644 --- a/spacy/lang/sr/norm_exceptions.py +++ b/spacy/lang/sr/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Slang diff --git a/spacy/lang/sr/stop_words.py b/spacy/lang/sr/stop_words.py index 9712327f8..488c82a75 100644 --- a/spacy/lang/sr/stop_words.py +++ b/spacy/lang/sr/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py index 8fca346a3..82df15186 100755 --- a/spacy/lang/sr/tokenizer_exceptions.py +++ b/spacy/lang/sr/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 671eefca0..d400eae4d 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py index 58e095195..98eee700b 100644 --- a/spacy/lang/sv/examples.py +++ b/spacy/lang/sv/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/sv/morph_rules.py b/spacy/lang/sv/morph_rules.py index 77744813f..8fca20a49 100644 --- a/spacy/lang/sv/morph_rules.py +++ b/spacy/lang/sv/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA diff --git a/spacy/lang/sv/stop_words.py b/spacy/lang/sv/stop_words.py index 206abce5a..4d933a76d 100644 --- a/spacy/lang/sv/stop_words.py +++ b/spacy/lang/sv/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 7a82e6b59..021d5d2f5 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/sv/tag_map.py b/spacy/lang/sv/tag_map.py index 7d4e29030..d4f5b6291 100644 --- a/spacy/lang/sv/tag_map.py +++ b/spacy/lang/sv/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index dd0976aa6..834a088ad 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG _exc = {} diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py index cb23339e6..d7a04afea 100644 --- a/spacy/lang/ta/__init__.py +++ b/spacy/lang/ta/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 
3ce3c3544..2590163cb 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ta/lex_attrs.py b/spacy/lang/ta/lex_attrs.py index 40158ad7a..f830f4ac9 100644 --- a/spacy/lang/ta/lex_attrs.py +++ b/spacy/lang/ta/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ta/norm_exceptions.py b/spacy/lang/ta/norm_exceptions.py index fbdceb98c..8eaf0aa74 100644 --- a/spacy/lang/ta/norm_exceptions.py +++ b/spacy/lang/ta/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Regional words normal # Sri Lanka - wikipeadia diff --git a/spacy/lang/ta/stop_words.py b/spacy/lang/ta/stop_words.py index 91ebe8fd8..83410d65e 100644 --- a/spacy/lang/ta/stop_words.py +++ b/spacy/lang/ta/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Stop words diff --git a/spacy/lang/tag_map.py b/spacy/lang/tag_map.py index 3a744f180..5bff905bd 100644 --- a/spacy/lang/tag_map.py +++ b/spacy/lang/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py index a4709177d..424164cc7 100644 --- a/spacy/lang/te/__init__.py +++ b/spacy/lang/te/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/te/examples.py b/spacy/lang/te/examples.py index 815ec8227..6162b231e 100644 --- a/spacy/lang/te/examples.py +++ b/spacy/lang/te/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import 
unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/te/lex_attrs.py b/spacy/lang/te/lex_attrs.py index 6da766dca..ae11827f6 100644 --- a/spacy/lang/te/lex_attrs.py +++ b/spacy/lang/te/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/te/stop_words.py b/spacy/lang/te/stop_words.py index 11e157177..b18dab697 100644 --- a/spacy/lang/te/stop_words.py +++ b/spacy/lang/te/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/Xangis/extra-stopwords (MIT License) STOP_WORDS = set( diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 06970fbd7..950a77818 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py index 047d046c2..bc4e5293e 100644 --- a/spacy/lang/th/lex_attrs.py +++ b/spacy/lang/th/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py index ed1b3e760..98b878308 100644 --- a/spacy/lang/th/norm_exceptions.py +++ b/spacy/lang/th/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์) diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py index 119a2f6a0..7fb12d538 100644 --- a/spacy/lang/th/tag_map.py +++ b/spacy/lang/th/tag_map.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, NOUN, PRON, ADJ, ADV, INTJ, 
PROPN, DET, NUM, AUX, VERB from ...symbols import ADP, CCONJ, PART, PUNCT, SPACE, SCONJ diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py index 4de0f1195..0529b3a99 100644 --- a/spacy/lang/th/tokenizer_exceptions.py +++ b/spacy/lang/th/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py index 30ad93139..f477029f7 100644 --- a/spacy/lang/tl/__init__.py +++ b/spacy/lang/tl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/tl/lex_attrs.py b/spacy/lang/tl/lex_attrs.py index 61dc9d4f3..60bdc923b 100644 --- a/spacy/lang/tl/lex_attrs.py +++ b/spacy/lang/tl/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/tl/stop_words.py b/spacy/lang/tl/stop_words.py index 510b3a418..2560cdaed 100644 --- a/spacy/lang/tl/stop_words.py +++ b/spacy/lang/tl/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ akin diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py index 77e1fb0c6..ea14746c4 100644 --- a/spacy/lang/tl/tokenizer_exceptions.py +++ b/spacy/lang/tl/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 3ea2bc3e9..13a1033a6 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import re from ..symbols import ORTH, POS, TAG, LEMMA, SPACE diff --git 
a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py index 2553e7c0f..a29d78261 100644 --- a/spacy/lang/tr/__init__.py +++ b/spacy/lang/tr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS diff --git a/spacy/lang/tr/examples.py b/spacy/lang/tr/examples.py index a0464dfe3..a14d87a46 100644 --- a/spacy/lang/tr/examples.py +++ b/spacy/lang/tr/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py index 93f26fc8e..3dbc1833a 100644 --- a/spacy/lang/tr/lex_attrs.py +++ b/spacy/lang/tr/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py index 65905499a..85dcff6a5 100644 --- a/spacy/lang/tr/stop_words.py +++ b/spacy/lang/tr/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-tr STOP_WORDS = set( """ diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py index f48e035d4..97f524a87 100644 --- a/spacy/lang/tr/tokenizer_exceptions.py +++ b/spacy/lang/tr/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, NORM _exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]} diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py index 3655e6264..80574a70d 100644 --- a/spacy/lang/tt/__init__.py +++ b/spacy/lang/tt/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS diff --git a/spacy/lang/tt/examples.py 
b/spacy/lang/tt/examples.py index ac668a0c2..723fcdd15 100644 --- a/spacy/lang/tt/examples.py +++ b/spacy/lang/tt/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.tt.examples import sentences diff --git a/spacy/lang/tt/lex_attrs.py b/spacy/lang/tt/lex_attrs.py index ad3d6b9eb..a2ae03061 100644 --- a/spacy/lang/tt/lex_attrs.py +++ b/spacy/lang/tt/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/tt/punctuation.py b/spacy/lang/tt/punctuation.py index 9ee66a59e..f644a8ccb 100644 --- a/spacy/lang/tt/punctuation.py +++ b/spacy/lang/tt/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS from ..char_classes import LIST_ELLIPSES, LIST_ICONS diff --git a/spacy/lang/tt/stop_words.py b/spacy/lang/tt/stop_words.py index 9f6e9bb86..44169b757 100644 --- a/spacy/lang/tt/stop_words.py +++ b/spacy/lang/tt/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Tatar stopwords are from https://github.com/aliiae/stopwords-tt STOP_WORDS = set( diff --git a/spacy/lang/tt/tokenizer_exceptions.py b/spacy/lang/tt/tokenizer_exceptions.py index 89f7a990b..efe9e1fc0 100644 --- a/spacy/lang/tt/tokenizer_exceptions.py +++ b/spacy/lang/tt/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM _exc = {} diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index e74ff2d86..51165112a 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from 
.lex_attrs import LEX_ATTRS diff --git a/spacy/lang/uk/examples.py b/spacy/lang/uk/examples.py index 4f2b034eb..d17768ea6 100644 --- a/spacy/lang/uk/examples.py +++ b/spacy/lang/uk/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 3eeed5dd4..ff61d711f 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -1,4 +1,3 @@ -# coding: utf8 from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS from ...lemmatizer import Lemmatizer diff --git a/spacy/lang/uk/lex_attrs.py b/spacy/lang/uk/lex_attrs.py index 0ade751d6..510e5b85d 100644 --- a/spacy/lang/uk/lex_attrs.py +++ b/spacy/lang/uk/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/uk/stop_words.py b/spacy/lang/uk/stop_words.py index cdf24dd70..b11d7a044 100644 --- a/spacy/lang/uk/stop_words.py +++ b/spacy/lang/uk/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """а або diff --git a/spacy/lang/uk/tag_map.py b/spacy/lang/uk/tag_map.py index 472e772ef..1ecbddc49 100644 --- a/spacy/lang/uk/tag_map.py +++ b/spacy/lang/uk/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py index a94d77af3..36f0b2e72 100644 --- a/spacy/lang/uk/tokenizer_exceptions.py +++ b/spacy/lang/uk/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, POS, NORM, NOUN diff --git a/spacy/lang/ur/__init__.py 
b/spacy/lang/ur/__init__.py index 6eea0cf3b..c7f65adc3 100644 --- a/spacy/lang/ur/__init__.py +++ b/spacy/lang/ur/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/ur/examples.py b/spacy/lang/ur/examples.py index f47c11600..7024483b5 100644 --- a/spacy/lang/ur/examples.py +++ b/spacy/lang/ur/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ur/lex_attrs.py b/spacy/lang/ur/lex_attrs.py index 12d85be4b..e590ed3e3 100644 --- a/spacy/lang/ur/lex_attrs.py +++ b/spacy/lang/ur/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM # Source https://quizlet.com/4271889/1-100-urdu-number-wordsurdu-numerals-flash-cards/ diff --git a/spacy/lang/ur/punctuation.py b/spacy/lang/ur/punctuation.py index b8b1a1c83..5d35d0a25 100644 --- a/spacy/lang/ur/punctuation.py +++ b/spacy/lang/ur/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/ur/stop_words.py b/spacy/lang/ur/stop_words.py index 73c159d5c..abfa36497 100644 --- a/spacy/lang/ur/stop_words.py +++ b/spacy/lang/ur/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Source: collected from different resource on internet STOP_WORDS = set( """ diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py index 2499d7e3e..e0940edb7 100644 --- a/spacy/lang/ur/tag_map.py +++ b/spacy/lang/ur/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON diff --git 
a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 425f84e3d..7496763ee 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LANG, NORM from ..norm_exceptions import BASE_NORMS from ...language import Language diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py index b6cd1188a..b3dbf2192 100644 --- a/spacy/lang/vi/lex_attrs.py +++ b/spacy/lang/vi/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/vi/stop_words.py b/spacy/lang/vi/stop_words.py index 13284dc59..1d2ecdf8d 100644 --- a/spacy/lang/vi/stop_words.py +++ b/spacy/lang/vi/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords/vietnamese-stopwords STOP_WORDS = set( """ diff --git a/spacy/lang/vi/tag_map.py b/spacy/lang/vi/tag_map.py index 472e772ef..1ecbddc49 100644 --- a/spacy/lang/vi/tag_map.py +++ b/spacy/lang/vi/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index 66d8c7917..2af650703 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py index 38cd5e0cd..15f5c4ff8 100644 --- a/spacy/lang/xx/examples.py +++ b/spacy/lang/xx/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py index f227203cc..08e3166e1 100644 --- a/spacy/lang/yo/__init__.py +++ b/spacy/lang/yo/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS diff --git a/spacy/lang/yo/examples.py b/spacy/lang/yo/examples.py index 170ddc803..9b875d09e 100644 --- a/spacy/lang/yo/examples.py +++ b/spacy/lang/yo/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/yo/lex_attrs.py b/spacy/lang/yo/lex_attrs.py index a9f1b85f6..ead68ced2 100644 --- a/spacy/lang/yo/lex_attrs.py +++ b/spacy/lang/yo/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import unicodedata from ...attrs import LIKE_NUM diff --git a/spacy/lang/yo/stop_words.py b/spacy/lang/yo/stop_words.py index 53d382ad3..5c7a7fc45 100644 --- a/spacy/lang/yo/stop_words.py +++ b/spacy/lang/yo/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # stop words as whitespace-separated list. # Source: https://raw.githubusercontent.com/dohliam/more-stoplists/master/yo/yo.txt diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 8179b4551..e427dc6d2 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LANG from ...language import Language from ...tokens import Doc diff --git a/spacy/lang/zh/examples.py b/spacy/lang/zh/examples.py index b28215741..d0715eb0d 100644 --- a/spacy/lang/zh/examples.py +++ b/spacy/lang/zh/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/zh/lex_attrs.py b/spacy/lang/zh/lex_attrs.py index 0b29c226e..08c8e3160 100644 --- a/spacy/lang/zh/lex_attrs.py +++ b/spacy/lang/zh/lex_attrs.py @@ -1,8 +1,8 @@ -# coding: utf8 -from __future__ import unicode_literals import re + from ...attrs import LIKE_NUM + _single_num_words = [ "〇", "一", diff --git a/spacy/lang/zh/stop_words.py b/spacy/lang/zh/stop_words.py index 0af4c1859..42ae4a1de 100644 --- a/spacy/lang/zh/stop_words.py +++ b/spacy/lang/zh/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # stop words as whitespace-separated list # Chinese stop words,maybe not enough STOP_WORDS = set( diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py index 41e2d2158..1ff0827be 100644 --- a/spacy/lang/zh/tag_map.py +++ b/spacy/lang/zh/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE diff --git a/spacy/language.py b/spacy/language.py index 008b5559f..4a553bcaf 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,14 +1,7 @@ -# coding: utf8 -from __future__ import absolute_import, unicode_literals - import random import itertools - -from spacy.gold import Example -from spacy.util import minibatch import weakref import functools -from collections import OrderedDict from contextlib import contextmanager from copy import copy, deepcopy from thinc.neural import Model @@ -21,8 +14,7 @@ from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs -from .compat import izip, basestring_, is_python2, class_types -from .gold import GoldParse +from .gold import Example from .scorer import Scorer from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG @@ -32,7 +24,7 @@ from 
.lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop -from .errors import Errors, Warnings, deprecation_warning, user_warning +from .errors import Errors, Warnings, deprecation_warning from . import util from . import about @@ -190,7 +182,7 @@ class Language(object): self._meta.setdefault("lang", self.lang) self._meta.setdefault("name", "model") self._meta.setdefault("version", "0.0.0") - self._meta.setdefault("spacy_version", ">={}".format(about.__version__)) + self._meta.setdefault("spacy_version", f">={about.__version__}") self._meta.setdefault("description", "") self._meta.setdefault("author", "") self._meta.setdefault("email", "") @@ -263,7 +255,7 @@ class Language(object): RETURNS (dict): Labels keyed by component name. """ - labels = OrderedDict() + labels = {} for name, pipe in self.pipeline: if hasattr(pipe, "labels"): labels[name] = list(pipe.labels) @@ -320,7 +312,7 @@ class Language(object): """ if not hasattr(component, "__call__"): msg = Errors.E003.format(component=repr(component), name=name) - if isinstance(component, basestring_) and component in self.factories: + if isinstance(component, str) and component in self.factories: msg += Errors.E004.format(component=component) raise ValueError(msg) if name is None: @@ -372,7 +364,7 @@ class Language(object): raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) if not hasattr(component, "__call__"): msg = Errors.E003.format(component=repr(component), name=name) - if isinstance(component, basestring_) and component in self.factories: + if isinstance(component, str) and component in self.factories: msg += Errors.E135.format(name=name) raise ValueError(msg) self.pipeline[self.pipe_names.index(name)] = (name, component) @@ -476,6 +468,7 @@ class Language(object): sgd = self._optimizer grads = {} + def get_grads(W, dW, key=None): grads[key] = (W, dW) @@ -725,9 +718,6 @@ class Language(object): 
""" # raw_texts will be used later to stop iterator. texts, raw_texts = itertools.tee(texts) - if is_python2 and n_process != 1: - user_warning(Warnings.W023) - n_process = 1 if n_threads != -1: deprecation_warning(Warnings.W016) if n_process == -1: @@ -744,7 +734,7 @@ class Language(object): component_cfg=component_cfg, as_example=False ) - for doc, context in izip(docs, contexts): + for doc, context in zip(docs, contexts): yield (doc, context) return if component_cfg is None: @@ -814,7 +804,7 @@ class Language(object): *[mp.Pipe(False) for _ in range(n_process)] ) - batch_texts = minibatch(texts, batch_size) + batch_texts = util.minibatch(texts, batch_size) # Sender sends texts to the workers. # This is necessary to properly handle infinite length of texts. # (In this case, all data cannot be sent to the workers at once) @@ -858,7 +848,7 @@ class Language(object): deprecation_warning(Warnings.W014) exclude = disable path = util.ensure_path(path) - serializers = OrderedDict() + serializers = {} serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( p, exclude=["vocab"] ) @@ -891,7 +881,7 @@ class Language(object): deprecation_warning(Warnings.W014) exclude = disable path = util.ensure_path(path) - deserializers = OrderedDict() + deserializers = {} deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p)) deserializers["vocab"] = lambda p: self.vocab.from_disk( p @@ -925,7 +915,7 @@ class Language(object): if disable is not None: deprecation_warning(Warnings.W014) exclude = disable - serializers = OrderedDict() + serializers = {} serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) @@ -950,7 +940,7 @@ class Language(object): if disable is not None: deprecation_warning(Warnings.W014) exclude = disable - deserializers = OrderedDict() + deserializers = {} deserializers["meta.json"] = lambda b: 
self.meta.update(srsly.json_loads(b)) deserializers["vocab"] = lambda b: self.vocab.from_bytes( b @@ -1009,7 +999,7 @@ class component(object): def factory(nlp, **cfg): if hasattr(obj, "from_nlp"): return obj.from_nlp(nlp, **cfg) - elif isinstance(obj, class_types): + elif isinstance(obj, type): return obj() return obj diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d70e4cfc4..3ba86c169 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,8 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - -from collections import OrderedDict - from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .errors import Errors from .lookups import Lookups @@ -160,7 +155,7 @@ class Lemmatizer(object): else: oov_forms.append(form) # Remove duplicates but preserve the ordering of applied "rules" - forms = list(OrderedDict.fromkeys(forms)) + forms = list(dict.fromkeys(forms)) # Put exceptions at the front of the list, so they get priority. # This is a dodgy heuristic -- but it's the best we can do until we get # frequencies on this. We can at least prune out problematic exceptions, diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 5c981bc25..497e20516 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,7 +1,4 @@ # cython: embedsignature=True -# coding: utf8 -from __future__ import unicode_literals, print_function - # Compiler crashes on memory view coercion without this. Should report bug. 
from cython.view cimport array as cvarray from libc.string cimport memset diff --git a/spacy/lookups.py b/spacy/lookups.py index bf250b4b4..a9d371b79 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -1,9 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - import srsly -from collections import OrderedDict from preshed.bloom import BloomFilter +from collections import OrderedDict from .errors import Errors from .util import SimpleFrozenDict, ensure_path @@ -28,7 +25,7 @@ class Lookups(object): DOCS: https://spacy.io/api/lookups#init """ - self._tables = OrderedDict() + self._tables = {} def __contains__(self, name): """Check if the lookups contain a table of a given name. Delegates to @@ -118,7 +115,7 @@ class Lookups(object): DOCS: https://spacy.io/api/lookups#from_bytes """ - self._tables = OrderedDict() + self._tables = {} for key, value in srsly.msgpack_loads(bytes_data).items(): self._tables[key] = Table(key) self._tables[key].update(value) @@ -254,12 +251,12 @@ class Table(OrderedDict): DOCS: https://spacy.io/api/lookups#table.to_bytes """ - data = [ - ("name", self.name), - ("dict", dict(self.items())), - ("bloom", self.bloom.to_bytes()), - ] - return srsly.msgpack_dumps(OrderedDict(data)) + data = { + "name": self.name, + "dict": dict(self.items()), + "bloom": self.bloom.to_bytes(), + } + return srsly.msgpack_dumps(data) def from_bytes(self, bytes_data): """Load a table from a bytestring. 
diff --git a/spacy/matcher/__init__.py b/spacy/matcher/__init__.py index 91874ed43..286844787 100644 --- a/spacy/matcher/__init__.py +++ b/spacy/matcher/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .matcher import Matcher from .phrasematcher import PhraseMatcher from .dependencymatcher import DependencyMatcher diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index 1b10f0dd5..ce6379c45 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - TOKEN_PATTERN_SCHEMA = { "$schema": "http://json-schema.org/draft-06/schema", diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 56d27024d..46cff0d0c 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -1,7 +1,5 @@ # cython: infer_types=True # cython: profile=True -from __future__ import unicode_literals - from cymem.cymem cimport Pool from preshed.maps cimport PreshMap diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 30ef3dd36..2908ab0c2 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1,7 +1,5 @@ # cython: infer_types=True # cython: profile=True -from __future__ import unicode_literals - from libcpp.vector cimport vector from libc.stdint cimport int32_t from cymem.cymem cimport Pool diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 4de5782f9..20f45b9e4 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -1,7 +1,5 @@ # cython: infer_types=True # cython: profile=True -from __future__ import unicode_literals - from libc.stdint cimport uintptr_t from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index 57e7ef571..8eebf0564 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1,5 +1,2 @@ -# 
coding: utf8 -from __future__ import unicode_literals - from .tok2vec import Tok2Vec # noqa: F401 from .common import FeedForward, LayerNormalizedMaxout # noqa: F401 diff --git a/spacy/ml/_legacy_tok2vec.py b/spacy/ml/_legacy_tok2vec.py index b077a46b7..e7baae380 100644 --- a/spacy/ml/_legacy_tok2vec.py +++ b/spacy/ml/_legacy_tok2vec.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from thinc.v2v import Model, Maxout from thinc.i2v import HashEmbed, StaticVectors from thinc.t2t import ExtractWindow diff --git a/spacy/ml/_wire.py b/spacy/ml/_wire.py index fa271b37c..2b1144fcb 100644 --- a/spacy/ml/_wire.py +++ b/spacy/ml/_wire.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals from thinc.api import layerize, wrap, noop, chain, concatenate from thinc.v2v import Model diff --git a/spacy/ml/common.py b/spacy/ml/common.py index f90b53a15..4ecb00e4e 100644 --- a/spacy/ml/common.py +++ b/spacy/ml/common.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from thinc.api import chain from thinc.v2v import Maxout from thinc.misc import LayerNorm diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py index 8f86475ef..9a0ed6bf5 100644 --- a/spacy/ml/tok2vec.py +++ b/spacy/ml/tok2vec.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued from thinc.api import noop, with_square_sequences from thinc.v2v import Maxout, Model diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c146094a9..f12691170 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,12 +1,8 @@ # cython: infer_types -# coding: utf8 -from __future__ import unicode_literals - from libc.string cimport memset import srsly from collections import Counter -from .compat import basestring_ from .strings import get_string_id from . import symbols from .attrs cimport POS, IS_SPACE @@ -190,7 +186,7 @@ cdef class Morphology: present. Returns the hash of the new analysis. 
""" for f in features: - if isinstance(f, basestring_): + if isinstance(f, str): self.strings.add(f) string_features = features features = intify_features(features) diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 3925a6738..e71fb917f 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - IDS = { "": NO_TAG, diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index de8403152..2f9824eda 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer from .pipes import SentenceRecognizer diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 2db312d64..75120dfe6 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,12 +1,8 @@ -# coding: utf8 -from __future__ import unicode_literals - -from collections import defaultdict, OrderedDict +from collections import defaultdict import srsly from ..language import component from ..errors import Errors -from ..compat import basestring_ from ..util import ensure_path, to_disk, from_disk from ..tokens import Span from ..matcher import Matcher, PhraseMatcher @@ -201,7 +197,7 @@ class EntityRuler(object): self._ent_ids[key] = (ent_label, entry["id"]) pattern = entry["pattern"] - if isinstance(pattern, basestring_): + if isinstance(pattern, str): self.phrase_patterns[label].append(self.nlp(pattern)) elif isinstance(pattern, list): self.token_patterns[label].append(pattern) @@ -230,8 +226,8 @@ class EntityRuler(object): RETURNS (str): The ent_label joined with configured `ent_id_sep` """ - if isinstance(ent_id, basestring_): - label = "{}{}{}".format(label, self.ent_id_sep, ent_id) + if isinstance(ent_id, str): + label = 
f"{label}{self.ent_id_sep}{ent_id}" return label def from_bytes(self, patterns_bytes, **kwargs): @@ -264,15 +260,12 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#to_bytes """ - - serial = OrderedDict( - ( - ("overwrite", self.overwrite), - ("ent_id_sep", self.ent_id_sep), - ("phrase_matcher_attr", self.phrase_matcher_attr), - ("patterns", self.patterns), - ) - ) + serial = { + "overwrite": self.overwrite, + "ent_id_sep": self.ent_id_sep, + "phrase_matcher_attr": self.phrase_matcher_attr, + "patterns": self.patterns, + } return srsly.msgpack_dumps(serial) def from_disk(self, path, **kwargs): diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 69e638da2..6e9d4197c 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..language import component from ..matcher import Matcher from ..util import filter_spans diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index b61a34c0e..68385c5a9 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from thinc.t2v import Pooling, max_pool, mean_pool from thinc.neural._classes.difference import Siamese, CauchySimilarity diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index adcff9280..10038d410 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,5 +1,4 @@ -from __future__ import unicode_literals -from collections import OrderedDict, defaultdict +from collections import defaultdict import numpy cimport numpy as np @@ -13,7 +12,6 @@ from .._ml import Tok2Vec, build_morphologizer_model from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import create_default_optimizer from ..errors import Errors, TempErrors -from ..compat import basestring_ from ..tokens.doc cimport Doc from ..vocab cimport Vocab from 
..morphology cimport Morphology @@ -32,7 +30,7 @@ class Morphologizer(Pipe): def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model - self.cfg = OrderedDict(sorted(cfg.items())) + self.cfg = dict(sorted(cfg.items())) self.cfg.setdefault('cnn_maxout_pieces', 2) self._class_map = self.vocab.morphology.create_class_map() diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index b041e2441..ff88340cd 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1,12 +1,8 @@ # cython: infer_types=True # cython: profile=True -# coding: utf8 -from __future__ import unicode_literals - import numpy import srsly import random -from collections import OrderedDict from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm @@ -24,7 +20,6 @@ from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj from ..gold import Example -from ..compat import basestring_ from ..attrs import POS, ID from ..parts_of_speech import X from ..kb import KnowledgeBase @@ -183,7 +178,7 @@ class Pipe(object): exclude (list): String names of serialization fields to exclude. RETURNS (bytes): The serialized object. 
""" - serialize = OrderedDict() + serialize = {} serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) if self.model not in (True, False, None): serialize["model"] = self.model.to_bytes @@ -206,7 +201,7 @@ class Pipe(object): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict() + deserialize = {} deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) if hasattr(self, "vocab"): deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) @@ -217,7 +212,7 @@ class Pipe(object): def to_disk(self, path, exclude=tuple(), **kwargs): """Serialize the pipe to disk.""" - serialize = OrderedDict() + serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): @@ -239,7 +234,7 @@ class Pipe(object): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict() + deserialize = {} deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["model"] = load_model @@ -409,7 +404,7 @@ class Tagger(Pipe): self.vocab = vocab self.model = model self._rehearsal_model = None - self.cfg = OrderedDict(sorted(cfg.items())) + self.cfg = dict(sorted(cfg.items())) self.cfg.setdefault("cnn_maxout_pieces", 2) @property @@ -564,7 +559,7 @@ class Tagger(Pipe): if not any(table in self.vocab.lookups for table in lemma_tables): user_warning(Warnings.W022) orig_tag_map = dict(self.vocab.morphology.tag_map) - new_tag_map = OrderedDict() + new_tag_map = {} for example in get_examples(): for tag in example.token_annotation.tags: if tag in orig_tag_map: @@ -594,7 +589,7 @@ class Tagger(Pipe): return build_tagger_model(n_tags, **cfg) def add_label(self, label, values=None): - if not isinstance(label, basestring_): + if not isinstance(label, str): raise ValueError(Errors.E187) if label in self.labels: return 0 @@ -624,12 +619,12 @@ class Tagger(Pipe): yield def 
to_bytes(self, exclude=tuple(), **kwargs): - serialize = OrderedDict() + serialize = {} if self.model not in (None, True, False): serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) + tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) return util.to_bytes(serialize, exclude) @@ -656,24 +651,24 @@ class Tagger(Pipe): lemmatizer=self.vocab.morphology.lemmatizer, exc=self.vocab.morphology.exc) - deserialize = OrderedDict(( - ("vocab", lambda b: self.vocab.from_bytes(b)), - ("tag_map", load_tag_map), - ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), - ("model", lambda b: load_model(b)), - )) + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "tag_map": load_tag_map, + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: load_model(b), + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self def to_disk(self, path, exclude=tuple(), **kwargs): - tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) - serialize = OrderedDict(( - ("vocab", lambda p: self.vocab.to_disk(p)), - ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)), - ("model", lambda p: p.open("wb").write(self.model.to_bytes())), - ("cfg", lambda p: srsly.write_json(p, self.cfg)) - )) + tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) + serialize = { + "vocab": lambda p: self.vocab.to_disk(p), + "tag_map": lambda p: srsly.write_msgpack(p, tag_map), + "model": lambda p: p.open("wb").write(self.model.to_bytes()), + "cfg": lambda p: srsly.write_json(p, self.cfg) + } exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, 
exclude) @@ -697,12 +692,12 @@ class Tagger(Pipe): lemmatizer=self.vocab.morphology.lemmatizer, exc=self.vocab.morphology.exc) - deserialize = OrderedDict(( - ("cfg", lambda p: self.cfg.update(_load_cfg(p))), - ("vocab", lambda p: self.vocab.from_disk(p)), - ("tag_map", load_tag_map), - ("model", load_model), - )) + deserialize = { + "cfg": lambda p: self.cfg.update(_load_cfg(p)), + "vocab": lambda p: self.vocab.from_disk(p), + "tag_map": load_tag_map, + "model": load_model, + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -719,7 +714,7 @@ class SentenceRecognizer(Tagger): self.vocab = vocab self.model = model self._rehearsal_model = None - self.cfg = OrderedDict(sorted(cfg.items())) + self.cfg = dict(sorted(cfg.items())) self.cfg.setdefault("cnn_maxout_pieces", 2) self.cfg.setdefault("subword_features", True) self.cfg.setdefault("token_vector_width", 12) @@ -816,7 +811,7 @@ class SentenceRecognizer(Tagger): yield def to_bytes(self, exclude=tuple(), **kwargs): - serialize = OrderedDict() + serialize = {} if self.model not in (None, True, False): serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes @@ -833,21 +828,21 @@ class SentenceRecognizer(Tagger): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict(( - ("vocab", lambda b: self.vocab.from_bytes(b)), - ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), - ("model", lambda b: load_model(b)), - )) + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: load_model(b), + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self def to_disk(self, path, exclude=tuple(), **kwargs): - serialize = OrderedDict(( - ("vocab", lambda p: self.vocab.to_disk(p)), - ("model", lambda p: 
p.open("wb").write(self.model.to_bytes())), - ("cfg", lambda p: srsly.write_json(p, self.cfg)) - )) + serialize = { + "vocab": lambda p: self.vocab.to_disk(p), + "model": lambda p: p.open("wb").write(self.model.to_bytes()), + "cfg": lambda p: srsly.write_json(p, self.cfg) + } exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) @@ -861,11 +856,11 @@ class SentenceRecognizer(Tagger): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict(( - ("cfg", lambda p: self.cfg.update(_load_cfg(p))), - ("vocab", lambda p: self.vocab.from_disk(p)), - ("model", load_model), - )) + deserialize = { + "cfg": lambda p: self.cfg.update(_load_cfg(p)), + "vocab": lambda p: self.vocab.from_disk(p), + "model": load_model, + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -1241,7 +1236,7 @@ class TextCategorizer(Pipe): return float(mean_square_error), d_scores def add_label(self, label): - if not isinstance(label, basestring_): + if not isinstance(label, str): raise ValueError(Errors.E187) if label in self.labels: return 0 @@ -1614,7 +1609,7 @@ class EntityLinker(Pipe): token.ent_kb_id_ = kb_id def to_disk(self, path, exclude=tuple(), **kwargs): - serialize = OrderedDict() + serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) @@ -1637,7 +1632,7 @@ class EntityLinker(Pipe): kb.load_bulk(p) self.set_kb(kb) - deserialize = OrderedDict() + deserialize = {} deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["kb"] = load_kb diff --git a/spacy/scorer.py b/spacy/scorer.py index 6238b6ead..82b10a77d 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import division, print_function, 
unicode_literals - import numpy as np from .gold import tags_to_entities, GoldParse, DocAnnotation diff --git a/spacy/strings.pyx b/spacy/strings.pyx index f3457e1a5..0605de96c 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,7 +1,4 @@ # cython: infer_types=True -# coding: utf8 -from __future__ import unicode_literals, absolute_import - cimport cython from libc.string cimport memcpy from libcpp.set cimport set @@ -9,7 +6,6 @@ from libc.stdint cimport uint32_t from murmurhash.mrmr cimport hash64, hash32 import srsly -from .compat import basestring_ from .symbols import IDS as SYMBOLS_BY_STR from .symbols import NAMES as SYMBOLS_BY_INT from .typedefs cimport hash_t @@ -24,7 +20,7 @@ def get_string_id(key): This function optimises for convenience over performance, so shouldn't be used in tight loops. """ - if not isinstance(key, basestring_): + if not isinstance(key, str): return key elif key in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[key] @@ -150,7 +146,7 @@ cdef class StringStore: return key else: return self[key] - + def add(self, string): """Add a string to the StringStore. 
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index b65ae9628..85f23ccbc 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,8 +1,4 @@ -# coding: utf8 -#cython: optimize.unpack_method_calls=False -from __future__ import unicode_literals - - +# cython: optimize.unpack_method_calls=False IDS = { "": NIL, "IS_ALPHA": IS_ALPHA, diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 8b6448a46..19d05e77f 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -1,10 +1,6 @@ # cython: infer_types=True # cython: cdivision=True # cython: boundscheck=False -# coding: utf-8 -from __future__ import unicode_literals, print_function - -from collections import OrderedDict import numpy cimport cython.parallel import numpy.random @@ -249,7 +245,7 @@ class ParserModel(Model): def resize_output(self, new_output): if len(self._layers) == 2: - return + return if new_output == self.upper.nO: return smaller = self.upper @@ -485,7 +481,7 @@ cdef class precompute_hiddens: ops = NumpyOps() else: ops = CupyOps() - + if self.activation == "maxout": state_vector, mask = ops.maxout(state_vector) else: diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 45fd1170b..5ec169428 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -1,12 +1,9 @@ # cython: profile=True # cython: cdivision=True # cython: infer_types=True -# coding: utf-8 -from __future__ import unicode_literals - from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool -from collections import OrderedDict, defaultdict, Counter +from collections import defaultdict, Counter from thinc.extra.search cimport Beam import json @@ -25,7 +22,7 @@ from ..tokens.doc cimport Doc, set_children_from_heads # Calculate cost as gold/not gold. We don't use scalar value anyway. 
cdef int BINARY_COSTS = 1 cdef weight_t MIN_SCORE = -90000 -cdef attr_t SUBTOK_LABEL = hash_string('subtok') +cdef attr_t SUBTOK_LABEL = hash_string(u'subtok') DEF NON_MONOTONIC = True DEF USE_BREAK = True diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 7467aa342..5dfa20b7d 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -1,9 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam -from collections import OrderedDict, Counter +from collections import Counter from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c98baf6fd..14d9e54d4 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -1,10 +1,6 @@ # cython: infer_types=True # cython: cdivision=True # cython: boundscheck=False -# coding: utf-8 -from __future__ import unicode_literals, print_function - -from collections import OrderedDict import numpy cimport cython.parallel import numpy.random @@ -692,22 +688,22 @@ cdef class Parser: return self def to_bytes(self, exclude=tuple(), **kwargs): - serializers = OrderedDict(( - ('model', lambda: (self.model.to_bytes() if self.model is not True else True)), - ('vocab', lambda: self.vocab.to_bytes()), - ('moves', lambda: self.moves.to_bytes(exclude=["strings"])), - ('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)) - )) + serializers = { + "model": lambda: (self.model.to_bytes() if self.model is not True else True), + "vocab": lambda: self.vocab.to_bytes(), + "moves": lambda: self.moves.to_bytes(exclude=["strings"]), + "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) + } exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): - deserializers = OrderedDict(( - ('vocab', lambda b: 
self.vocab.from_bytes(b)), - ('moves', lambda b: self.moves.from_bytes(b, exclude=["strings"])), - ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), - ('model', lambda b: None) - )) + deserializers = { + "vocab": lambda b: self.vocab.from_bytes(b), + "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: None + } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) if 'model' not in exclude: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 2ec6b61ac..0f738f99f 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,12 +1,9 @@ -# coding: utf-8 # cython: profile=True # cython: infer_types=True """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. """ -from __future__ import unicode_literals - from copy import copy from spacy.gold import Example diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 2a15a2de1..47b37946c 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -1,7 +1,4 @@ -# coding: utf-8 # cython: infer_types=True -from __future__ import unicode_literals - import numpy from ..tokens.doc cimport Doc diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 65097f114..62e369091 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,12 +1,9 @@ # cython: infer_types=True -# coding: utf-8 -from __future__ import unicode_literals - from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam -from collections import OrderedDict, Counter +from collections import Counter import srsly from . 
cimport _beam_utils diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 816970e61..ba7b67e25 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 6c69e699a..766dcb739 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.pipeline import EntityRecognizer from spacy.tokens import Span import pytest diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 7b513cfab..6be6e3867 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 120fb6e28..d986d160c 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.tokens import Doc diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 86c7fbf72..41a060b7b 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 5d570af53..67ebc06d6 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git 
a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py index 2b6970a38..28cb66714 100644 --- a/spacy/tests/doc/test_pickle_doc.py +++ b/spacy/tests/doc/test_pickle_doc.py @@ -1,8 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.language import Language -from spacy.compat import pickle, unicode_ +from spacy.compat import pickle def test_pickle_single_doc(): @@ -16,9 +13,9 @@ def test_pickle_single_doc(): def test_list_of_docs_pickles_efficiently(): nlp = Language() for i in range(10000): - _ = nlp.vocab[unicode_(i)] # noqa: F841 + _ = nlp.vocab[str(i)] # noqa: F841 one_pickled = pickle.dumps(nlp("0"), -1) - docs = list(nlp.pipe(unicode_(i) for i in range(100))) + docs = list(nlp.pipe(str(i) for i in range(100))) many_pickled = pickle.dumps(docs, -1) assert len(many_pickled) < (len(one_pickled) * 2) many_unpickled = pickle.loads(many_pickled) diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 5bdf78f39..c82c04eeb 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import LEMMA from spacy.vocab import Vocab diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index d074fddc6..33b6fbe81 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.tokens import Doc, Token diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 01bb93c50..9fb552d44 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import ORTH, LENGTH from spacy.tokens import Doc, Span 
diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index a063a6569..18243c306 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.cli._schemas import TRAINING_SCHEMA from spacy.util import get_json_validator, validate_json diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index bff2a95c6..cff1d3327 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py index 2877bfeea..352460581 100644 --- a/spacy/tests/doc/test_underscore.py +++ b/spacy/tests/doc/test_underscore.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from mock import Mock from spacy.tokens import Doc, Span, Token diff --git a/spacy/tests/lang/ar/test_exceptions.py b/spacy/tests/lang/ar/test_exceptions.py index 3cfc380d2..125220caf 100644 --- a/spacy/tests/lang/ar/test_exceptions.py +++ b/spacy/tests/lang/ar/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ar/test_text.py b/spacy/tests/lang/ar/test_text.py index 109c3721a..f4a8cc1e3 100644 --- a/spacy/tests/lang/ar/test_text.py +++ b/spacy/tests/lang/ar/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - def test_ar_tokenizer_handles_long_text(ar_tokenizer): text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين. 
diff --git a/spacy/tests/lang/bn/test_tokenizer.py b/spacy/tests/lang/bn/test_tokenizer.py index 62dd52778..5b18c5269 100644 --- a/spacy/tests/lang/bn/test_tokenizer.py +++ b/spacy/tests/lang/bn/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py index 56156c328..71098f094 100644 --- a/spacy/tests/lang/ca/test_exception.py +++ b/spacy/tests/lang/ca/test_exception.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py index 4583a62b9..83a75f056 100644 --- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py index 1506016d4..38f5fc708 100644 --- a/spacy/tests/lang/ca/test_text.py +++ b/spacy/tests/lang/ca/test_text.py @@ -1,10 +1,4 @@ -# coding: utf-8 - """Test that longer and mixed texts are tokenized correctly.""" - - -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index a522ab5e8..603378ea7 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/da/test_prefix_suffix_infix.py b/spacy/tests/lang/da/test_prefix_suffix_infix.py index 8b43bf360..e36b3cdb9 100644 --- a/spacy/tests/lang/da/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/da/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/da/test_text.py 
b/spacy/tests/lang/da/test_text.py index 07b134e2d..3c6cca5ac 100644 --- a/spacy/tests/lang/da/test_text.py +++ b/spacy/tests/lang/da/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.da.lex_attrs import like_num diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py index 2e065870e..a4614f6c4 100644 --- a/spacy/tests/lang/de/test_exceptions.py +++ b/spacy/tests/lang/de/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/de/test_parser.py b/spacy/tests/lang/de/test_parser.py index 5c8694da3..c897dcf2f 100644 --- a/spacy/tests/lang/de/test_parser.py +++ b/spacy/tests/lang/de/test_parser.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...util import get_doc diff --git a/spacy/tests/lang/de/test_prefix_suffix_infix.py b/spacy/tests/lang/de/test_prefix_suffix_infix.py index 13e109395..82bd8ed69 100644 --- a/spacy/tests/lang/de/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/de/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/de/test_text.py b/spacy/tests/lang/de/test_text.py index b3fb1eaa5..22711763e 100644 --- a/spacy/tests/lang/de/test_text.py +++ b/spacy/tests/lang/de/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/el/test_exception.py b/spacy/tests/lang/el/test_exception.py index b8d10fb69..a4656ea98 100644 --- a/spacy/tests/lang/el/test_exception.py +++ b/spacy/tests/lang/el/test_exception.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/el/test_text.py b/spacy/tests/lang/el/test_text.py index a6395ab4a..1b3ef6182 100644 --- a/spacy/tests/lang/el/test_text.py +++ 
b/spacy/tests/lang/el/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py index 7f939011f..f5302cb31 100644 --- a/spacy/tests/lang/en/test_customized_tokenizer.py +++ b/spacy/tests/lang/en/test_customized_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from spacy.lang.en import English diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index 6285a9408..b2e941dab 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/en/test_indices.py b/spacy/tests/lang/en/test_indices.py index 8a7bc0323..d50c75fc5 100644 --- a/spacy/tests/lang/en/test_indices.py +++ b/spacy/tests/lang/en/test_indices.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - def test_en_simple_punct(en_tokenizer): text = "to walk, do foo" diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 7dc47f9cc..6739b5137 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import numpy from spacy.attrs import HEAD, DEP from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root diff --git a/spacy/tests/lang/en/test_parser.py b/spacy/tests/lang/en/test_parser.py index ce696bc25..057143696 100644 --- a/spacy/tests/lang/en/test_parser.py +++ b/spacy/tests/lang/en/test_parser.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...util import get_doc diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py index 
3dccd6bcf..8c9c58fea 100644 --- a/spacy/tests/lang/en/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/en/test_punct.py b/spacy/tests/lang/en/test_punct.py index 61274cf14..4dc6ddfe4 100644 --- a/spacy/tests/lang/en/test_punct.py +++ b/spacy/tests/lang/en/test_punct.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import compile_prefix_regex from spacy.lang.punctuation import TOKENIZER_PREFIXES diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 40bd110e8..ba7b2f2cf 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ...util import get_doc, apply_transition_sequence diff --git a/spacy/tests/lang/en/test_tagger.py b/spacy/tests/lang/en/test_tagger.py index 567fd5a44..d9eced2ff 100644 --- a/spacy/tests/lang/en/test_tagger.py +++ b/spacy/tests/lang/en/test_tagger.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...util import get_doc diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py index a7ebde989..c5d56d885 100644 --- a/spacy/tests/lang/en/test_text.py +++ b/spacy/tests/lang/en/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.en.lex_attrs import like_num diff --git a/spacy/tests/lang/es/test_exception.py b/spacy/tests/lang/es/test_exception.py index 8d6164058..90d897a4c 100644 --- a/spacy/tests/lang/es/test_exception.py +++ b/spacy/tests/lang/es/test_exception.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index acd572b48..af7b0212d 100644 --- 
a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fi/test_text.py b/spacy/tests/lang/fi/test_text.py index 2dd92597e..dbb67ad7a 100644 --- a/spacy/tests/lang/fi/test_text.py +++ b/spacy/tests/lang/fi/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index 17f6f0ccc..6d5a14e6e 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py index 93dbf0993..98d318f6e 100644 --- a/spacy/tests/lang/fr/test_exceptions.py +++ b/spacy/tests/lang/fr/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fr/test_prefix_suffix_infix.py b/spacy/tests/lang/fr/test_prefix_suffix_infix.py index ca6bdbd87..01d50b0a6 100644 --- a/spacy/tests/lang/fr/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/fr/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.language import Language from spacy.lang.punctuation import TOKENIZER_INFIXES diff --git a/spacy/tests/lang/fr/test_text.py b/spacy/tests/lang/fr/test_text.py index 24b4c4532..01231f593 100644 --- a/spacy/tests/lang/fr/test_text.py +++ b/spacy/tests/lang/fr/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.fr.lex_attrs import like_num diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index 29bc1c759..78127ef7c 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ 
b/spacy/tests/lang/ga/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/he/test_tokenizer.py b/spacy/tests/lang/he/test_tokenizer.py index f138ec6e7..3131014a3 100644 --- a/spacy/tests/lang/he/test_tokenizer.py +++ b/spacy/tests/lang/he/test_tokenizer.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py index fa8e132c0..4ec720c60 100644 --- a/spacy/tests/lang/hu/test_tokenizer.py +++ b/spacy/tests/lang/hu/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/id/test_prefix_suffix_infix.py b/spacy/tests/lang/id/test_prefix_suffix_infix.py index e86a98ee3..2a81dab01 100644 --- a/spacy/tests/lang/id/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/id/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/id/test_text.py b/spacy/tests/lang/id/test_text.py index 915d268ae..ed6487b68 100644 --- a/spacy/tests/lang/id/test_text.py +++ b/spacy/tests/lang/id/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.id.lex_attrs import like_num diff --git a/spacy/tests/lang/it/test_prefix_suffix_infix.py b/spacy/tests/lang/it/test_prefix_suffix_infix.py index f84351fd7..46f66b5e6 100644 --- a/spacy/tests/lang/it/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/it/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index cfff0fcfe..4cb3110b3 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -1,6 +1,3 @@ -# 
coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ad8bfaa00..481f346bb 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py index 42c306c11..7782ca4bc 100644 --- a/spacy/tests/lang/ko/test_lemmatization.py +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index b8fe7959c..eac309857 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest # fmt: off diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py index 7ca2394b7..5b5005ae7 100644 --- a/spacy/tests/lang/lb/test_exceptions.py +++ b/spacy/tests/lang/lb/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/lb/test_prefix_suffix_infix.py b/spacy/tests/lang/lb/test_prefix_suffix_infix.py index d85f932be..3958d1543 100644 --- a/spacy/tests/lang/lb/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/lb/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/lb/test_text.py b/spacy/tests/lang/lb/test_text.py index 36464b379..b0ba76b6b 100644 --- a/spacy/tests/lang/lb/test_text.py +++ b/spacy/tests/lang/lb/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/lt/test_text.py 
b/spacy/tests/lang/lt/test_text.py index cac32aa4d..8d9201cd9 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/nb/test_tokenizer.py b/spacy/tests/lang/nb/test_tokenizer.py index f72d310e8..2da6e8d40 100644 --- a/spacy/tests/lang/nb/test_tokenizer.py +++ b/spacy/tests/lang/nb/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/nl/test_text.py b/spacy/tests/lang/nl/test_text.py index 4045b1c39..8bc72cc6d 100644 --- a/spacy/tests/lang/nl/test_text.py +++ b/spacy/tests/lang/nl/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.nl.lex_attrs import like_num diff --git a/spacy/tests/lang/pl/test_text.py b/spacy/tests/lang/pl/test_text.py index ec9b18084..e8654a498 100644 --- a/spacy/tests/lang/pl/test_text.py +++ b/spacy/tests/lang/pl/test_text.py @@ -1,9 +1,4 @@ -# coding: utf-8 """Words like numbers are recognized correctly.""" - - -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/pl/test_tokenizer.py b/spacy/tests/lang/pl/test_tokenizer.py index 9d0034589..a04b4fdcb 100644 --- a/spacy/tests/lang/pl/test_tokenizer.py +++ b/spacy/tests/lang/pl/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest DOT_TESTS = [ diff --git a/spacy/tests/lang/pt/test_text.py b/spacy/tests/lang/pt/test_text.py index 39dfff2c1..3a9162b80 100644 --- a/spacy/tests/lang/pt/test_text.py +++ b/spacy/tests/lang/pt/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.pt.lex_attrs import like_num diff --git a/spacy/tests/lang/ro/test_tokenizer.py b/spacy/tests/lang/ro/test_tokenizer.py index a327174e5..64c072470 100644 --- 
a/spacy/tests/lang/ro/test_tokenizer.py +++ b/spacy/tests/lang/ro/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ru/test_exceptions.py b/spacy/tests/lang/ru/test_exceptions.py index a8f0c3429..4fb417df8 100644 --- a/spacy/tests/lang/ru/test_exceptions.py +++ b/spacy/tests/lang/ru/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index b228fded8..40dcf4cf8 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ...util import get_doc diff --git a/spacy/tests/lang/ru/test_text.py b/spacy/tests/lang/ru/test_text.py index c5bff6973..b0eaf66bb 100644 --- a/spacy/tests/lang/ru/test_text.py +++ b/spacy/tests/lang/ru/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.ru.lex_attrs import like_num diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py index 5507f9f09..e05a479aa 100644 --- a/spacy/tests/lang/ru/test_tokenizer.py +++ b/spacy/tests/lang/ru/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py index 285e99996..fa92e5e2d 100644 --- a/spacy/tests/lang/sr/test_exceptions.py +++ b/spacy/tests/lang/sr/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sr/test_tokenizer.py b/spacy/tests/lang/sr/test_tokenizer.py index c4672b3ef..03a0470bd 100644 --- a/spacy/tests/lang/sr/test_tokenizer.py +++ b/spacy/tests/lang/sr/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 
-from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py index c977a4183..5d3acf3d5 100644 --- a/spacy/tests/lang/sv/test_exceptions.py +++ b/spacy/tests/lang/sv/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index ac7c066ba..ad335c317 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ...util import get_doc diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py index f3fdd9a9e..bbb0ff415 100644 --- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_text.py b/spacy/tests/lang/sv/test_text.py index 9ea1851ae..dc4911ab6 100644 --- a/spacy/tests/lang/sv/test_text.py +++ b/spacy/tests/lang/sv/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - def test_sv_tokenizer_handles_long_text(sv_tokenizer): text = """Det var så härligt ute på landet. 
Det var sommar, majsen var gul, havren grön, diff --git a/spacy/tests/lang/sv/test_tokenizer.py b/spacy/tests/lang/sv/test_tokenizer.py index 894b5aa6a..8871f4414 100644 --- a/spacy/tests/lang/sv/test_tokenizer.py +++ b/spacy/tests/lang/sv/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 4bb5aac70..b39109455 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 5c701fc22..de1871e64 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class diff --git a/spacy/tests/lang/th/test_tokenizer.py b/spacy/tests/lang/th/test_tokenizer.py index 265c7753d..1e1ba52dc 100644 --- a/spacy/tests/lang/th/test_tokenizer.py +++ b/spacy/tests/lang/th/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/tt/test_tokenizer.py b/spacy/tests/lang/tt/test_tokenizer.py index 66ef9c181..7e0748931 100644 --- a/spacy/tests/lang/tt/test_tokenizer.py +++ b/spacy/tests/lang/tt/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/uk/test_tokenizer.py b/spacy/tests/lang/uk/test_tokenizer.py index f744b32b0..eb647a041 100644 --- a/spacy/tests/lang/uk/test_tokenizer.py +++ b/spacy/tests/lang/uk/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals 
- import pytest diff --git a/spacy/tests/lang/uk/test_tokenizer_exc.py b/spacy/tests/lang/uk/test_tokenizer_exc.py index 328e1d287..4fb4a6b31 100644 --- a/spacy/tests/lang/uk/test_tokenizer_exc.py +++ b/spacy/tests/lang/uk/test_tokenizer_exc.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ur/test_prefix_suffix_infix.py b/spacy/tests/lang/ur/test_prefix_suffix_infix.py index de11c9b34..e9f3272f4 100644 --- a/spacy/tests/lang/ur/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ur/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ur/test_text.py b/spacy/tests/lang/ur/test_text.py index 546e79182..5da831cf8 100644 --- a/spacy/tests/lang/ur/test_text.py +++ b/spacy/tests/lang/ur/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/yo/test_text.py b/spacy/tests/lang/yo/test_text.py index ce6408b67..48b689f3d 100644 --- a/spacy/tests/lang/yo/test_text.py +++ b/spacy/tests/lang/yo/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.yo.lex_attrs import like_num diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py index 235f597a5..d48feaee5 100644 --- a/spacy/tests/lang/zh/test_text.py +++ b/spacy/tests/lang/zh/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index 36d94beb5..f71785337 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e4584d03a..adeef834d 
100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from mock import Mock diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 240ace537..a6a82f2e2 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 2db2f9eb3..c879cc0fe 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.matcher import Matcher from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 7a6585e06..23cd80d1d 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from mock import Mock from spacy.matcher import PhraseMatcher diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index 41f807143..4cf6b1206 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.morphology import Morphology from spacy.strings import StringStore, get_string_id diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index bee9db82e..a24fd143d 100644 --- a/spacy/tests/parser/test_add_label.py +++ 
b/spacy/tests/parser/test_add_label.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from thinc.neural.optimizers import Adam from thinc.neural.ops import NumpyOps diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 0d9bd1ad0..dd593f7d3 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.pipeline import DependencyParser diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 8329391ca..8d5043487 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.en import English diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 468b3ff40..0906fbb94 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy._ml import Tok2Vec from spacy.vocab import Vocab diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index 9dca99255..24997e47c 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import numpy from spacy.vocab import Vocab diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 8bf8111c1..86d9a0180 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc from 
spacy.syntax.nonproj import is_nonproj_tree diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index fb5301718..75091ec07 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ..util import get_doc, apply_transition_sequence diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index eb206458e..ed95718f1 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ..util import get_doc diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index d935494d6..ed6aef096 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from thinc.neural.optimizers import Adam from thinc.neural.ops import NumpyOps diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py index 945173faf..59ae4e629 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.tokens.doc import Doc diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index 198f11bcd..5c246538c 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -1,11 +1,7 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy.language from spacy.language import Language, component from spacy.analysis import print_summary, validate_attrs from spacy.analysis import get_assigns_for_attr, get_requires_for_attr -from spacy.compat import is_python2 from 
mock import Mock, ANY import pytest @@ -17,8 +13,7 @@ def test_component_decorator_function(): return doc assert test_component.name == "test" - if not is_python2: - assert test_component.__doc__ == "docstring" + assert test_component.__doc__ == "docstring" assert test_component("foo") == "foo" @@ -45,13 +40,12 @@ def test_component_decorator_class(): assert test_component("foo") == "foo" assert hasattr(test_component, "custom") assert test_component.custom("bar") == "bar" - if not is_python2: - assert TestComponent.__doc__ == "docstring1" - assert TestComponent.__call__.__doc__ == "docstring2" - assert TestComponent.custom.__doc__ == "docstring3" - assert test_component.__doc__ == "docstring1" - assert test_component.__call__.__doc__ == "docstring2" - assert test_component.custom.__doc__ == "docstring3" + assert TestComponent.__doc__ == "docstring1" + assert TestComponent.__call__.__doc__ == "docstring2" + assert TestComponent.custom.__doc__ == "docstring3" + assert test_component.__doc__ == "docstring1" + assert test_component.__call__.__doc__ == "docstring2" + assert test_component.custom.__doc__ == "docstring3" def test_component_decorator_assigns(): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 8023f72a6..9ff5f8194 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.kb import KnowledgeBase diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 660ad3b28..210a56cea 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Span from spacy.language import Language diff --git a/spacy/tests/pipeline/test_factories.py 
b/spacy/tests/pipeline/test_factories.py index 5efcc319a..0a9a4d3c9 100644 --- a/spacy/tests/pipeline/test_factories.py +++ b/spacy/tests/pipeline/test_factories.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language from spacy.tokens import Span diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index 5b5fcd2fd..ca983267f 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.pipeline.functions import merge_subtokens from ..util import get_doc diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 27fb57b18..3ec8b508d 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 359552c5b..78ab6d2d1 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import spacy from spacy.pipeline import Sentencizer diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index a5bda9090..ca9dab009 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 44834c2a8..9e37e92e1 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,6 +1,3 @@ -# coding: utf8 
-from __future__ import unicode_literals - import pytest import random import numpy.random diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 61d2c9cd2..a3148aa90 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import random from spacy.matcher import Matcher diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index 924c5aa3e..7d81c3148 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 4b27901ad..d9e1d663a 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import gc import numpy diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index e95c1a9b9..2c25b6d73 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import numpy from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 0acb25e90..49e7de179 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy import displacy from spacy.lang.en import English diff --git 
a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index d05759c31..cc893e472 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.en import English from spacy.lang.de import German @@ -9,11 +6,10 @@ from spacy.matcher import Matcher, PhraseMatcher from spacy.tokens import Doc from spacy.vocab import Vocab from spacy.attrs import ENT_IOB, ENT_TYPE -from spacy.compat import pickle, is_python2, unescape_unicode +from spacy.compat import pickle from spacy import displacy from spacy.util import decaying import numpy -import re from spacy.vectors import Vectors from ..util import get_doc @@ -211,73 +207,6 @@ def test_issue3345(): assert ner.moves.is_valid(state, "B-GPE") -if is_python2: - # If we have this test in Python 3, pytest chokes, as it can't print the - # string above in the xpass message. - prefix_search = ( - b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" - b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" 
- b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" - b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" - b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" - b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" - b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" - b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" - b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" - b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" - b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" - b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" - b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" - b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" - b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" - b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" - b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" - b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" - b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" - b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" - b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" - b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" - b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" - b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" - b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" - b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" - b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" - b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" - b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" - b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" - b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" - 
b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" - b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" - b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0" - b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77" - b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137" - b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E" - b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877" - b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45" - b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129" - b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C" - b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245" - b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A" - b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86" - b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0" - b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1" - b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6" - b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250" - b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400" - b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700" - b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810" - b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890" - b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940" - b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2" - b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF" - b"\\U0001FA60-\\U0001FA6D]" - ) - - def test_issue3356(): - pattern = re.compile(unescape_unicode(prefix_search.decode("utf8"))) - assert not pattern.search("hello") - - def test_issue3410(): texts = ["Hello world", "This is a 
test"] nlp = English() diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py index 35731ac12..3d8ee9922 100644 --- a/spacy/tests/regression/test_issue3521.py +++ b/spacy/tests/regression/test_issue3521.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py index c6f513730..aa77028fb 100644 --- a/spacy/tests/regression/test_issue3526.py +++ b/spacy/tests/regression/test_issue3526.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Span from spacy.language import Language diff --git a/spacy/tests/regression/test_issue3531.py b/spacy/tests/regression/test_issue3531.py index 7b9d0bd2a..4c65a5bfe 100644 --- a/spacy/tests/regression/test_issue3531.py +++ b/spacy/tests/regression/test_issue3531.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy import displacy diff --git a/spacy/tests/regression/test_issue3540.py b/spacy/tests/regression/test_issue3540.py index 19d89c797..be9e04b0b 100644 --- a/spacy/tests/regression/test_issue3540.py +++ b/spacy/tests/regression/test_issue3540.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.tokens import Doc import numpy as np diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py index 587b3a857..b3af59c2e 100644 --- a/spacy/tests/regression/test_issue3549.py +++ b/spacy/tests/regression/test_issue3549.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.matcher import Matcher from spacy.errors import MatchPatternError diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py index 8444f11f2..de047bcbc 100644 --- a/spacy/tests/regression/test_issue3555.py +++ 
b/spacy/tests/regression/test_issue3555.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc, Token from spacy.matcher import Matcher diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py index bc8603888..367961ab1 100644 --- a/spacy/tests/regression/test_issue3611.py +++ b/spacy/tests/regression/test_issue3611.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy from spacy.util import minibatch, compounding diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py index d935db17f..51561b3ac 100644 --- a/spacy/tests/regression/test_issue3625.py +++ b/spacy/tests/regression/test_issue3625.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.hi import Hindi diff --git a/spacy/tests/regression/test_issue3803.py b/spacy/tests/regression/test_issue3803.py index 37d15a5cf..ab5250edf 100644 --- a/spacy/tests/regression/test_issue3803.py +++ b/spacy/tests/regression/test_issue3803.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.es import Spanish diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py index fe722a681..27b1f5f29 100644 --- a/spacy/tests/regression/test_issue3839.py +++ b/spacy/tests/regression/test_issue3839.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py index 62e8eabd6..0a851e869 100644 --- a/spacy/tests/regression/test_issue3869.py +++ b/spacy/tests/regression/test_issue3869.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.attrs import IS_ALPHA from spacy.lang.en import English diff --git 
a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py index 5cd245231..8500c09aa 100644 --- a/spacy/tests/regression/test_issue3879.py +++ b/spacy/tests/regression/test_issue3879.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py index c060473f5..6e8ab6f43 100644 --- a/spacy/tests/regression/test_issue3880.py +++ b/spacy/tests/regression/test_issue3880.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English import pytest diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py index 1b2dcea25..fa616db1d 100644 --- a/spacy/tests/regression/test_issue3882.py +++ b/spacy/tests/regression/test_issue3882.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.displacy import parse_deps from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py index 33230112f..6e4c9eeaa 100644 --- a/spacy/tests/regression/test_issue3951.py +++ b/spacy/tests/regression/test_issue3951.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3959.py b/spacy/tests/regression/test_issue3959.py index c1f7fe100..7db28a31f 100644 --- a/spacy/tests/regression/test_issue3959.py +++ b/spacy/tests/regression/test_issue3959.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from ..util import make_tempdir diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py index ae60fa0fa..971c9b08e 100644 --- a/spacy/tests/regression/test_issue3962.py +++ 
b/spacy/tests/regression/test_issue3962.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from ..util import get_doc diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py index 22b8d486e..fe5388950 100644 --- a/spacy/tests/regression/test_issue3972.py +++ b/spacy/tests/regression/test_issue3972.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import PhraseMatcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py index d075128aa..3ac26d3ab 100644 --- a/spacy/tests/regression/test_issue4002.py +++ b/spacy/tests/regression/test_issue4002.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import PhraseMatcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py index e774feb2d..7153594db 100644 --- a/spacy/tests/regression/test_issue4030.py +++ b/spacy/tests/regression/test_issue4030.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy from spacy.util import minibatch, compounding diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py index 00a8882d3..6644a8eda 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py index cc84cebf8..c52ded395 100644 --- a/spacy/tests/regression/test_issue4054.py +++ b/spacy/tests/regression/test_issue4054.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.vocab import Vocab import 
spacy from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py index d288f46c4..4849aa238 100644 --- a/spacy/tests/regression/test_issue4120.py +++ b/spacy/tests/regression/test_issue4120.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue4133.py b/spacy/tests/regression/test_issue4133.py index 93262f8cf..a726806d7 100644 --- a/spacy/tests/regression/test_issue4133.py +++ b/spacy/tests/regression/test_issue4133.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.tokens import Doc from spacy.vocab import Vocab diff --git a/spacy/tests/regression/test_issue4190.py b/spacy/tests/regression/test_issue4190.py index eb4eb8648..97d532d2a 100644 --- a/spacy/tests/regression/test_issue4190.py +++ b/spacy/tests/regression/test_issue4190.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.tokenizer import Tokenizer from spacy import util diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py index ef871bf9f..891f03b30 100644 --- a/spacy/tests/regression/test_issue4267.py +++ b/spacy/tests/regression/test_issue4267.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.pipeline import EntityRuler diff --git a/spacy/tests/regression/test_issue4272.py b/spacy/tests/regression/test_issue4272.py index c57704d71..4bac97a44 100644 --- a/spacy/tests/regression/test_issue4272.py +++ b/spacy/tests/regression/test_issue4272.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.el import Greek diff --git a/spacy/tests/regression/test_issue4278.py b/spacy/tests/regression/test_issue4278.py index 
cb09340ff..ffbc41226 100644 --- a/spacy/tests/regression/test_issue4278.py +++ b/spacy/tests/regression/test_issue4278.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language from spacy.pipeline import Pipe diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index c68f745a7..a3f6f69df 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from collections import defaultdict from spacy.pipeline import EntityRecognizer diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py index 484d5d280..4978e0c8e 100644 --- a/spacy/tests/regression/test_issue4348.py +++ b/spacy/tests/regression/test_issue4348.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.util import minibatch, compounding import pytest diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py index ab6192744..917847a05 100644 --- a/spacy/tests/regression/test_issue4367.py +++ b/spacy/tests/regression/test_issue4367.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.tokens import DocBin diff --git a/spacy/tests/regression/test_issue4373.py b/spacy/tests/regression/test_issue4373.py index 57d7547da..dbde1624e 100644 --- a/spacy/tests/regression/test_issue4373.py +++ b/spacy/tests/regression/test_issue4373.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher, PhraseMatcher from spacy.vocab import Vocab diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index 89332ca2f..80d37b1e6 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -1,6 
+1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import srsly from spacy.gold import GoldCorpus from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4528.py b/spacy/tests/regression/test_issue4528.py index 460449003..6f96c9f2d 100644 --- a/spacy/tests/regression/test_issue4528.py +++ b/spacy/tests/regression/test_issue4528.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.tokens import Doc, DocBin diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py index 381957be6..fa962c053 100644 --- a/spacy/tests/regression/test_issue4529.py +++ b/spacy/tests/regression/test_issue4529.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.gold import GoldParse diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py index 8ec9a0bd1..74bb5de10 100644 --- a/spacy/tests/regression/test_issue4590.py +++ b/spacy/tests/regression/test_issue4590.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from mock import Mock from spacy.matcher import DependencyMatcher from ..util import get_doc diff --git a/spacy/tests/regression/test_issue4651.py b/spacy/tests/regression/test_issue4651.py index eb49f4a38..3f6c1a57c 100644 --- a/spacy/tests/regression/test_issue4651.py +++ b/spacy/tests/regression/test_issue4651.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.pipeline import EntityRuler diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py index 8fa4f9259..149e1431b 100644 --- a/spacy/tests/regression/test_issue4674.py +++ b/spacy/tests/regression/test_issue4674.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.kb import KnowledgeBase from spacy.util import ensure_path diff --git 
a/spacy/tests/regression/test_issue4707.py b/spacy/tests/regression/test_issue4707.py index e710881d7..d9798ef84 100644 --- a/spacy/tests/regression/test_issue4707.py +++ b/spacy/tests/regression/test_issue4707.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.util import load_model_from_path from spacy.lang.en import English diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index ef2b1ee89..615bb1cd9 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,13 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import spacy - import pytest - from spacy.lang.en import English from spacy.tokens import Doc, DocBin -from spacy.compat import path2str from ..util import make_tempdir @@ -43,7 +37,7 @@ def test_serialize_doc_roundtrip_disk_str_path(en_vocab): doc = Doc(en_vocab, words=["hello", "world"]) with make_tempdir() as d: file_path = d / "doc" - file_path = path2str(file_path) + file_path = str(file_path) doc.to_disk(file_path) doc_d = Doc(en_vocab).from_disk(file_path) assert doc.to_bytes() == doc_d.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_extension_attrs.py b/spacy/tests/serialize/test_serialize_extension_attrs.py index 1881b7d0c..b8a31ab5e 100644 --- a/spacy/tests/serialize/test_serialize_extension_attrs.py +++ b/spacy/tests/serialize/test_serialize_extension_attrs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc from spacy.vocab import Vocab @@ -10,9 +7,7 @@ from spacy.vocab import Vocab def doc_w_attrs(en_tokenizer): Doc.set_extension("_test_attr", default=False) Doc.set_extension("_test_prop", getter=lambda doc: len(doc.text)) - Doc.set_extension( - "_test_method", method=lambda doc, arg: "{}{}".format(len(doc.text), arg) - ) + Doc.set_extension("_test_method", method=lambda doc, arg: 
f"{len(doc.text)}{arg}") doc = en_tokenizer("This is a test.") doc._._test_attr = "test" return doc @@ -24,4 +19,4 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs): assert doc._.has("_test_attr") assert doc._._test_attr == "test" assert doc._._test_prop == len(doc.text) - assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test") + assert doc._._test_method("test") == f"{len(doc.text)}test" diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index b19c11864..91036a496 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.util import ensure_path from spacy.kb import KnowledgeBase diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index efc5d181c..4089a0d07 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from spacy.language import Language diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 797fa95f8..0ad9bc4d4 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 9a273980c..f504ed048 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - import pytest from spacy.util import get_lang_class from spacy.tokenizer import Tokenizer diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 1671845ee..359a0657f 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.strings import StringStore diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py index 77f1af020..ad56e4c54 100644 --- a/spacy/tests/test_architectures.py +++ b/spacy/tests/test_architectures.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy import registry from thinc.v2v import Affine diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3b75e760a..b4aebe521 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.en import English diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 2d1f1bd8f..4436b437f 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy import displacy from spacy.displacy.render import DependencyRenderer @@ -80,10 +77,10 @@ def test_displacy_rtl(): html = displacy.render(doc, page=True, style="dep") assert "direction: rtl" in html assert 'direction="rtl"' in html - assert 'lang="{}"'.format(nlp.lang) in html + assert f'lang="{nlp.lang}"' in html html = displacy.render(doc, page=True, style="ent") assert "direction: rtl" in html - assert 'lang="{}"'.format(nlp.lang) in html + assert f'lang="{nlp.lang}"' in html def test_displacy_render_wrapper(en_vocab): diff --git 
a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 9d644d062..46c54b879 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,9 +1,10 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import spacy from spacy.errors import AlignmentError -from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation +from spacy.gold import ( + biluo_tags_from_offsets, + offsets_from_biluo_tags, + Example, + DocAnnotation, +) from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo from spacy.gold import GoldCorpus, docs_to_json, align from spacy.lang.en import English @@ -14,14 +15,37 @@ from .util import make_tempdir import pytest import srsly + @pytest.fixture def doc(): text = "Sarah's sister flew to Silicon Valley via London." - tags = ['NNP', 'POS', 'NN', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'] + tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] # head of '.' is intentionally nonprojective for testing heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] - deps = ['poss', 'case', 'nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'] - lemmas = ['Sarah', "'s", 'sister', 'fly', 'to', 'Silicon', 'Valley', 'via', 'London', '.'] + deps = [ + "poss", + "case", + "nsubj", + "ROOT", + "prep", + "compound", + "pobj", + "prep", + "pobj", + "punct", + ] + lemmas = [ + "Sarah", + "'s", + "sister", + "fly", + "to", + "Silicon", + "Valley", + "via", + "London", + ".", + ] biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] cats = {"TRAVEL": 1.0, "BAKING": 0.0} nlp = English() @@ -45,7 +69,7 @@ def merged_dict(): "words": ["Hi", "there", "everyone", "It", "is", "just", "me"], "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"], "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0], - } + } def test_gold_biluo_U(en_vocab): @@ -141,7 +165,9 @@ def test_roundtrip_docs_to_json(doc): deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] lemmas = 
[t.lemma_ for t in doc] - biluo_tags = iob_to_biluo([t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]) + biluo_tags = iob_to_biluo( + [t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc] + ) cats = doc.cats # roundtrip to JSON @@ -214,7 +240,6 @@ def test_roundtrip_docs_to_json(doc): def test_projective_train_vs_nonprojective_dev(doc): nlp = English() - text = doc.text deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] @@ -244,9 +269,6 @@ def test_projective_train_vs_nonprojective_dev(doc): def test_ignore_misaligned(doc): nlp = English() text = doc.text - deps = [t.dep_ for t in doc] - heads = [t.head.i for t in doc] - with make_tempdir() as tmpdir: jsonl_file = tmpdir / "test.jsonl" data = [docs_to_json(doc)] @@ -268,17 +290,12 @@ def test_ignore_misaligned(doc): # doesn't raise an AlignmentError, but there is nothing to iterate over # because the only example can't be aligned - train_reloaded_example = list(goldcorpus.train_dataset(nlp, - ignore_misaligned=True)) + train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True)) assert len(train_reloaded_example) == 0 def test_make_orth_variants(doc): nlp = English() - text = doc.text - deps = [t.dep_ for t in doc] - heads = [t.head.i for t in doc] - with make_tempdir() as tmpdir: jsonl_file = tmpdir / "test.jsonl" # write to JSONL train dicts @@ -286,9 +303,8 @@ def test_make_orth_variants(doc): goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) # due to randomness, test only that this runs with no errors for now - train_reloaded_example = next(goldcorpus.train_dataset(nlp, - orth_variant_level=0.2)) - train_goldparse = train_reloaded_example.gold + train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) + train_goldparse = train_reloaded_example.gold # noqa: F841 @pytest.mark.parametrize( diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py index 89e797c1a..1330d3a65 100644 --- 
a/spacy/tests/test_json_schemas.py +++ b/spacy/tests/test_json_schemas.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.util import get_json_validator, validate_json, validate_schema from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 102b87142..58db0a040 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -1,10 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools - import pytest -from spacy.compat import is_python2 from spacy.gold import GoldParse from spacy.language import Language from spacy.tokens import Doc, Span @@ -134,9 +129,6 @@ def test_language_pipe(nlp2, n_process, texts): assert_docs_equal(doc, expected_doc) -@pytest.mark.skipif( - is_python2, reason="python2 seems to be unable to handle iterator properly" -) @pytest.mark.parametrize("n_process", [1, 2]) def test_language_pipe_stream(nlp2, n_process, texts): # check if nlp.pipe can handle infinite length iterator properly. 
diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index 701222afc..c2534ca22 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc from spacy.language import Language diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 4075ccf64..09e0fb561 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -1,13 +1,10 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import os import ctypes from pathlib import Path from spacy import util from spacy import prefer_gpu, require_gpu -from spacy.compat import symlink_to, symlink_remove, path2str, is_windows +from spacy.compat import symlink_to, symlink_remove, is_windows from spacy._ml import PrecomputableAffine from subprocess import CalledProcessError @@ -25,7 +22,7 @@ def symlink(): @pytest.fixture(scope="function") def symlink_setup_target(request, symlink_target, symlink): if not symlink_target.exists(): - os.mkdir(path2str(symlink_target)) + os.mkdir(str(symlink_target)) # yield -- need to cleanup even if assertion fails # https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240 @@ -33,7 +30,7 @@ def symlink_setup_target(request, symlink_target, symlink): # Remove symlink only if it was created if symlink.exists(): symlink_remove(symlink) - os.rmdir(path2str(symlink_target)) + os.rmdir(str(symlink_target)) request.addfinalizer(cleanup) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 65288527a..e4c67b672 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy import srsly diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 888028b6c..efaf80b4f 100644 --- a/spacy/tests/test_scorer.py +++ 
b/spacy/tests/test_scorer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index ddaa71059..473d5017d 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -1,12 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy._ml import Tok2Vec from spacy.vocab import Vocab from spacy.tokens import Doc -from spacy.compat import unicode_ def get_batch(batch_size): @@ -16,7 +12,7 @@ def get_batch(batch_size): for size in range(1, batch_size + 1): # Make the words numbers, so that they're distnct # across the batch, and easy to track. - numbers = [unicode_(i) for i in range(start, start + size)] + numbers = [str(i) for i in range(start, start + size)] docs.append(Doc(vocab, words=numbers)) start += size return docs diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index c2011487e..8276d7aea 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import sys import pytest diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 2d71588cc..3e7681234 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class @@ -58,7 +55,7 @@ LANGUAGES = [ @pytest.mark.parametrize("lang", LANGUAGES) def test_tokenizer_explain(lang): tokenizer = get_lang_class(lang).Defaults.create_tokenizer() - examples = pytest.importorskip("spacy.lang.{}.examples".format(lang)) + examples = pytest.importorskip(f"spacy.lang.{lang}.examples") for sentence in examples.sentences: 
tokens = [t.text for t in tokenizer(sentence) if not t.is_space] debug_tokens = [t[1] for t in tokenizer.explain(sentence)] diff --git a/spacy/tests/tokenizer/test_naughty_strings.py b/spacy/tests/tokenizer/test_naughty_strings.py index 36c69611e..e93d5654f 100644 --- a/spacy/tests/tokenizer/test_naughty_strings.py +++ b/spacy/tests/tokenizer/test_naughty_strings.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest # Examples taken from the "Big List of Naughty Strings" diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 5ac681c5e..3dce1ae31 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.tokenizer import Tokenizer diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index e2c0e3de8..9f673d5d8 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index 74c9b369b..c7b9d7c6d 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 175480fe7..0516e9272 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import numpy import tempfile import shutil @@ -9,7 +6,6 @@ import srsly from pathlib import Path from spacy.tokens import Doc, Span from spacy.attrs import POS, HEAD, DEP -from spacy.compat import path2str 
@contextlib.contextmanager @@ -23,7 +19,7 @@ def make_tempfile(mode="r"): def make_tempdir(): d = Path(tempfile.mkdtemp()) yield d - shutil.rmtree(path2str(d)) + shutil.rmtree(str(d)) def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None): diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index d84a56981..e033aa7c6 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import IS_ALPHA, IS_DIGIT diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index f78dd33c4..fff3d24ef 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lookups import Lookups, Table from spacy.strings import get_string_id diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index f98f0e6e0..b5f7303b5 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy from spacy.tokens import Doc diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py index 75b1116dd..c71d5f3f2 100644 --- a/spacy/tests/vocab_vectors/test_stringstore.py +++ b/spacy/tests/vocab_vectors/test_stringstore.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.strings import StringStore diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index b688ab9dd..8684ad018 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ 
b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy from numpy.testing import assert_allclose diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index d22db2d8b..a687059be 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA from spacy.parts_of_speech import NOUN, VERB diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f0120c708..7491a11fc 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,8 +1,5 @@ # cython: embedsignature=True # cython: profile=True -# coding: utf8 -from __future__ import unicode_literals - from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc from libc.string cimport memcpy, memset @@ -11,22 +8,20 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap cimport cython -from collections import OrderedDict import re from .tokens.doc cimport Doc from .strings cimport hash_string -from .compat import unescape_unicode from .attrs import intify_attrs from .symbols import ORTH from .errors import Errors, Warnings, deprecation_warning from . import util - from .attrs import intify_attrs from .lexeme cimport EMPTY_LEXEME from .symbols import ORTH + cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries. 
@@ -728,14 +723,14 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#to_bytes """ - serializers = OrderedDict(( - ("vocab", lambda: self.vocab.to_bytes()), - ("prefix_search", lambda: _get_regex_pattern(self.prefix_search)), - ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), - ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), - ("token_match", lambda: _get_regex_pattern(self.token_match)), - ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) - )) + serializers = { + "vocab": lambda: self.vocab.to_bytes(), + "prefix_search": lambda: _get_regex_pattern(self.prefix_search), + "suffix_search": lambda: _get_regex_pattern(self.suffix_search), + "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer), + "token_match": lambda: _get_regex_pattern(self.token_match), + "exceptions": lambda: dict(sorted(self._rules.items())) + } exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) @@ -748,20 +743,17 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#from_bytes """ - data = OrderedDict() - deserializers = OrderedDict(( - ("vocab", lambda b: self.vocab.from_bytes(b)), - ("prefix_search", lambda b: data.setdefault("prefix_search", b)), - ("suffix_search", lambda b: data.setdefault("suffix_search", b)), - ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), - ("token_match", lambda b: data.setdefault("token_match", b)), - ("exceptions", lambda b: data.setdefault("rules", b)) - )) + data = {} + deserializers = { + "vocab": lambda b: self.vocab.from_bytes(b), + "prefix_search": lambda b: data.setdefault("prefix_search", b), + "suffix_search": lambda b: data.setdefault("suffix_search", b), + "infix_finditer": lambda b: data.setdefault("infix_finditer", b), + "token_match": lambda b: data.setdefault("token_match", b), + "exceptions": lambda b: data.setdefault("rules", b) + } exclude = 
util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer"]: - if key in data: - data[key] = unescape_unicode(data[key]) if data.get("prefix_search"): self.prefix_search = re.compile(data["prefix_search"]).search if data.get("suffix_search"): diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index 536ec8349..88428709b 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .doc import Doc from .token import Token from .span import Span diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index a5d06491a..12690ba50 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -1,9 +1,6 @@ -# coding: utf8 # cython: infer_types=True # cython: bounds_check=False # cython: profile=True -from __future__ import unicode_literals - from libc.string cimport memcpy, memset from libc.stdlib cimport malloc, free from cymem.cymem cimport Pool diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index b60a6d7b3..d7348659d 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import numpy import zlib import srsly diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 716df1087..58423c420 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,10 +1,6 @@ - -# coding: utf8 # cython: infer_types=True # cython: bounds_check=False # cython: profile=True -from __future__ import unicode_literals - cimport cython cimport numpy as np from libc.string cimport memcpy, memset @@ -28,7 +24,7 @@ from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..attrs import intify_attrs, IDS from ..util import normalize_slice -from ..compat import is_config, copy_reg, pickle, basestring_ +from 
..compat import copy_reg, pickle from ..errors import deprecation_warning, models_warning, user_warning from ..errors import Errors, Warnings from .. import util @@ -327,9 +323,7 @@ cdef class Doc: return "".join([t.text_with_ws for t in self]).encode("utf-8") def __str__(self): - if is_config(python3=True): - return self.__unicode__() - return self.__bytes__() + return self.__unicode__() def __repr__(self): return self.__str__() @@ -683,7 +677,7 @@ cdef class Doc: cdef np.ndarray[attr_t, ndim=2] output # Handle scalar/list inputs of strings/ints for py_attr_ids # See also #3064 - if isinstance(py_attr_ids, basestring_): + if isinstance(py_attr_ids, str): # Handle inputs like doc.to_array('ORTH') py_attr_ids = [py_attr_ids] elif not hasattr(py_attr_ids, "__iter__"): @@ -772,7 +766,7 @@ cdef class Doc: """ # Handle scalar/list inputs of strings/ints for py_attr_ids # See also #3064 - if isinstance(attrs, basestring_): + if isinstance(attrs, str): # Handle inputs like doc.to_array('ORTH') attrs = [attrs] elif not hasattr(attrs, "__iter__"): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 957e853ca..9e9322d65 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - cimport numpy as np from libc.math cimport sqrt @@ -20,7 +17,6 @@ from ..lexeme cimport Lexeme from ..symbols cimport dep from ..util import normalize_slice -from ..compat import is_config, basestring_ from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning from ..errors import deprecation_warning from .underscore import Underscore, get_ext_args @@ -110,9 +106,9 @@ cdef class Span: self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) else: self.end_char = 0 - if isinstance(label, basestring_): + if isinstance(label, str): label = doc.vocab.strings.add(label) - if isinstance(kb_id, basestring_): + if isinstance(kb_id, str): kb_id = doc.vocab.strings.add(kb_id) if label not in 
doc.vocab.strings: raise ValueError(Errors.E084.format(label=label)) @@ -157,9 +153,7 @@ cdef class Span: return self.end - self.start def __repr__(self): - if is_config(python3=True): - return self.text - return self.text.encode("utf-8") + return self.text def __getitem__(self, object i): """Get a `Token` or a `Span` object @@ -478,7 +472,7 @@ cdef class Span: @property def tensor(self): """The span's slice of the doc's tensor. - + RETURNS (ndarray[ndim=2, dtype='float32']): A 2D numpy or cupy array representing the span's semantics. """ diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8b15a4223..8e6290187 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,7 +1,4 @@ # cython: infer_types=True -# coding: utf8 -from __future__ import unicode_literals - from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free # Compiler crashes on memory view coercion without this. Should report bug. @@ -23,7 +20,6 @@ from ..symbols cimport conj from .. import parts_of_speech from .. 
import util -from ..compat import is_config from ..errors import Errors, Warnings, user_warning, models_warning from .underscore import Underscore, get_ext_args from .morphanalysis cimport MorphAnalysis @@ -122,9 +118,7 @@ cdef class Token: return self.text.encode('utf8') def __str__(self): - if is_config(python3=True): - return self.__unicode__() - return self.__bytes__() + return self.__unicode__() def __repr__(self): return self.__str__() diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index b36fe9294..328851945 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import functools import copy diff --git a/spacy/util.py b/spacy/util.py index 693136bc1..4e6c10e2b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,12 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals, print_function - import os import importlib +import importlib.util import re from pathlib import Path import random -from collections import OrderedDict from thinc.neural._classes.model import Model from thinc.neural.ops import NumpyOps import functools @@ -27,8 +24,7 @@ except ImportError: cupy = None from .symbols import ORTH -from .compat import cupy, CudaStream, path2str, basestring_, unicode_ -from .compat import import_file +from .compat import cupy, CudaStream from .errors import Errors, Warnings, deprecation_warning @@ -119,7 +115,7 @@ def ensure_path(path): path: Anything. If string, it's converted to Path. RETURNS: Path or original argument. 
""" - if isinstance(path, basestring_): + if isinstance(path, str): return Path(path) else: return path @@ -138,7 +134,7 @@ def load_language_data(path): path = path.with_suffix(path.suffix + ".gz") if path.exists(): return srsly.read_gzip_json(path) - raise ValueError(Errors.E160.format(path=path2str(path))) + raise ValueError(Errors.E160.format(path=path)) def get_module_path(module): @@ -156,8 +152,8 @@ def load_model(name, **overrides): """ data_path = get_data_path() if not data_path or not data_path.exists(): - raise IOError(Errors.E049.format(path=path2str(data_path))) - if isinstance(name, basestring_): # in data dir / shortcut + raise IOError(Errors.E049.format(path=data_path)) + if isinstance(name, str): # in data dir / shortcut if name in set([d.name for d in data_path.iterdir()]): return load_model_from_link(name, **overrides) if is_package(name): # installed as package @@ -224,7 +220,7 @@ def load_model_from_init_py(init_file, **overrides): data_dir = "%s_%s-%s" % (meta["lang"], meta["name"], meta["version"]) data_path = model_path / data_dir if not model_path.exists(): - raise IOError(Errors.E052.format(path=path2str(data_path))) + raise IOError(Errors.E052.format(path=data_path)) return load_model_from_path(data_path, meta, **overrides) @@ -236,7 +232,7 @@ def get_model_meta(path): """ model_path = ensure_path(path) if not model_path.exists(): - raise IOError(Errors.E052.format(path=path2str(model_path))) + raise IOError(Errors.E052.format(path=model_path)) meta_path = model_path / "meta.json" if not meta_path.is_file(): raise IOError(Errors.E053.format(path=meta_path)) @@ -417,7 +413,7 @@ def update_exc(base_exceptions, *addition_dicts): exc = dict(base_exceptions) for additions in addition_dicts: for orth, token_attrs in additions.items(): - if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs): + if not all(isinstance(attr[ORTH], str) for attr in token_attrs): raise ValueError(Errors.E055.format(key=orth, orths=token_attrs)) 
described_orth = "".join(attr[ORTH] for attr in token_attrs) if orth != described_orth: @@ -612,7 +608,7 @@ def filter_spans(spans): def to_bytes(getters, exclude): - serialized = OrderedDict() + serialized = {} for key, getter in getters.items(): # Split to support file names like meta.json if key.split(".")[0] not in exclude: @@ -649,6 +645,20 @@ def from_disk(path, readers, exclude): return path +def import_file(name, loc): + """Import module from a file. Used to load models from a directory. + + name (unicode): Name of module to load. + loc (unicode / Path): Path to the file. + RETURNS: The loaded module. + """ + loc = str(loc) + spec = importlib.util.spec_from_file_location(name, str(loc)) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + def minify_html(html): """Perform a template-specific, rudimentary HTML minification for displaCy. Disclaimer: NOT a general-purpose solution, only removes indentation and @@ -726,8 +736,8 @@ def validate_json(data, validator): err_path = "" msg = err.message + " " + err_path if err.context: # Error has suberrors, e.g. 
if schema uses anyOf - suberrs = [" - {}".format(suberr.message) for suberr in err.context] - msg += ":\n{}".format("".join(suberrs)) + suberrs = [f" - {suberr.message}" for suberr in err.context] + msg += f":\n{''.join(suberrs)}" errors.append(msg) return errors diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 6b26bf123..b12c8d833 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,13 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals - cimport numpy as np from cython.operator cimport dereference as deref from libcpp.set cimport set as cppset import functools import numpy -from collections import OrderedDict import srsly from thinc.neural.util import get_array_module from thinc.neural._classes.model import Model @@ -15,7 +11,6 @@ from thinc.neural._classes.model import Model from .strings cimport StringStore from .strings import get_string_id -from .compat import basestring_, path2str from .errors import Errors from . import util @@ -74,7 +69,7 @@ cdef class Vectors: shape = (0,0) data = numpy.zeros(shape, dtype="f") self.data = data - self.key2row = OrderedDict() + self.key2row = {} if self.data is not None: self._unset = cppset[int]({i for i in range(self.data.shape[0])}) else: @@ -339,7 +334,7 @@ cdef class Vectors: sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1] scores[i:i+batch_size] = scores[sorted_index] best_rows[i:i+batch_size] = best_rows[sorted_index] - + xp = get_array_module(self.data) # Round values really close to 1 or -1 scores = xp.around(scores, decimals=4, out=scores) @@ -347,7 +342,7 @@ cdef class Vectors: scores = xp.clip(scores, a_min=-1, a_max=1, out=scores) row2key = {row: key for key, row in self.key2row.items()} keys = xp.asarray( - [[row2key[row] for row in best_rows[i] if row in row2key] + [[row2key[row] for row in best_rows[i] if row in row2key] for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) @@ -372,7 
+367,7 @@ cdef class Vectors: break else: raise IOError(Errors.E061.format(filename=path)) - bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype) + bin_loc = path / f"vectors.{dims}.{dtype}.bin" xp = get_array_module(self.data) self.data = None with bin_loc.open("rb") as file_: @@ -402,10 +397,10 @@ cdef class Vectors: save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) else: save_array = lambda arr, file_: xp.save(file_, arr) - serializers = OrderedDict(( - ("vectors", lambda p: save_array(self.data, p.open("wb"))), - ("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) - )) + serializers = { + "vectors": lambda p: save_array(self.data, p.open("wb")), + "key2row": lambda p: srsly.write_msgpack(p, self.key2row) + } return util.to_disk(path, serializers, []) def from_disk(self, path, **kwargs): @@ -435,11 +430,11 @@ cdef class Vectors: if path.exists(): self.data = xp.load(str(path)) - serializers = OrderedDict(( - ("key2row", load_key2row), - ("keys", load_keys), - ("vectors", load_vectors), - )) + serializers = { + "key2row": load_key2row, + "keys": load_keys, + "vectors": load_vectors, + } util.from_disk(path, serializers, []) return self @@ -457,10 +452,10 @@ cdef class Vectors: else: return srsly.msgpack_dumps(self.data) - serializers = OrderedDict(( - ("key2row", lambda: srsly.msgpack_dumps(self.key2row)), - ("vectors", serialize_weights) - )) + serializers = { + "key2row": lambda: srsly.msgpack_dumps(self.key2row), + "vectors": serialize_weights + } return util.to_bytes(serializers, []) def from_bytes(self, data, **kwargs): @@ -478,9 +473,9 @@ cdef class Vectors: else: self.data = srsly.msgpack_loads(b) - deserializers = OrderedDict(( - ("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))), - ("vectors", deserialize_weights) - )) + deserializers = { + "key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)), + "vectors": deserialize_weights + } util.from_bytes(data, deserializers, []) 
return self diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3cf0095ee..c7e74f36c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,10 +1,7 @@ -# coding: utf8 # cython: profile=True -from __future__ import unicode_literals from libc.string cimport memcpy import srsly -from collections import OrderedDict from thinc.neural.util import get_array_module from .lexeme cimport EMPTY_LEXEME @@ -14,7 +11,7 @@ from .tokens.token cimport Token from .attrs cimport PROB, LANG, ORTH, TAG, POS from .structs cimport SerializedLexemeC -from .compat import copy_reg, basestring_ +from .compat import copy_reg from .errors import Errors from .lemmatizer import Lemmatizer from .attrs import intify_attrs, NORM @@ -335,14 +332,14 @@ cdef class Vocab: """Retrieve a vector for a word in the vocabulary. Words can be looked up by string or int ID. If no vectors data is loaded, ValueError is raised. - - If `minn` is defined, then the resulting vector uses Fasttext's + + If `minn` is defined, then the resulting vector uses Fasttext's subword features by average over ngrams of `orth`. orth (int / unicode): The hash value of a word, or its unicode string. - minn (int): Minimum n-gram length used for Fasttext's ngram computation. + minn (int): Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. - maxn (int): Maximum n-gram length used for Fasttext's ngram computation. + maxn (int): Maximum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. RETURNS (numpy.ndarray): A word vector. Size and shape determined by the `vocab.vectors` instance. 
Usually, a @@ -350,7 +347,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#get_vector """ - if isinstance(orth, basestring_): + if isinstance(orth, str): orth = self.strings.add(orth) word = self[orth].orth_ if orth in self.vectors.key2row: @@ -397,7 +394,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#set_vector """ - if isinstance(orth, basestring_): + if isinstance(orth, str): orth = self.strings.add(orth) if self.vectors.is_full and orth not in self.vectors: new_rows = max(100, int(self.vectors.shape[0]*1.3)) @@ -419,7 +416,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#has_vector """ - if isinstance(orth, basestring_): + if isinstance(orth, str): orth = self.strings.add(orth) return orth in self.vectors @@ -488,12 +485,12 @@ cdef class Vocab: else: return self.vectors.to_bytes() - getters = OrderedDict(( - ("strings", lambda: self.strings.to_bytes()), - ("lexemes", lambda: self.lexemes_to_bytes()), - ("vectors", deserialize_vectors), - ("lookups", lambda: self.lookups.to_bytes()) - )) + getters = { + "strings": lambda: self.strings.to_bytes(), + "lexemes": lambda: self.lexemes_to_bytes(), + "vectors": deserialize_vectors, + "lookups": lambda: self.lookups.to_bytes() + } exclude = util.get_serialization_exclude(getters, exclude, kwargs) return util.to_bytes(getters, exclude) @@ -512,12 +509,12 @@ cdef class Vocab: else: return self.vectors.from_bytes(b) - setters = OrderedDict(( - ("strings", lambda b: self.strings.from_bytes(b)), - ("lexemes", lambda b: self.lexemes_from_bytes(b)), - ("vectors", lambda b: serialize_vectors(b)), - ("lookups", lambda b: self.lookups.from_bytes(b)) - )) + setters = { + "strings": lambda b: self.strings.from_bytes(b), + "lexemes": lambda b: self.lexemes_from_bytes(b), + "vectors": lambda b: serialize_vectors(b), + "lookups": lambda b: self.lookups.from_bytes(b) + } exclude = util.get_serialization_exclude(setters, exclude, kwargs) util.from_bytes(bytes_data, setters, exclude) if self.vectors.name is not 
None: diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 50ba0e3d9..c9c7a010c 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -677,50 +677,3 @@ of one entity) or when merging spans with | ----------- | -------- | -------------------- | | `spans` | iterable | The spans to filter. | | **RETURNS** | list | The filtered spans. | - -## Compatibility functions {#compat source="spacy/compaty.py"} - -All Python code is written in an **intersection of Python 2 and Python 3**. This -is easy in Cython, but somewhat ugly in Python. Logic that deals with Python or -platform compatibility only lives in `spacy.compat`. To distinguish them from -the builtin functions, replacement functions are suffixed with an underscore, -e.g. `unicode_`. - -> #### Example -> -> ```python -> from spacy.compat import unicode_ -> -> compatible_unicode = unicode_("hello world") -> ``` - -| Name | Python 2 | Python 3 | -| -------------------- | ---------------------------------- | ----------- | -| `compat.bytes_` | `str` | `bytes` | -| `compat.unicode_` | `unicode` | `str` | -| `compat.basestring_` | `basestring` | `str` | -| `compat.input_` | `raw_input` | `input` | -| `compat.path2str` | `str(path)` with `.decode('utf8')` | `str(path)` | - -### compat.is_config {#compat.is_config tag="function"} - -Check if a specific configuration of Python version and operating system matches -the user's setup. Mostly used to display targeted error messages. - -> #### Example -> -> ```python -> from spacy.compat import is_config -> -> if is_config(python2=True, windows=True): -> print("You are using Python 2 on Windows.") -> ``` - -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------------------------------- | -| `python2` | bool | spaCy is executed with Python 2.x. | -| `python3` | bool | spaCy is executed with Python 3.x. | -| `windows` | bool | spaCy is executed on Windows. 
| -| `linux` | bool | spaCy is executed on Linux. | -| `osx` | bool | spaCy is executed on OS X or macOS. | -| **RETURNS** | bool | Whether the specified configuration matches the user's platform. | diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 2b0045bc3..6c398d584 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -8,9 +8,9 @@ menu: - ['Changelog', 'changelog'] --- -spaCy is compatible with **64-bit CPython 2.7 / 3.5+** and runs on -**Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are -available over [pip](https://pypi.python.org/pypi/spacy) and +spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**, +**macOS/OS X** and **Windows**. The latest spaCy releases are available over +[pip](https://pypi.python.org/pypi/spacy) and [conda](https://anaconda.org/conda-forge/spacy). > #### 📖 Looking for the old docs? @@ -207,14 +207,7 @@ Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/) -that matches the version that was used to compile your Python interpreter. For -official distributions these are: - -| Distribution | Version | -| ------------ | ------------------ | -| Python 2.7 | Visual Studio 2008 | -| Python 3.4 | Visual Studio 2010 | -| Python 3.5+ | Visual Studio 2015 | +that matches the version that was used to compile your Python interpreter. ### Run tests {#run-tests} diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index b7b840999..7382f2b8c 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -367,7 +367,7 @@ tokens and a conditional message based on the document length. 
import spacy def my_component(doc): - print("After tokenization, this doc has {} tokens.".format(len(doc))) + print(f"After tokenization, this doc has {len(doc)} tokens.") print("The part-of-speech tags are:", [token.pos_ for token in doc]) if len(doc) < 10: print("This is a pretty short document.") @@ -602,7 +602,7 @@ There are three main types of extensions, which can be defined using the [these examples](/usage/examples#custom-components-attr-methods). ```python - Doc.set_extension("hello", method=lambda doc, name: "Hi {}!".format(name)) + Doc.set_extension("hello", method=lambda doc, name: f"Hi {name}!") assert doc._.hello("Bob") == "Hi Bob!" ``` diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 5a3a95a53..479bdd264 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -304,12 +304,6 @@ print(doc.vocab.strings["coffee"]) # 3197928453018144401 print(doc.vocab.strings[3197928453018144401]) # 'coffee' ``` -> #### What does 'L' at the end of a hash mean? -> -> If you return a hash value in the **Python 2 interpreter**, it'll show up as -> `3197928453018144401L`. The `L` just means "long integer" – it's **not** -> actually a part of the hash value. - Now that all strings are encoded, the entries in the vocabulary **don't need to include the word text** themselves. Instead, they can look it up in the `StringStore` via its hash value. Each entry in the vocabulary, also called @@ -857,17 +851,16 @@ def put_spans_around_tokens(doc): and you can calculate what you need, e.g.
<br />, <p> etc.) """ output = [] - html = '<span class="{classes}">{word}</span>{space}' for token in doc: if token.is_space: output.append(token.text) else: - classes = "pos-{} dep-{}".format(token.pos_, token.dep_) - output.append(html.format(classes=classes, word=token.text, space=token.whitespace_)) + classes = f"pos-{token.pos_} dep-{token.dep_}" + output.append(f'<span class="{classes}">{token.text}</span>{token.whitespace_}') string = "".join(output) string = string.replace("\\n", "") string = string.replace("\\t", " ") - return "<pre>{}</pre>".format(string) + return f"<pre>{string}</pre>
" nlp = spacy.load("en_core_web_sm") From 33a2682d60c753469d78cf68b6065a284e774f40 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Dec 2019 12:39:49 +0100 Subject: [PATCH 018/187] Add better schemas and validation using Pydantic (#4831) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove unicode declarations * Remove Python 3.5 and 2.7 from CI * Don't require pathlib * Replace compat helpers * Remove OrderedDict * Use f-strings * Set Cython compiler language level * Fix typo * Re-add OrderedDict for Table * Update setup.cfg * Revert CONTRIBUTING.md * Add better schemas and validation using Pydantic * Revert lookups.md * Remove unused import * Update spacy/schemas.py Co-Authored-By: Sebastián Ramírez * Various small fixes * Fix docstring Co-authored-by: Sebastián Ramírez --- requirements.txt | 3 +- setup.cfg | 1 + spacy/cli/_schemas.py | 217 ------------------ spacy/errors.py | 3 - spacy/matcher/_schemas.py | 197 ---------------- spacy/matcher/dependencymatcher.pyx | 5 +- spacy/matcher/matcher.pxd | 2 +- spacy/matcher/matcher.pyx | 18 +- spacy/matcher/phrasematcher.pyx | 4 +- spacy/schemas.py | 188 +++++++++++++++ spacy/tests/doc/test_to_json.py | 9 - spacy/tests/matcher/test_matcher_api.py | 2 +- spacy/tests/matcher/test_matcher_logic.py | 12 +- .../tests/matcher/test_pattern_validation.py | 30 +-- spacy/tests/test_json_schemas.py | 47 ---- spacy/util.py | 42 ---- 16 files changed, 217 insertions(+), 563 deletions(-) delete mode 100644 spacy/cli/_schemas.py delete mode 100644 spacy/matcher/_schemas.py create mode 100644 spacy/schemas.py delete mode 100644 spacy/tests/test_json_schemas.py diff --git a/requirements.txt b/requirements.txt index 188459c67..79a05b2bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,8 +12,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 -# Optional dependencies -jsonschema>=2.6.0,<3.1.0 +pydantic>=1.0.0,<2.0.0 # Development dependencies 
cython>=0.25 pytest>=4.6.5 diff --git a/setup.cfg b/setup.cfg index 28259c989..755f522e7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ install_requires = numpy>=1.15.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 + pydantic>=1.0.0,<2.0.0 [options.extras_require] lookups = diff --git a/spacy/cli/_schemas.py b/spacy/cli/_schemas.py deleted file mode 100644 index 42e5e04dd..000000000 --- a/spacy/cli/_schemas.py +++ /dev/null @@ -1,217 +0,0 @@ - -# NB: This schema describes the new format of the training data, see #2928 -TRAINING_SCHEMA = { - "$schema": "http://json-schema.org/draft-06/schema", - "title": "Training data for spaCy models", - "type": "array", - "items": { - "type": "object", - "properties": { - "text": { - "title": "The text of the training example", - "type": "string", - "minLength": 1, - }, - "ents": { - "title": "Named entity spans in the text", - "type": "array", - "items": { - "type": "object", - "properties": { - "start": { - "title": "Start character offset of the span", - "type": "integer", - "minimum": 0, - }, - "end": { - "title": "End character offset of the span", - "type": "integer", - "minimum": 0, - }, - "label": { - "title": "Entity label", - "type": "string", - "minLength": 1, - "pattern": "^[A-Z0-9]*$", - }, - }, - "required": ["start", "end", "label"], - }, - }, - "sents": { - "title": "Sentence spans in the text", - "type": "array", - "items": { - "type": "object", - "properties": { - "start": { - "title": "Start character offset of the span", - "type": "integer", - "minimum": 0, - }, - "end": { - "title": "End character offset of the span", - "type": "integer", - "minimum": 0, - }, - }, - "required": ["start", "end"], - }, - }, - "cats": { - "title": "Text categories for the text classifier", - "type": "object", - "patternProperties": { - "*": { - "title": "A text category", - "oneOf": [ - {"type": "boolean"}, - {"type": "number", "minimum": 0}, - ], - } - }, - "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1}, - 
}, - "tokens": { - "title": "The tokens in the text", - "type": "array", - "items": { - "type": "object", - "minProperties": 1, - "properties": { - "id": { - "title": "Token ID, usually token index", - "type": "integer", - "minimum": 0, - }, - "start": { - "title": "Start character offset of the token", - "type": "integer", - "minimum": 0, - }, - "end": { - "title": "End character offset of the token", - "type": "integer", - "minimum": 0, - }, - "pos": { - "title": "Coarse-grained part-of-speech tag", - "type": "string", - "minLength": 1, - }, - "tag": { - "title": "Fine-grained part-of-speech tag", - "type": "string", - "minLength": 1, - }, - "dep": { - "title": "Dependency label", - "type": "string", - "minLength": 1, - }, - "head": { - "title": "Index of the token's head", - "type": "integer", - "minimum": 0, - }, - }, - "required": ["start", "end"], - }, - }, - "_": {"title": "Custom user space", "type": "object"}, - }, - "required": ["text"], - }, -} - -META_SCHEMA = { - "$schema": "http://json-schema.org/draft-06/schema", - "type": "object", - "properties": { - "lang": { - "title": "Two-letter language code, e.g. 'en'", - "type": "string", - "minLength": 2, - "maxLength": 2, - "pattern": "^[a-z]*$", - }, - "name": { - "title": "Model name", - "type": "string", - "minLength": 1, - "pattern": "^[a-z_]*$", - }, - "version": { - "title": "Model version", - "type": "string", - "minLength": 1, - "pattern": "^[0-9a-z.-]*$", - }, - "spacy_version": { - "title": "Compatible spaCy version identifier", - "type": "string", - "minLength": 1, - "pattern": "^[0-9a-z.-><=]*$", - }, - "parent_package": { - "title": "Name of parent spaCy package, e.g. 
spacy or spacy-nightly", - "type": "string", - "minLength": 1, - "default": "spacy", - }, - "pipeline": { - "title": "Names of pipeline components", - "type": "array", - "items": {"type": "string", "minLength": 1}, - }, - "description": {"title": "Model description", "type": "string"}, - "license": {"title": "Model license", "type": "string"}, - "author": {"title": "Model author name", "type": "string"}, - "email": {"title": "Model author email", "type": "string", "format": "email"}, - "url": {"title": "Model author URL", "type": "string", "format": "uri"}, - "sources": { - "title": "Training data sources", - "type": "array", - "items": {"type": "string"}, - }, - "vectors": { - "title": "Included word vectors", - "type": "object", - "properties": { - "keys": { - "title": "Number of unique keys", - "type": "integer", - "minimum": 0, - }, - "vectors": { - "title": "Number of unique vectors", - "type": "integer", - "minimum": 0, - }, - "width": { - "title": "Number of dimensions", - "type": "integer", - "minimum": 0, - }, - }, - }, - "accuracy": { - "title": "Accuracy numbers", - "type": "object", - "patternProperties": {"*": {"type": "number", "minimum": 0.0}}, - }, - "speed": { - "title": "Speed evaluation numbers", - "type": "object", - "patternProperties": { - "*": { - "oneOf": [ - {"type": "number", "minimum": 0.0}, - {"type": "integer", "minimum": 0}, - ] - } - }, - }, - }, - "required": ["lang", "name", "version"], -} diff --git a/spacy/errors.py b/spacy/errors.py index 81747b33b..3aa4bedea 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -105,7 +105,6 @@ class Warnings(object): "smaller JSON files instead.") - @add_codes class Errors(object): E001 = ("No component '{name}' found in pipeline. 
Available names: {opts}") @@ -419,8 +418,6 @@ class Errors(object): E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") E135 = ("If you meant to replace a built-in component, use `create_pipe`: " "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`") - E136 = ("This additional feature requires the jsonschema library to be " - "installed:\npip install jsonschema") E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure " "to provide a valid JSON object as input with either the `text` " "or `tokens` key. For more info, see the docs:\n" diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py deleted file mode 100644 index ce6379c45..000000000 --- a/spacy/matcher/_schemas.py +++ /dev/null @@ -1,197 +0,0 @@ - -TOKEN_PATTERN_SCHEMA = { - "$schema": "http://json-schema.org/draft-06/schema", - "definitions": { - "string_value": { - "anyOf": [ - {"type": "string"}, - { - "type": "object", - "properties": { - "REGEX": {"type": "string"}, - "IN": {"type": "array", "items": {"type": "string"}}, - "NOT_IN": {"type": "array", "items": {"type": "string"}}, - }, - "additionalProperties": False, - }, - ] - }, - "integer_value": { - "anyOf": [ - {"type": "integer"}, - { - "type": "object", - "properties": { - "REGEX": {"type": "string"}, - "IN": {"type": "array", "items": {"type": "integer"}}, - "NOT_IN": {"type": "array", "items": {"type": "integer"}}, - "==": {"type": "integer"}, - ">=": {"type": "integer"}, - "<=": {"type": "integer"}, - ">": {"type": "integer"}, - "<": {"type": "integer"}, - }, - "additionalProperties": False, - }, - ] - }, - "boolean_value": {"type": "boolean"}, - "underscore_value": { - "anyOf": [ - {"type": ["string", "integer", "number", "array", "boolean", "null"]}, - { - "type": "object", - "properties": { - "REGEX": {"type": "string"}, - "IN": { - "type": "array", - "items": {"type": ["string", "integer"]}, - }, - "NOT_IN": { - "type": "array", - "items": {"type": ["string", "integer"]}, - }, - "==": {"type": 
"integer"}, - ">=": {"type": "integer"}, - "<=": {"type": "integer"}, - ">": {"type": "integer"}, - "<": {"type": "integer"}, - }, - "additionalProperties": False, - }, - ] - }, - }, - "type": "array", - "items": { - "type": "object", - "properties": { - "ORTH": { - "title": "Verbatim token text", - "$ref": "#/definitions/string_value", - }, - "TEXT": { - "title": "Verbatim token text (spaCy v2.1+)", - "$ref": "#/definitions/string_value", - }, - "LOWER": { - "title": "Lowercase form of token text", - "$ref": "#/definitions/string_value", - }, - "POS": { - "title": "Coarse-grained part-of-speech tag", - "$ref": "#/definitions/string_value", - }, - "TAG": { - "title": "Fine-grained part-of-speech tag", - "$ref": "#/definitions/string_value", - }, - "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"}, - "LEMMA": { - "title": "Lemma (base form)", - "$ref": "#/definitions/string_value", - }, - "SHAPE": { - "title": "Abstract token shape", - "$ref": "#/definitions/string_value", - }, - "ENT_TYPE": { - "title": "Entity label of single token", - "$ref": "#/definitions/string_value", - }, - "NORM": { - "title": "Normalized form of the token text", - "$ref": "#/definitions/string_value", - }, - "LENGTH": { - "title": "Token character length", - "$ref": "#/definitions/integer_value", - }, - "IS_ALPHA": { - "title": "Token consists of alphabetic characters", - "$ref": "#/definitions/boolean_value", - }, - "IS_ASCII": { - "title": "Token consists of ASCII characters", - "$ref": "#/definitions/boolean_value", - }, - "IS_DIGIT": { - "title": "Token consists of digits", - "$ref": "#/definitions/boolean_value", - }, - "IS_LOWER": { - "title": "Token is lowercase", - "$ref": "#/definitions/boolean_value", - }, - "IS_UPPER": { - "title": "Token is uppercase", - "$ref": "#/definitions/boolean_value", - }, - "IS_TITLE": { - "title": "Token is titlecase", - "$ref": "#/definitions/boolean_value", - }, - "IS_PUNCT": { - "title": "Token is punctuation", - "$ref": 
"#/definitions/boolean_value", - }, - "IS_SPACE": { - "title": "Token is whitespace", - "$ref": "#/definitions/boolean_value", - }, - "IS_BRACKET": { - "title": "Token is a bracket", - "$ref": "#/definitions/boolean_value", - }, - "IS_QUOTE": { - "title": "Token is a quotation mark", - "$ref": "#/definitions/boolean_value", - }, - "IS_LEFT_PUNCT": { - "title": "Token is a left punctuation mark", - "$ref": "#/definitions/boolean_value", - }, - "IS_RIGHT_PUNCT": { - "title": "Token is a right punctuation mark", - "$ref": "#/definitions/boolean_value", - }, - "IS_CURRENCY": { - "title": "Token is a currency symbol", - "$ref": "#/definitions/boolean_value", - }, - "IS_STOP": { - "title": "Token is stop word", - "$ref": "#/definitions/boolean_value", - }, - "IS_SENT_START": { - "title": "Token is the first in a sentence", - "$ref": "#/definitions/boolean_value", - }, - "LIKE_NUM": { - "title": "Token resembles a number", - "$ref": "#/definitions/boolean_value", - }, - "LIKE_URL": { - "title": "Token resembles a URL", - "$ref": "#/definitions/boolean_value", - }, - "LIKE_EMAIL": { - "title": "Token resembles an email address", - "$ref": "#/definitions/boolean_value", - }, - "_": { - "title": "Custom extension token attributes (token._.)", - "type": "object", - "patternProperties": { - "^.*$": {"$ref": "#/definitions/underscore_value"} - }, - }, - "OP": { - "title": "Operators / quantifiers", - "type": "string", - "enum": ["+", "*", "?", "!"], - }, - }, - "additionalProperties": False, - }, -} diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 46cff0d0c..f94c66cb0 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -39,7 +39,8 @@ cdef class DependencyMatcher: RETURNS (DependencyMatcher): The newly constructed object. 
""" size = 20 - self.token_matcher = Matcher(vocab) + # TODO: make matcher work with validation + self.token_matcher = Matcher(vocab, validate=False) self._keys_to_token = {} self._patterns = {} self._root = {} @@ -129,7 +130,7 @@ cdef class DependencyMatcher: # TODO: Better ways to hash edges in pattern? for j in range(len(_patterns[i])): k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j)) - self.token_matcher.add(k, None, _patterns[i][j]) + self.token_matcher.add(k, [_patterns[i][j]]) _keys_to_token[k] = j _keys_to_token_list.append(_keys_to_token) self._keys_to_token.setdefault(key, []) diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index dd04153bf..689734079 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -63,7 +63,7 @@ cdef class Matcher: cdef Pool mem cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab - cdef public object validator + cdef public object validate cdef public object _patterns cdef public object _callbacks cdef public object _extensions diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 2908ab0c2..4258fdb6a 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -15,8 +15,7 @@ from ..tokens.doc cimport Doc, get_token_attr from ..tokens.token cimport Token from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA -from ._schemas import TOKEN_PATTERN_SCHEMA -from ..util import get_json_validator, validate_json +from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings, deprecation_warning from ..strings import get_string_id from ..attrs import IDS @@ -32,7 +31,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=False): + def __init__(self, vocab, validate=True): """Create the Matcher. 
vocab (Vocab): The vocabulary object, which must be shared with the @@ -46,10 +45,7 @@ cdef class Matcher: self._seen_attrs = set() self.vocab = vocab self.mem = Pool() - if validate: - self.validator = get_json_validator(TOKEN_PATTERN_SCHEMA) - else: - self.validator = None + self.validate = validate def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -119,8 +115,8 @@ cdef class Matcher: raise ValueError(Errors.E012.format(key=key)) if not isinstance(pattern, list): raise ValueError(Errors.E178.format(pat=pattern, key=key)) - if self.validator: - errors[i] = validate_json(pattern, self.validator) + if self.validate: + errors[i] = validate_token_pattern(pattern) if any(err for err in errors.values()): raise MatchPatternError(key, errors) key = self._normalize_key(key) @@ -668,8 +664,6 @@ def _get_attr_values(spec, string_store): continue if attr == "TEXT": attr = "ORTH" - if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: - raise ValueError(Errors.E152.format(attr=attr)) attr = IDS.get(attr) if isinstance(value, basestring): value = string_store.add(value) @@ -684,7 +678,7 @@ def _get_attr_values(spec, string_store): if attr is not None: attr_values.append((attr, value)) else: - # should be caught above using TOKEN_PATTERN_SCHEMA + # should be caught in validation raise ValueError(Errors.E152.format(attr=attr)) return attr_values diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 20f45b9e4..961a318f6 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -9,7 +9,7 @@ from ..structs cimport TokenC from ..tokens.token cimport Token from ..typedefs cimport attr_t -from ._schemas import TOKEN_PATTERN_SCHEMA +from ..schemas import TokenPattern from ..errors import Errors, Warnings, deprecation_warning, user_warning @@ -54,7 +54,7 @@ cdef class PhraseMatcher: attr = attr.upper() if attr == "TEXT": attr = "ORTH" - if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: 
+ if attr.lower() not in TokenPattern().dict(): raise ValueError(Errors.E152.format(attr=attr)) self.attr = self.vocab.strings[attr] diff --git a/spacy/schemas.py b/spacy/schemas.py new file mode 100644 index 000000000..4a5054125 --- /dev/null +++ b/spacy/schemas.py @@ -0,0 +1,188 @@ +from typing import Dict, List, Union, Optional +from enum import Enum +from pydantic import BaseModel, Field, ValidationError, validator +from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool +from collections import defaultdict + +from .attrs import NAMES + + +def validate(schema, obj): + """Validate data against a given pydantic schema. + + obj (dict): JSON-serializable data to validate. + schema (pydantic.BaseModel): The schema to validate against. + RETURNS (list): A list of error messages, if available. + """ + try: + schema(**obj) + return [] + except ValidationError as e: + errors = e.errors() + data = defaultdict(list) + for error in errors: + err_loc = " -> ".join([str(p) for p in error.get("loc", [])]) + data[err_loc].append(error.get("msg")) + return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] + + +# Matcher token patterns + + +def validate_token_pattern(obj): + # Try to convert non-string keys (e.g. 
{ORTH: "foo"} -> {"ORTH": "foo"}) + get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k + if isinstance(obj, list): + converted = [] + for pattern in obj: + if isinstance(pattern, dict): + pattern = {get_key(k): v for k, v in pattern.items()} + converted.append(pattern) + obj = converted + return validate(TokenPatternSchema, {"pattern": obj}) + + +class TokenPatternString(BaseModel): + REGEX: Optional[StrictStr] + IN: Optional[List[StrictStr]] + NOT_IN: Optional[List[StrictStr]] + + class Config: + extra = "forbid" + + @validator("*", pre=True, whole=True) + def raise_for_none(cls, v): + if v is None: + raise ValueError("None / null is not allowed") + return v + + +class TokenPatternNumber(BaseModel): + REGEX: Optional[StrictStr] = None + IN: Optional[List[StrictInt]] = None + NOT_IN: Optional[List[StrictInt]] = None + EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==") + GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=") + LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=") + GT: Union[StrictInt, StrictFloat] = Field(None, alias=">") + LT: Union[StrictInt, StrictFloat] = Field(None, alias="<") + + class Config: + extra = "forbid" + + @validator("*", pre=True, whole=True) + def raise_for_none(cls, v): + if v is None: + raise ValueError("None / null is not allowed") + return v + + +class TokenPatternOperator(str, Enum): + plus: StrictStr = "+" + start: StrictStr = "*" + question: StrictStr = "?" + exclamation: StrictStr = "!" 
+ + +StringValue = Union[TokenPatternString, StrictStr] +NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] +UnderscoreValue = Union[ + TokenPatternString, TokenPatternNumber, str, int, float, list, bool, +] + + +class TokenPattern(BaseModel): + orth: Optional[StringValue] = None + text: Optional[StringValue] = None + lower: Optional[StringValue] = None + pos: Optional[StringValue] = None + tag: Optional[StringValue] = None + dep: Optional[StringValue] = None + lemma: Optional[StringValue] = None + shape: Optional[StringValue] = None + ent_type: Optional[StringValue] = None + norm: Optional[StringValue] = None + length: Optional[NumberValue] = None + is_alpha: Optional[StrictBool] = None + is_ascii: Optional[StrictBool] = None + is_digit: Optional[StrictBool] = None + is_lower: Optional[StrictBool] = None + is_upper: Optional[StrictBool] = None + is_title: Optional[StrictBool] = None + is_punct: Optional[StrictBool] = None + is_space: Optional[StrictBool] = None + is_bracket: Optional[StrictBool] = None + is_quote: Optional[StrictBool] = None + is_left_punct: Optional[StrictBool] = None + is_right_punct: Optional[StrictBool] = None + is_currency: Optional[StrictBool] = None + is_stop: Optional[StrictBool] = None + is_sent_start: Optional[StrictBool] = None + like_num: Optional[StrictBool] = None + like_url: Optional[StrictBool] = None + like_email: Optional[StrictBool] = None + op: Optional[TokenPatternOperator] = None + underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_") + + class Config: + extra = "forbid" + allow_population_by_field_name = True + alias_generator = lambda value: value.upper() + + @validator("*", pre=True) + def raise_for_none(cls, v): + if v is None: + raise ValueError("None / null is not allowed") + return v + + +class TokenPatternSchema(BaseModel): + pattern: List[TokenPattern] = Field(..., minItems=1) + + class Config: + extra = "forbid" + + +# Model meta + + +class ModelMetaSchema(BaseModel): + # fmt: 
off + lang: StrictStr = Field(..., title="Two-letter language code, e.g. 'en'") + name: StrictStr = Field(..., title="Model name") + version: StrictStr = Field(..., title="Model version") + spacy_version: Optional[StrictStr] = Field(None, title="Compatible spaCy version identifier") + parent_package: Optional[StrictStr] = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly") + pipeline: Optional[List[StrictStr]] = Field([], title="Names of pipeline components") + description: Optional[StrictStr] = Field(None, title="Model description") + license: Optional[StrictStr] = Field(None, title="Model license") + author: Optional[StrictStr] = Field(None, title="Model author name") + email: Optional[StrictStr] = Field(None, title="Model author email") + url: Optional[StrictStr] = Field(None, title="Model author URL") + sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources") + vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors") + accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers") + speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers") + # fmt: on + + +# Training data object in "simple training style" + + +class SimpleTrainingSchema(BaseModel): + # TODO: write + + class Config: + title = "Schema for training data dict in passed to nlp.update" + extra = "forbid" + + +# JSON training format + + +class TrainingSchema(BaseModel): + # TODO: write + + class Config: + title = "Schema for training data in spaCy's JSON format" + extra = "forbid" diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index 18243c306..da3bc7dbb 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -1,6 +1,4 @@ import pytest -from spacy.cli._schemas import TRAINING_SCHEMA -from spacy.util import get_json_validator, validate_json from spacy.tokens import Doc from ..util 
import get_doc @@ -55,10 +53,3 @@ def test_doc_to_json_underscore_error_serialize(doc): Doc.set_extension("json_test4", method=lambda doc: doc.text) with pytest.raises(ValueError): doc.to_json(underscore=["json_test4"]) - - -def test_doc_to_json_valid_training(doc): - json_doc = doc.to_json() - validator = get_json_validator(TRAINING_SCHEMA) - errors = validate_json([json_doc], validator) - assert not errors diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index adeef834d..3900f1e68 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -179,7 +179,7 @@ def test_matcher_match_one_plus(matcher): doc = Doc(control.vocab, words=["Philippe", "Philippe"]) m = control(doc) assert len(m) == 2 - pattern = [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}] + pattern = [{"ORTH": "Philippe"}, {"ORTH": "Philippe", "OP": "+"}] matcher.add("KleenePhilippe", [pattern]) m = matcher(doc) assert len(m) == 1 diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index a6a82f2e2..a2b2cd83f 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -6,18 +6,18 @@ from spacy.matcher import Matcher from spacy.tokens import Doc, Span -pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}] -pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A", "OP": "1"}] -pattern3 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "1"}] +pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}] +pattern2 = [{"ORTH": "A"}, {"ORTH": "A"}] +pattern3 = [{"ORTH": "A"}, {"ORTH": "A"}] pattern4 = [ - {"ORTH": "B", "OP": "1"}, + {"ORTH": "B"}, {"ORTH": "A", "OP": "*"}, - {"ORTH": "B", "OP": "1"}, + {"ORTH": "B"}, ] pattern5 = [ {"ORTH": "B", "OP": "*"}, {"ORTH": "A", "OP": "*"}, - {"ORTH": "B", "OP": "1"}, + {"ORTH": "B"}, ] re_pattern1 = "AA*" diff --git a/spacy/tests/matcher/test_pattern_validation.py 
b/spacy/tests/matcher/test_pattern_validation.py index c879cc0fe..ade724d05 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -1,8 +1,7 @@ import pytest from spacy.matcher import Matcher -from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA from spacy.errors import MatchPatternError -from spacy.util import get_json_validator, validate_json +from spacy.schemas import validate_token_pattern # (pattern, num errors with validation, num errors identified with minimal # checks) @@ -15,12 +14,12 @@ TEST_PATTERNS = [ ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), ([1, 2, 3], 3, 1), # Bad patterns flagged outside of Matcher - ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0), + ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) # Bad patterns not flagged with minimal checks ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0), - ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0), - ([{"LENGTH": {"VALUE": 5}}], 1, 0), - ([{"TEXT": {"VALUE": "foo"}}], 1, 0), + ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 4, 0), # prev: (2, 0) + ([{"LENGTH": {"VALUE": 5}}], 2, 0), # prev: (1, 0) + ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0) ([{"IS_DIGIT": -1}], 1, 0), ([{"ORTH": -1}], 1, 0), # Good patterns @@ -31,15 +30,9 @@ TEST_PATTERNS = [ ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0), ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0), ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0), + ([{"orth": "foo"}], 0, 0), # prev: xfail ] -XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)] - - -@pytest.fixture -def validator(): - return get_json_validator(TOKEN_PATTERN_SCHEMA) - @pytest.mark.parametrize( "pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]] @@ -51,15 +44,8 @@ def test_matcher_pattern_validation(en_vocab, pattern): @pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS) -def 
test_pattern_validation(validator, pattern, n_errors, _): - errors = validate_json(pattern, validator) - assert len(errors) == n_errors - - -@pytest.mark.xfail -@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS) -def test_xfail_pattern_validation(validator, pattern, n_errors, _): - errors = validate_json(pattern, validator) +def test_pattern_validation(pattern, n_errors, _): + errors = validate_token_pattern(pattern) assert len(errors) == n_errors diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py deleted file mode 100644 index 1330d3a65..000000000 --- a/spacy/tests/test_json_schemas.py +++ /dev/null @@ -1,47 +0,0 @@ -from spacy.util import get_json_validator, validate_json, validate_schema -from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA -from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA -import pytest - - -@pytest.fixture(scope="session") -def training_schema_validator(): - return get_json_validator(TRAINING_SCHEMA) - - -def test_validate_schema(): - validate_schema({"type": "object"}) - with pytest.raises(Exception): - validate_schema({"type": lambda x: x}) - - -@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA, TOKEN_PATTERN_SCHEMA]) -def test_schemas(schema): - validate_schema(schema) - - -@pytest.mark.parametrize( - "data", - [ - {"text": "Hello world"}, - {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]}, - ], -) -def test_json_schema_training_valid(data, training_schema_validator): - errors = validate_json([data], training_schema_validator) - assert not errors - - -@pytest.mark.parametrize( - "data,n_errors", - [ - ({"spans": []}, 1), - ({"text": "Hello", "ents": [{"start": "0", "end": "5", "label": "TEST"}]}, 2), - ({"text": "Hello", "ents": [{"start": 0, "end": 5}]}, 1), - ({"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "test"}]}, 1), - ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2), - ], -) -def test_json_schema_training_invalid(data, 
n_errors, training_schema_validator): - errors = validate_json([data], training_schema_validator) - assert len(errors) == n_errors diff --git a/spacy/util.py b/spacy/util.py index 4e6c10e2b..57bbee69f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,11 +13,6 @@ import srsly import catalogue import sys -try: - import jsonschema -except ImportError: - jsonschema = None - try: import cupy.random except ImportError: @@ -705,43 +700,6 @@ def fix_random_seed(seed=0): cupy.random.seed(seed) -def get_json_validator(schema): - # We're using a helper function here to make it easier to change the - # validator that's used (e.g. different draft implementation), without - # having to change it all across the codebase. - # TODO: replace with (stable) Draft6Validator, if available - if jsonschema is None: - raise ValueError(Errors.E136) - return jsonschema.Draft4Validator(schema) - - -def validate_schema(schema): - """Validate a given schema. This just checks if the schema itself is valid.""" - validator = get_json_validator(schema) - validator.check_schema(schema) - - -def validate_json(data, validator): - """Validate data against a given JSON schema (see https://json-schema.org). - - data: JSON-serializable data to validate. - validator (jsonschema.DraftXValidator): The validator. - RETURNS (list): A list of error messages, if available. - """ - errors = [] - for err in sorted(validator.iter_errors(data), key=lambda e: e.path): - if err.path: - err_path = "[{}]".format(" -> ".join([str(p) for p in err.path])) - else: - err_path = "" - msg = err.message + " " + err_path - if err.context: # Error has suberrors, e.g. if schema uses anyOf - suberrs = [f" - {suberr.message}" for suberr in err.context] - msg += f":\n{''.join(suberrs)}" - errors.append(msg) - return errors - - def get_serialization_exclude(serializers, exclude, kwargs): """Helper function to validate serialization args and manage transition from keyword arguments (pre v2.1) to exclude argument. 
From c22f0755098ba153d3617320e3c70fe64fdac2d1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Dec 2019 17:29:53 +0100 Subject: [PATCH 019/187] Update pydantic version pin [ci skip] --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 755f522e7..9516a3dda 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ install_requires = numpy>=1.15.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 - pydantic>=1.0.0,<2.0.0 + pydantic>=1.3.0,<2.0.0 [options.extras_require] lookups = From a892821c51ab61aa917cf8ed342867a0d3b31a35 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Dec 2019 17:59:52 +0100 Subject: [PATCH 020/187] More formatting changes --- fabfile.py | 6 ++---- spacy/__main__.py | 2 +- spacy/_ml.py | 2 +- spacy/cli/evaluate.py | 26 +++++++++++------------ spacy/cli/init_model.py | 4 ++-- spacy/cli/package.py | 2 +- spacy/cli/pretrain.py | 12 ++++------- spacy/cli/train.py | 10 ++++----- spacy/gold.pyx | 10 ++++----- spacy/kb.pyx | 2 +- spacy/lang/ca/tokenizer_exceptions.py | 4 ++-- spacy/lang/da/tokenizer_exceptions.py | 2 +- spacy/lang/el/tokenizer_exceptions.py | 8 +++---- spacy/lang/en/tokenizer_exceptions.py | 8 +++---- spacy/lang/es/tokenizer_exceptions.py | 4 ++-- spacy/lang/fr/tokenizer_exceptions.py | 2 +- spacy/language.py | 2 +- spacy/morphology.pyx | 2 +- spacy/pipeline/pipes.pyx | 4 ++-- spacy/syntax/nonproj.pyx | 3 +-- spacy/syntax/stateclass.pyx | 6 +++--- spacy/tests/conftest.py | 4 ++-- spacy/tests/pipeline/test_pipe_methods.py | 2 +- spacy/tokens/doc.pyx | 2 +- spacy/util.py | 4 ++-- 25 files changed, 63 insertions(+), 70 deletions(-) diff --git a/fabfile.py b/fabfile.py index 460471747..760c2c0e2 100644 --- a/fabfile.py +++ b/fabfile.py @@ -1,6 +1,6 @@ import contextlib from pathlib import Path -from fabric.api import local, lcd, env, settings, prefix +from fabric.api import local, lcd from os import path, environ import shutil import sys @@ -79,9 +79,7 @@ def pex(): with 
virtualenv(VENV_DIR) as venv_local: with lcd(path.dirname(__file__)): sha = local("git rev-parse --short HEAD", capture=True) - venv_local( - "pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True - ) + venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True) def clean(): diff --git a/spacy/__main__.py b/spacy/__main__.py index 06ba5704d..05e3d5e02 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -22,7 +22,7 @@ if __name__ == "__main__": if len(sys.argv) == 1: msg.info("Available commands", ", ".join(commands), exits=1) command = sys.argv.pop(1) - sys.argv[0] = "spacy %s" % command + sys.argv[0] = f"spacy {command}" if command in commands: plac.call(commands[command], sys.argv[1:]) else: diff --git a/spacy/_ml.py b/spacy/_ml.py index a1d2b6b77..37cfff0b7 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -296,7 +296,7 @@ def link_vectors_to_models(vocab): # This is a hack to avoid the problem in #3853. Maybe we should # print a warning as well? old_name = vectors.name - new_name = vectors.name + "_%d" % data.shape[0] + new_name = f"{vectors.name}_{data.shape[0]}" user_warning(Warnings.W019.format(old=old_name, new=new_name)) vectors.name = new_name key = (ops.device, vectors.name) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index de2cb4d09..a6b730d65 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -47,20 +47,20 @@ def evaluate( end = timer() nwords = sum(len(ex.doc) for ex in dev_dataset) results = { - "Time": "%.2f s" % (end - begin), + "Time": f"{end - begin:.2f} s", "Words": nwords, - "Words/s": "%.0f" % (nwords / (end - begin)), - "TOK": "%.2f" % scorer.token_acc, - "POS": "%.2f" % scorer.tags_acc, - "UAS": "%.2f" % scorer.uas, - "LAS": "%.2f" % scorer.las, - "NER P": "%.2f" % scorer.ents_p, - "NER R": "%.2f" % scorer.ents_r, - "NER F": "%.2f" % scorer.ents_f, - "Textcat": "%.2f" % scorer.textcat_score, - "Sent P": "%.2f" % scorer.sent_p, - "Sent R": "%.2f" % scorer.sent_r, - "Sent F": "%.2f" 
% scorer.sent_f, + "Words/s": f"{nwords / (end - begin):.0f}", + "TOK": f"{scorer.token_acc:.2f}", + "POS": f"{scorer.tags_acc:.2f}", + "UAS": f"{scorer.uas:.2f}", + "LAS": f"{scorer.las:.2f}", + "NER P": f"{scorer.ents_p:.2f}", + "NER R": f"{scorer.ents_r:.2f}", + "NER F": f"{scorer.ents_f:.2f}", + "Textcat": f"{scorer.textcat_score:.2f}", + "Sent P": f"{scorer.sent_p:.2f}", + "Sent R": f"{scorer.sent_r:.2f}", + "Sent F": f"{scorer.sent_f:.2f}", } msg.table(results, title="Results") diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index c3ef5267c..87583ba73 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -186,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None): if vectors_data is not None: nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) if name is None: - nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"] + nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors" else: nlp.vocab.vectors.name = name nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name @@ -232,7 +232,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): word = literal_eval(key) except SyntaxError: # Take odd strings literally. 
- word = literal_eval("'%s'" % key) + word = literal_eval(f"'{key}'") smooth_count = counts.smoother(int(freq)) probs[word] = math.log(smooth_count) - log_total oov_prob = math.log(counts.smoother(0)) - log_total diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8830a0ca2..edd9117c5 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -83,7 +83,7 @@ def generate_meta(model_path, existing_meta, msg): ("lang", "Model language", meta.get("lang", "en")), ("name", "Model name", meta.get("name", "model")), ("version", "Model version", meta.get("version", "0.0.0")), - ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__), + ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"), ("description", "Model description", meta.get("description", False)), ("author", "Author", meta.get("author", False)), ("email", "Author email", meta.get("email", False)), diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 75840923e..12aa8b5c2 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -179,14 +179,12 @@ def pretrain( else: if not epoch_start: msg.fail( - "You have to use the '--epoch-start' argument when using a renamed weight file for " - "'--init-tok2vec'", + "You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec", exits=True, ) elif epoch_start < 0: msg.fail( - "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid" - % epoch_start, + f"The argument --epoch-start has to be greater or equal to 0. 
{epoch_start} is invalid", exits=True, ) else: @@ -195,16 +193,14 @@ def pretrain( optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker(frequency=10000) - msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start) + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) def _save_model(epoch, is_temp=False): is_temp_str = ".temp" if is_temp else "" with model.use_params(optimizer.averages): - with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open( - "wb" - ) as file_: + with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: file_.write(model.tok2vec.to_bytes()) log = { "nr_word": tracker.nr_word, diff --git a/spacy/cli/train.py b/spacy/cli/train.py index e8662a101..df5456df3 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -375,7 +375,7 @@ def train( words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): util.set_env_log(False) - epoch_model_path = output_path / ("model%d" % i) + epoch_model_path = output_path / f"model{i}" nlp.to_disk(epoch_model_path) nlp_loaded = util.load_model_from_path(epoch_model_path) for beam_width in eval_beam_widths: @@ -414,13 +414,13 @@ def train( scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) end_time = timer() cpu_wps = nwords / (end_time - start_time) - acc_loc = output_path / ("model%d" % i) / "accuracy.json" + acc_loc = output_path / f"model{i}" / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names - meta["spacy_version"] = ">=%s" % about.__version__ + meta["spacy_version"] = f">={about.__version__}" if beam_width == 1: meta["speed"] = { "nwords": nwords, @@ -443,10 +443,10 @@ def train( "keys": nlp.vocab.vectors.n_keys, "name": 
nlp.vocab.vectors.name, } - meta.setdefault("name", "model%d" % i) + meta.setdefault("name", f"model{i}") meta.setdefault("version", version) meta["labels"] = nlp.meta["labels"] - meta_loc = output_path / ("model%d" % i) / "meta.json" + meta_loc = output_path / f"model{i}" / "meta.json" srsly.write_json(meta_loc, meta) util.set_env_log(verbose) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index e3af40d4d..1d3d8e034 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -615,7 +615,7 @@ def _consume_ent(tags): else: start = "B-" + label end = "L-" + label - middle = ["I-%s" % label for _ in range(1, length - 1)] + middle = [f"I-{label}" for _ in range(1, length - 1)] return [start] + middle + [end] @@ -1204,12 +1204,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): # Only interested if the tokenization is correct if start_token is not None and end_token is not None: if start_token == end_token: - biluo[start_token] = "U-%s" % label + biluo[start_token] = f"U-{label}" else: - biluo[start_token] = "B-%s" % label + biluo[start_token] = f"B-{label}" for i in range(start_token+1, end_token): - biluo[i] = "I-%s" % label - biluo[end_token] = "L-%s" % label + biluo[i] = f"I-{label}" + biluo[end_token] = f"L-{label}" # Now distinguish the O cases from ones where we miss the tokenization entity_chars = set() for start_char, end_char, label in entities: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 1129fa860..64fbb1e29 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -442,7 +442,7 @@ cdef class KnowledgeBase: cdef class Writer: def __init__(self, object loc): if path.exists(loc): - assert not path.isdir(loc), "%s is directory." 
% loc + assert not path.isdir(loc), f"{loc} is directory" if isinstance(loc, Path): loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index 5a9d9055a..b4ae61a2d 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -30,9 +30,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] for h in range(1, 12 + 1): for period in ["a.m.", "am"]: - _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] for period in ["p.m.", "pm"]: - _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index 64eba819f..c8ea9cbf5 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -559,7 +559,7 @@ for exc_data in [ # Dates for h in range(1, 31 + 1): for period in ["."]: - _exc["%d%s" % (h, period)] = [{ORTH: "%d." 
% h}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}."}] _custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]} _exc.update(_custom_base_exc) diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py index 27ae1fe3a..112fd991b 100644 --- a/spacy/lang/el/tokenizer_exceptions.py +++ b/spacy/lang/el/tokenizer_exceptions.py @@ -131,14 +131,14 @@ _exc.update(_other_exc) for h in range(1, 12 + 1): for period in ["π.μ.", "πμ"]: - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, + _exc[f"{h}{period}"] = [ + {ORTH: f"{h}"}, {ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."}, ] for period in ["μ.μ.", "μμ"]: - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, + _exc[f"{h}{period}"] = [ + {ORTH: f"{h}"}, {ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."}, ] diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 776948c28..3e8075ec4 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -328,13 +328,13 @@ for exc_data in [ for h in range(1, 12 + 1): for period in ["a.m.", "am"]: - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, + _exc[f"{h}{period}"] = [ + {ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}, ] for period in ["p.m.", "pm"]: - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, + _exc[f"{h}{period}"] = [ + {ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}, ] diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 1cd5941be..5c7fcb15d 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -28,9 +28,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] for h in range(1, 12 + 1): for period in ["a.m.", "am"]: - _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] for period in ["p.m.", "pm"]: - _exc["%d%s" % (h, period)] = 
[{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}] + _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] for orth in [ diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index b1c0a53af..4e2e7fb18 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -85,7 +85,7 @@ for verb, verb_lemma in [("est", "être")]: for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]: for orth in [pre, pre.title()]: - _exc["%sest-ce" % orth] = [ + _exc[f"{orth}est-ce"] = [ {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"}, {LEMMA: "être", ORTH: "est", TAG: "VERB"}, {LEMMA: "ce", ORTH: "-ce"}, diff --git a/spacy/language.py b/spacy/language.py index 4a553bcaf..4ae729588 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1015,7 +1015,7 @@ def _fix_pretrained_vectors_name(nlp): elif not nlp.vocab.vectors.size: nlp.vocab.vectors.name = None elif "name" in nlp.meta and "lang" in nlp.meta: - vectors_name = "%s_%s.vectors" % (nlp.meta["lang"], nlp.meta["name"]) + vectors_name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" nlp.vocab.vectors.name = vectors_name else: raise ValueError(Errors.E092) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index f12691170..8030a9a28 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -72,7 +72,7 @@ def _normalize_props(props): # just take the first one :( if "|" in value: value = value.split("|")[0] - attr = '%s_%s' % (key, value) + attr = f"{key}_{value}" if attr in FEATURES: props.pop(key) props[attr] = True diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index ff88340cd..5ca651077 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -985,14 +985,14 @@ class MultitaskObjective(Tagger): offset = token_annotation.heads[i] - i offset = min(offset, 2) offset = max(offset, -2) - return "%s-%s:%d" % (token_annotation.deps[i], token_annotation.tags[i], offset) + return 
f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}" @staticmethod def make_ent_tag(i, token_annotation): if token_annotation.entities is None or token_annotation.entities[i] is None: return None else: - return "%s-%s" % (token_annotation.tags[i], token_annotation.entities[i]) + return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}" @staticmethod def make_sent_start(target, token_annotation, cache=True, _cache={}): diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 0f738f99f..f024c1f05 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -154,8 +154,7 @@ def _decorate(heads, proj_heads, labels): deco_labels = [] for tokenid, head in enumerate(heads): if head != proj_heads[tokenid]: - deco_labels.append( - '%s%s%s' % (labels[tokenid], DELIMITER, labels[head])) + deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}") else: deco_labels.append(labels[tokenid]) return deco_labels diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 47b37946c..e472e9861 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -46,9 +46,9 @@ cdef class StateClass: def print_state(self, words): words = list(words) + ['_'] - top = words[self.S(0)] + '_%d' % self.S_(0).head - second = words[self.S(1)] + '_%d' % self.S_(1).head - third = words[self.S(2)] + '_%d' % self.S_(2).head + top = f"{words[self.S(0)]}_{self.S_(0).head}" + second = f"{words[self.S(1)]}_{self.S_(1).head}" + third = f"{words[self.S(2)]}_{self.S_(2).head}" n0 = words[self.B(0)] n1 = words[self.B(1)] return ' '.join((third, second, top, '|', n0, n1)) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ba7b67e25..b391dd88e 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -14,11 +14,11 @@ def pytest_runtest_setup(item): # recognize the option we're asking about. To avoid this, we need to # pass a default value. 
We default to False, i.e., we act like all the # options weren't given. - return item.config.getoption("--%s" % opt, False) + return item.config.getoption(f"--{opt}", False) for opt in ["slow"]: if opt in item.keywords and not getopt(opt): - pytest.skip("need --%s option to run" % opt) + pytest.skip(f"need --{opt} option to run") # Fixtures for language tokenizers (languages sorted alphabetically) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 3ec8b508d..e2fb02a2a 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -115,7 +115,7 @@ def test_disable_pipes_list_arg(nlp): @pytest.mark.parametrize("n_pipes", [100]) def test_add_lots_of_pipes(nlp, n_pipes): for i in range(n_pipes): - nlp.add_pipe(lambda doc: doc, name="pipe_%d" % i) + nlp.add_pipe(lambda doc: doc, name=f"pipe_{i}") assert len(nlp.pipe_names) == n_pipes diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 58423c420..7e6473d56 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -499,7 +499,7 @@ cdef class Doc: token = &self.c[i] if token.ent_iob == 1: if start == -1: - seq = ["%s|%s" % (t.text, t.ent_iob_) for t in self[i-5:i+5]] + seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] raise ValueError(Errors.E093.format(seq=" ".join(seq))) elif token.ent_iob == 2 or token.ent_iob == 0: if start != -1: diff --git a/spacy/util.py b/spacy/util.py index 57bbee69f..55e197eb2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -62,7 +62,7 @@ def get_lang_class(lang): return registry.languages.get(lang) else: try: - module = importlib.import_module(".lang.%s" % lang, "spacy") + module = importlib.import_module(f".lang.{lang}", "spacy") except ImportError as err: raise ImportError(Errors.E048.format(lang=lang, err=err)) set_lang_class(lang, getattr(module, module.__all__[0])) @@ -212,7 +212,7 @@ def load_model_from_init_py(init_file, **overrides): """ model_path = 
Path(init_file).parent meta = get_model_meta(model_path) - data_dir = "%s_%s-%s" % (meta["lang"], meta["name"], meta["version"]) + data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}" data_path = model_path / data_dir if not model_path.exists(): raise IOError(Errors.E052.format(path=data_path)) From 401946d480d2841139c2b8986d900da0d5e12e40 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Dec 2019 18:02:20 +0100 Subject: [PATCH 021/187] Un-xfail passing tests --- spacy/tests/lang/en/test_prefix_suffix_infix.py | 1 - spacy/tests/lang/en/test_punct.py | 1 - spacy/tests/lang/ru/test_tokenizer.py | 1 - spacy/tests/lang/sr/test_tokenizer.py | 1 - 4 files changed, 4 deletions(-) diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py index 8c9c58fea..9efcc1015 100644 --- a/spacy/tests/lang/en/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py @@ -108,7 +108,6 @@ def test_en_tokenizer_splits_double_hyphen_infix(en_tokenizer): assert tokens[9].text == "people" -@pytest.mark.xfail def test_en_tokenizer_splits_period_abbr(en_tokenizer): text = "Today is Tuesday.Mr." 
tokens = en_tokenizer(text) diff --git a/spacy/tests/lang/en/test_punct.py b/spacy/tests/lang/en/test_punct.py index 4dc6ddfe4..1d10478a1 100644 --- a/spacy/tests/lang/en/test_punct.py +++ b/spacy/tests/lang/en/test_punct.py @@ -79,7 +79,6 @@ def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text): assert tokens[0].text == "'" -@pytest.mark.xfail @pytest.mark.parametrize("text", ["Hello''"]) def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text): tokens = en_tokenizer(text) diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py index e05a479aa..1cfdc50ee 100644 --- a/spacy/tests/lang/ru/test_tokenizer.py +++ b/spacy/tests/lang/ru/test_tokenizer.py @@ -77,7 +77,6 @@ def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text): assert tokens[0].text == "'" -@pytest.mark.xfail @pytest.mark.parametrize("text", ["Тест''"]) def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text): tokens = ru_tokenizer(text) diff --git a/spacy/tests/lang/sr/test_tokenizer.py b/spacy/tests/lang/sr/test_tokenizer.py index 03a0470bd..fdcf790d8 100644 --- a/spacy/tests/lang/sr/test_tokenizer.py +++ b/spacy/tests/lang/sr/test_tokenizer.py @@ -77,7 +77,6 @@ def test_sr_tokenizer_splits_open_appostrophe(sr_tokenizer, text): assert tokens[0].text == "'" -@pytest.mark.xfail @pytest.mark.parametrize("text", ["Тест''"]) def test_sr_tokenizer_splits_double_end_quote(sr_tokenizer, text): tokens = sr_tokenizer(text) From 83e0a6f3e3bc21e32d95cbe8fcf2f8dd4fa76c65 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 1 Jan 2020 13:15:46 +0100 Subject: [PATCH 022/187] Modernize plac commands for Python 3 (#4836) --- spacy/cli/convert.py | 37 +++++---------- spacy/cli/debug_data.py | 34 +++++-------- spacy/cli/download.py | 12 ++--- spacy/cli/evaluate.py | 26 ++++------ spacy/cli/info.py | 12 ++--- spacy/cli/init_model.py | 37 +++++---------- spacy/cli/link.py | 13 +++-- spacy/cli/package.py | 18 +++---- spacy/cli/pretrain.py | 103 
++++++++++------------------------------ spacy/cli/profile.py | 14 +++--- spacy/cli/train.py | 91 ++++++++++++----------------------- 11 files changed, 129 insertions(+), 268 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index d8c8a7a18..31931db68 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -1,4 +1,3 @@ -import plac from pathlib import Path from wasabi import Printer import srsly @@ -26,31 +25,19 @@ FILE_TYPES = ("json", "jsonl", "msg") FILE_TYPES_STDOUT = ("json", "jsonl") -@plac.annotations( - # fmt: off - input_file=("Input file", "positional", None, str), - output_dir=("Output directory. '-' for stdout.", "positional", None, str), - file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES), - n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int), - seg_sents=("Segment sentences (for -c ner)", "flag", "s"), - model=("Model for sentence segmentation (for -s)", "option", "b", str), - converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str), - lang=("Language (if tokenizer required)", "option", "l", str), - morphology=("Enable appending morphology to tags", "flag", "m", bool), - ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,), - # fmt: on -) def convert( - input_file, - output_dir="-", - file_type="json", - n_sents=1, - seg_sents=False, - model=None, - morphology=False, - converter="auto", - ner_map_path=None, - lang=None, + # fmt: off + input_file: ("Input file", "positional", None, str), + output_dir: ("Output directory. 
'-' for stdout.", "positional", None, str) = "-", + file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json", + n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1, + seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, + model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, + morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, + converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", + ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, + lang: ("Language (if tokenizer required)", "option", "l", str) = None, + # fmt: on ): """ Convert files into JSON format for use with train command and other diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 2e780f53c..c894788cb 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -1,6 +1,5 @@ from pathlib import Path from collections import Counter -import plac import sys import srsly from wasabi import Printer, MESSAGES @@ -19,29 +18,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100 BLANK_MODEL_THRESHOLD = 2000 -@plac.annotations( - # fmt: off - lang=("model language", "positional", None, str), - train_path=("location of JSON-formatted training data", "positional", None, Path), - dev_path=("location of JSON-formatted development data", "positional", None, Path), - tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), - base_model=("name of model to update (optional)", "option", "b", str), - pipeline=("Comma-separated names of pipeline components to train", "option", "p", str), - ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), - verbose=("Print additional information and explanations", "flag", "V", bool), - no_format=("Don't pretty-print the results", "flag", "NF", bool), - # fmt: on -) def debug_data( - lang, - train_path, - dev_path, - 
tag_map_path=None, - base_model=None, - pipeline="tagger,parser,ner", - ignore_warnings=False, - verbose=False, - no_format=False, + # fmt: off + lang: ("Model language", "positional", None, str), + train_path: ("Location of JSON-formatted training data", "positional", None, Path), + dev_path: ("Location of JSON-formatted development data", "positional", None, Path), + tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, + base_model: ("Name of model to update (optional)", "option", "b", str) = None, + pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner", + ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False, + verbose: ("Print additional information and explanations", "flag", "V", bool) = False, + no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False, + # fmt: on ): """ Analyze, debug and validate your training and development data, get useful diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 7c87a582a..7388bf615 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,4 +1,3 @@ -import plac import requests import os import subprocess @@ -10,12 +9,11 @@ from ..util import get_package_path from .. import about -@plac.annotations( - model=("Model to download (shortcut or name)", "positional", None, str), - direct=("Force direct download of name + version", "flag", "d", bool), - pip_args=("Additional arguments to be passed to `pip install` on model install"), -) -def download(model, direct=False, *pip_args): +def download( + model: ("Model to download (shortcut or name)", "positional", None, str), + direct: ("Force direct download of name + version", "flag", "d", bool) = False, + *pip_args: ("Additional arguments to be passed to `pip install` on model install"), +): """ Download compatible model from default download path using pip. 
Model can be shortcut, model name or, if --direct flag is set, full model name diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index a6b730d65..e047f1283 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,4 +1,3 @@ -import plac from timeit import default_timer as timer from wasabi import msg @@ -7,23 +6,16 @@ from .. import util from .. import displacy -@plac.annotations( - model=("Model name or path", "positional", None, str), - data_path=("Location of JSON-formatted evaluation data", "positional", None, str), - gold_preproc=("Use gold preprocessing", "flag", "G", bool), - gpu_id=("Use GPU", "option", "g", int), - displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), - displacy_limit=("Limit of parses to render as HTML", "option", "dl", int), - return_scores=("Return dict containing model scores", "flag", "R", bool), -) def evaluate( - model, - data_path, - gpu_id=-1, - gold_preproc=False, - displacy_path=None, - displacy_limit=25, - return_scores=False, + # fmt: off + model: ("Model name or path", "positional", None, str), + data_path: ("Location of JSON-formatted evaluation data", "positional", None, str), + gpu_id: ("Use GPU", "option", "g", int) = -1, + gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, + displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None, + displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25, + return_scores: ("Return dict containing model scores", "flag", "R", bool) = False, + # fmt: on ): """ Evaluate a model. To render a sample of parses in a HTML file, set an diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 060a38e78..fc8764ca8 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,4 +1,3 @@ -import plac import platform from pathlib import Path from wasabi import msg @@ -8,12 +7,11 @@ from .. import util from .. 
import about -@plac.annotations( - model=("Optional shortcut link of model", "positional", None, str), - markdown=("Generate Markdown for GitHub issues", "flag", "md", str), - silent=("Don't print anything (just return)", "flag", "s"), -) -def info(model=None, markdown=False, silent=False): +def info( + model: ("Optional shortcut link of model", "positional", None, str) = None, + markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False, + silent: ("Don't print anything (just return)", "flag", "s") = False, +): """ Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 87583ba73..babef106c 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -1,4 +1,3 @@ -import plac import math from tqdm import tqdm import numpy @@ -24,32 +23,18 @@ except ImportError: DEFAULT_OOV_PROB = -20 -@plac.annotations( - lang=("Model language", "positional", None, str), - output_dir=("Model output directory", "positional", None, Path), - freqs_loc=("Location of words frequencies file", "option", "f", Path), - jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path), - clusters_loc=("Optional location of brown clusters data", "option", "c", str), - vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str), - prune_vectors=("Optional number of vectors to prune to", "option", "V", int), - vectors_name=( - "Optional name for the word vectors, e.g. 
en_core_web_lg.vectors", - "option", - "vn", - str, - ), - model_name=("Optional name for the model meta", "option", "mn", str), -) def init_model( - lang, - output_dir, - freqs_loc=None, - clusters_loc=None, - jsonl_loc=None, - vectors_loc=None, - prune_vectors=-1, - vectors_name=None, - model_name=None, + # fmt: off + lang: ("Model language", "positional", None, str), + output_dir: ("Model output directory", "positional", None, Path), + freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None, + clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None, + jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None, + vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None, + prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1, + vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None, + model_name: ("Optional name for the model meta", "option", "mn", str) = None, + # fmt: on ): """ Create a new model from raw data, like word frequencies, Brown clusters diff --git a/spacy/cli/link.py b/spacy/cli/link.py index df24adc23..d8af469dc 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -1,4 +1,3 @@ -import plac from pathlib import Path from wasabi import msg @@ -6,12 +5,12 @@ from ..compat import symlink_to from .. 
import util -@plac.annotations( - origin=("package name or local path to model", "positional", None, str), - link_name=("name of shortuct link to create", "positional", None, str), - force=("force overwriting of existing link", "flag", "f", bool), -) -def link(origin, link_name, force=False, model_path=None): +def link( + origin: ("package name or local path to model", "positional", None, str), + link_name: ("name of shortuct link to create", "positional", None, str), + force: ("force overwriting of existing link", "flag", "f", bool) = False, + model_path=None, +): """ Create a symlink for models within the spacy/data directory. Accepts either the name of a pip package, or the local path to the model data diff --git a/spacy/cli/package.py b/spacy/cli/package.py index edd9117c5..8e27e44d0 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,4 +1,3 @@ -import plac import shutil from pathlib import Path from wasabi import msg, get_raw_input @@ -8,14 +7,15 @@ from .. import util from .. import about -@plac.annotations( - input_dir=("Directory with model data", "positional", None, str), - output_dir=("Output parent directory", "positional", None, str), - meta_path=("Path to meta.json", "option", "m", str), - create_meta=("Create meta.json, even if one exists", "flag", "c", bool), - force=("Force overwriting existing model in output directory", "flag", "f", bool), -) -def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False): +def package( + # fmt: off + input_dir: ("Directory with model data", "positional", None, str), + output_dir: ("Output parent directory", "positional", None, str), + meta_path: ("Path to meta.json", "option", "m", str) = None, + create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False, + force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False, + # fmt: on +): """ Generate Python package for model data, including meta and required installation files. 
A new directory will be created in the specified diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 12aa8b5c2..9e2fc5b1c 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -1,4 +1,3 @@ -import plac import random import numpy import time @@ -21,85 +20,31 @@ from .. import util from .train import _load_pretrained_tok2vec -@plac.annotations( - texts_loc=( - "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the " - "key 'tokens'", - "positional", - None, - str, - ), - vectors_model=("Name or path to spaCy model with vectors to learn from"), - output_dir=("Directory to write models to on each epoch", "positional", None, str), - width=("Width of CNN layers", "option", "cw", int), - depth=("Depth of CNN layers", "option", "cd", int), - cnn_window=("Window size for CNN layers", "option", "cW", int), - cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int), - use_chars=("Whether to use character-based embedding", "flag", "chr", bool), - sa_depth=("Depth of self-attention layers", "option", "sa", int), - bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int), - embed_rows=("Number of embedding rows", "option", "er", int), - loss_func=( - "Loss function to use for the objective. Either 'L2' or 'cosine'", - "option", - "L", - str, - ), - use_vectors=("Whether to use the static vectors as input features", "flag", "uv"), - dropout=("Dropout rate", "option", "d", float), - batch_size=("Number of words per training batch", "option", "bs", int), - max_length=( - "Max words per example. Longer examples are discarded", - "option", - "xw", - int, - ), - min_length=( - "Min words per example. 
Shorter examples are discarded", - "option", - "nw", - int, - ), - seed=("Seed for random number generators", "option", "s", int), - n_iter=("Number of iterations to pretrain", "option", "i", int), - n_save_every=("Save model every X batches.", "option", "se", int), - init_tok2vec=( - "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", - "option", - "t2v", - Path, - ), - epoch_start=( - "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been " - "renamed. Prevents unintended overwriting of existing weight files.", - "option", - "es", - int, - ), -) def pretrain( - texts_loc, - vectors_model, - output_dir, - width=96, - depth=4, - bilstm_depth=0, - cnn_pieces=3, - sa_depth=0, - use_chars=False, - cnn_window=1, - embed_rows=2000, - loss_func="cosine", - use_vectors=False, - dropout=0.2, - n_iter=1000, - batch_size=3000, - max_length=500, - min_length=5, - seed=0, - n_save_every=None, - init_tok2vec=None, - epoch_start=None, + # fmt: off + texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), + vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str), + output_dir: ("Directory to write models to on each epoch", "positional", None, str), + width: ("Width of CNN layers", "option", "cw", int) = 96, + depth: ("Depth of CNN layers", "option", "cd", int) = 4, + bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0, + cnn_pieces: ("Maxout size for CNN layers. 
1 for Mish", "option", "cP", int) = 3, + sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0, + use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False, + cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1, + embed_rows: ("Number of embedding rows", "option", "er", int) = 2000, + loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine", + use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False, + dropout: ("Dropout rate", "option", "d", float) = 0.2, + n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000, + batch_size: ("Number of words per training batch", "option", "bs", int) = 3000, + max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500, + min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5, + seed: ("Seed for random number generators", "option", "s", int) = 0, + n_save_every: ("Save model every X batches.", "option", "se", int) = None, + init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, + epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None, + # fmt: on ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index f3df0817d..44e59971a 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -1,4 +1,3 @@ -import plac import tqdm from pathlib import Path import srsly @@ -12,12 +11,13 @@ from wasabi import msg from ..util import load_model -@plac.annotations( - model=("Model to load", "positional", None, str), - inputs=("Location of input file. 
'-' for stdin.", "positional", None, str), - n_texts=("Maximum number of texts to use if available", "option", "n", int), -) -def profile(model, inputs=None, n_texts=10000): +def profile( + # fmt: off + model: ("Model to load", "positional", None, str), + inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None, + n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000, + # fmt: on +): """ Profile a spaCy pipeline, to find out which functions take the most time. Input should be formatted as one JSON object per line with a key "text". diff --git a/spacy/cli/train.py b/spacy/cli/train.py index df5456df3..454403529 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,4 +1,3 @@ -import plac import os import tqdm from pathlib import Path @@ -17,67 +16,37 @@ from .. import util from .. import about -@plac.annotations( - # fmt: off - lang=("Model language", "positional", None, str), - output_path=("Output directory to store model in", "positional", None, Path), - train_path=("Location of JSON-formatted training data", "positional", None, Path), - dev_path=("Location of JSON-formatted development data", "positional", None, Path), - raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), - base_model=("Name of model to update (optional)", "option", "b", str), - pipeline=("Comma-separated names of pipeline components", "option", "p", str), - vectors=("Model to load vectors from", "option", "v", str), - n_iter=("Number of iterations", "option", "n", int), - n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int), - n_examples=("Number of examples", "option", "ns", int), - use_gpu=("Use GPU", "option", "g", int), - version=("Model version", "option", "V", str), - meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), - init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. 
See 'spacy pretrain'. Experimental.", "option", "t2v", Path), - parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str), - entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str), - noise_level=("Amount of corruption for data augmentation", "option", "nl", float), - orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float), - eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str), - gold_preproc=("Use gold preprocessing", "flag", "G", bool), - learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool), - textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool), - textcat_arch=("Textcat model architecture", "option", "ta", str), - textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), - tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), - verbose=("Display more information for debug", "flag", "VV", bool), - debug=("Run data diagnostics before training", "flag", "D", bool), - # fmt: on -) def train( - lang, - output_path, - train_path, - dev_path, - raw_text=None, - base_model=None, - pipeline="tagger,parser,ner", - vectors=None, - n_iter=30, - n_early_stopping=None, - n_examples=0, - use_gpu=-1, - version="0.0.0", - meta_path=None, - init_tok2vec=None, - parser_multitasks="", - entity_multitasks="", - noise_level=0.0, - orth_variant_level=0.0, - eval_beam_widths="", - gold_preproc=False, - learn_tokens=False, - textcat_multilabel=False, - textcat_arch="bow", - textcat_positive_label=None, - tag_map_path=None, - verbose=False, - debug=False, + # fmt: off + lang: ("Model language", "positional", None, str), + output_path: ("Output directory to store model in", "positional", None, Path), + train_path: ("Location of JSON-formatted training data", "positional", None, Path), + dev_path: 
("Location of JSON-formatted development data", "positional", None, Path), + raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, + base_model: ("Name of model to update (optional)", "option", "b", str) = None, + pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner", + vectors: ("Model to load vectors from", "option", "v", str) = None, + n_iter: ("Number of iterations", "option", "n", int) = 30, + n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None, + n_examples: ("Number of examples", "option", "ns", int) = 0, + use_gpu: ("Use GPU", "option", "g", int) = -1, + version: ("Model version", "option", "V", str) = "0.0.0", + meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None, + init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, + parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "", + entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "", + noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0, + orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0, + eval_beam_widths: ("Beam widths to evaluate, e.g. 
4,8", "option", "bw", str) = "", + gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, + learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False, + textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False, + textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow", + textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None, + tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, + verbose: ("Display more information for debug", "flag", "VV", bool) = False, + debug: ("Run data diagnostics before training", "flag", "D", bool) = False, + # fmt: on ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's From 581eeed98b7a5a4565ea8286ef78bfce01667535 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 1 Jan 2020 13:16:48 +0100 Subject: [PATCH 023/187] Warning goldparse (#4851) * label in span not writable anymore * Revert "label in span not writable anymore" This reverts commit ab442338c8c4ddd7dfbc15348f999b74f4928090. * provide more friendly error msg for parsing file --- spacy/errors.py | 1 + spacy/gold.pyx | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 3aa4bedea..7393ddc07 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -534,6 +534,7 @@ class Errors(object): "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") # TODO: fix numbering after merging develop into master + E996 = ("Could not parse {file}: {msg}") E997 = ("Tokenizer special cases are not allowed to modify the text. 
" "This would map '{chunk}' to '{orth}' given token attributes " "'{token_attrs}'.") diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1d3d8e034..10b8bf0cf 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -194,9 +194,10 @@ class GoldCorpus(object): i = 0 for loc in locs: loc = util.ensure_path(loc) - if loc.parts[-1].endswith("json"): + file_name = loc.parts[-1] + if file_name.endswith("json"): examples = read_json_file(loc) - elif loc.parts[-1].endswith("jsonl"): + elif file_name.endswith("jsonl"): gold_tuples = srsly.read_jsonl(loc) first_gold_tuple = next(gold_tuples) gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) @@ -212,17 +213,24 @@ class GoldCorpus(object): doc = ex_dict.get("text", None) examples.append(Example.from_dict(ex_dict, doc=doc)) - elif loc.parts[-1].endswith("msg"): + elif file_name.endswith("msg"): text, ex_dict = srsly.read_msgpack(loc) examples = [Example.from_dict(ex_dict, doc=text)] else: supported = ("json", "jsonl", "msg") raise ValueError(Errors.E124.format(path=loc, formats=supported)) - for example in examples: - yield example - i += 1 - if limit and i >= limit: - return + try: + for example in examples: + yield example + i += 1 + if limit and i >= limit: + return + except KeyError as e: + msg = "Missing key {}".format(e) + raise KeyError(Errors.E996.format(file=file_name, msg=msg)) + except UnboundLocalError as e: + msg = "Unexpected document structure" + raise ValueError(Errors.E996.format(file=file_name, msg=msg)) @property def dev_examples(self): From e1b493ae8521af36fd1f0dfeafe7d9eb0408fe75 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 8 Jan 2020 16:51:24 +0100 Subject: [PATCH 024/187] Add sentrec shortcut to Language (#4890) --- spacy/language.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index 4ae729588..b91903595 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -225,6 +225,10 @@ class Language(object): def linker(self): return 
self.get_pipe("entity_linker") + @property + def sentrec(self): + return self.get_pipe("sentrec") + @property def matcher(self): return self.get_pipe("matcher") From e55fa1899aa8bae311064004d0edaed8b37979e5 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 8 Jan 2020 16:51:51 +0100 Subject: [PATCH 025/187] Report length of dev dataset correctly (#4891) --- spacy/cli/debug_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index c894788cb..8c77f7356 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -107,7 +107,7 @@ def debug_data( else: msg.text(f"Starting with blank model '{lang}'") msg.text(f"{len(train_dataset)} training docs") - msg.text(f"{len(gold_dev_data)} evaluation docs") + msg.text(f"{len(dev_dataset)} evaluation docs") if not len(gold_dev_data): msg.fail("No evaluation docs") From d2f3a44b42bfff9773fdf3abaccdcc0e78d295f7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 8 Jan 2020 16:52:14 +0100 Subject: [PATCH 026/187] Improve train CLI sentrec scoring (#4892) * reorder to metrics to prioritize F over P/R * add sentrec to model metrics --- spacy/cli/train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 454403529..6ebf5d37d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -476,6 +476,8 @@ def _score_for_model(meta): mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3) if "textcat" in pipes: mean_acc.append(acc["textcat_score"]) + if "sentrec" in pipes: + mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3) return sum(mean_acc) / len(mean_acc) @@ -554,7 +556,7 @@ def _get_metrics(component): elif component == "ner": return ("ents_f", "ents_p", "ents_r") elif component == "sentrec": - return ("sent_p", "sent_r", "sent_f",) + return ("sent_f", "sent_p", "sent_r") return ("token_acc",) From 199d89943e546eefb76656ed933ca6ab34296662 Mon Sep 17 
00:00:00 2001 From: adrianeboyd Date: Wed, 22 Jan 2020 15:40:31 +0100 Subject: [PATCH 027/187] Add as_example to Sentencizer pipe() (#4933) --- spacy/pipeline/pipes.pyx | 24 ++++++++++++++++++------ spacy/tests/pipeline/test_sentencizer.py | 6 ++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 5ca651077..4f0f2469e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1711,12 +1711,24 @@ class Sentencizer(Pipe): return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) - tag_ids = self.predict(docs) - self.set_annotations(docs, tag_ids) - yield from docs + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + for examples in util.minibatch(stream, size=batch_size): + docs = [self._get_doc(ex) for ex in examples] + predictions = self.predict(docs) + if isinstance(predictions, tuple) and len(tuple) == 2: + scores, tensors = predictions + self.set_annotations(docs, scores, tensors=tensors) + else: + self.set_annotations(docs, predictions) + + if as_example: + annotated_examples = [] + for ex, doc in zip(examples, docs): + ex.doc = doc + annotated_examples.append(ex) + yield from annotated_examples + else: + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 78ab6d2d1..5f9c55dbb 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -24,6 +24,12 @@ def test_sentencizer_pipe(): sent_starts = [t.is_sent_start for t in doc] assert sent_starts == [True, False, True, False, False, False, False] assert len(list(doc.sents)) == 2 + for ex in nlp.pipe(texts, as_example=True): + doc = ex.doc + assert doc.is_sentenced + sent_starts = [t.is_sent_start for t in doc] + assert 
sent_starts == [True, False, True, False, False, False, False] + assert len(list(doc.sents)) == 2 @pytest.mark.parametrize( From 0a0de85409e796d37b3b74796e2475c37d131c1b Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 23 Jan 2020 22:00:24 +0100 Subject: [PATCH 028/187] Fix gold training (#4938) * label in span not writable anymore * Revert "label in span not writable anymore" This reverts commit ab442338c8c4ddd7dfbc15348f999b74f4928090. * ensure doc is not None --- spacy/gold.pyx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 10b8bf0cf..0dfa32c84 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -326,14 +326,18 @@ class GoldCorpus(object): @classmethod def _make_golds(cls, examples, vocab=None, make_projective=False, ignore_misaligned=False): + filtered_examples = [] for example in examples: gold_parses = example.get_gold_parses(vocab=vocab, make_projective=make_projective, ignore_misaligned=ignore_misaligned) assert len(gold_parses) == 1 - assert gold_parses[0][0] == example.doc - example.goldparse = gold_parses[0][1] - return examples + doc, gold = gold_parses[0] + if doc: + assert doc == example.doc + example.goldparse = gold + filtered_examples.append(example) + return filtered_examples def make_orth_variants(nlp, example, orth_variant_level=0.0): From adc974571803f984d73b27d70320c4856001a4fd Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 23 Jan 2020 22:01:54 +0100 Subject: [PATCH 029/187] Modify morphology to support arbitrary features (#4932) * Restructure tag maps for MorphAnalysis changes Prepare tag maps for upcoming MorphAnalysis changes that allow arbitrary features.
* Use default tag map rather than duplicating for ca / uk / vi * Import tag map into defaults for ga * Modify tag maps so all morphological fields and features are strings * Move features from `"Other"` to the top level * Rewrite tuples as strings separated by `","` * Rewrite morph symbols for fr lemmatizer as strings * Export MorphAnalysis under spacy.tokens * Modify morphology to support arbitrary features Modify `Morphology` and `MorphAnalysis` so that arbitrary features are supported. * Modify `MorphAnalysisC` so that it can support arbitrary features and multiple values per field. `MorphAnalysisC` is redesigned to contain: * key: hash of UD FEATS string of morphological features * array of `MorphFeatureC` structs that each contain a hash of `Field` and `Field=Value` for a given morphological feature, which makes it possible to: * find features by field * represent multiple values for a given field * `get_field()` is renamed to `get_by_field()` and is no longer `nogil`. Instead a new helper function `get_n_by_field()` is `nogil` and returns `n` features by field. * `MorphAnalysis.get()` returns all possible values for a field as a list of individual features such as `["Tense=Pres", "Tense=Past"]`. * `MorphAnalysis`'s `str()` and `repr()` are the UD FEATS string. * `Morphology.feats_to_dict()` converts a UD FEATS string to a dict where: * Each field has one entry in the dict * Multiple values remain separated by a separator in the value string * `Token.morph_` returns the UD FEATS string and you can set `Token.morph_` with a UD FEATS string or with a tag map dict. * Modify get_by_field to use np.ndarray Modify `get_by_field()` to use np.ndarray. Remove `max_results` from `get_n_by_field()` and always iterate over all the fields. * Rewrite without MorphFeatureC * Add shortcut for existing feats strings as keys Add shortcut for existing feats strings as keys in `Morphology.add()`. 
* Check for '_' as empty analysis when adding morphs * Extend helper converters in Morphology Add and extend helper converters that convert and normalize between: * UD FEATS strings (`"Case=dat,gen|Number=sing"`) * per-field dict of feats (`{"Case": "dat,gen", "Number": "sing"}`) * list of individual features (`["Case=dat", "Case=gen", "Number=sing"]`) All converters sort fields and values where applicable. --- spacy/errors.py | 3 + spacy/lang/bn/tag_map.py | 4 +- spacy/lang/ca/tag_map.py | 25 - spacy/lang/da/__init__.py | 2 - spacy/lang/el/tag_map.py | 64 +- spacy/lang/fr/lemmatizer.py | 9 +- spacy/lang/ga/__init__.py | 2 + spacy/lang/ga/tag_map.py | 400 +++---- spacy/lang/nb/morph_rules.py | 18 +- spacy/lang/sv/morph_rules.py | 10 +- spacy/lang/uk/tag_map.py | 25 - spacy/lang/ur/tag_map.py | 6 +- spacy/lang/vi/tag_map.py | 25 - spacy/morphology.pxd | 15 +- spacy/morphology.pyx | 1056 +++-------------- spacy/structs.pxd | 49 +- spacy/symbols.pxd | 552 ++++----- spacy/symbols.pyx | 552 ++++----- spacy/tests/doc/test_morphanalysis.py | 50 +- .../tests/morphology/test_morph_converters.py | 26 + spacy/tests/morphology/test_morph_features.py | 28 +- spacy/tests/regression/test_issue1-1000.py | 4 +- spacy/tests/regression/test_issue1001-1500.py | 4 +- spacy/tokens/__init__.py | 3 +- spacy/tokens/morphanalysis.pxd | 2 +- spacy/tokens/morphanalysis.pyx | 392 +----- spacy/tokens/token.pyx | 8 + 27 files changed, 1080 insertions(+), 2254 deletions(-) delete mode 100644 spacy/lang/ca/tag_map.py delete mode 100644 spacy/lang/uk/tag_map.py delete mode 100644 spacy/lang/vi/tag_map.py create mode 100644 spacy/tests/morphology/test_morph_converters.py diff --git a/spacy/errors.py b/spacy/errors.py index 7393ddc07..e00df2c51 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -103,6 +103,9 @@ class Warnings(object): W027 = ("Found a large training file of {size} bytes. 
Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") + W028 = ("Skipping unsupported morphological feature(s): {feature}. " + "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " + "string \"Field1=Value1,Value2|Field2=Value3\".") @add_codes diff --git a/spacy/lang/bn/tag_map.py b/spacy/lang/bn/tag_map.py index 36d69ccf9..bc4c5ef6b 100644 --- a/spacy/lang/bn/tag_map.py +++ b/spacy/lang/bn/tag_map.py @@ -11,8 +11,8 @@ TAG_MAP = { '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, ":": {POS: PUNCT}, - "৳": {POS: SYM, "Other": {"SymType": "currency"}}, - "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "৳": {POS: SYM, "SymType": "currency"}, + "#": {POS: SYM, "SymType": "numbersign"}, "AFX": {POS: ADJ, "Hyph": "yes"}, "CC": {POS: CONJ, "ConjType": "coor"}, "CD": {POS: NUM, "NumType": "card"}, diff --git a/spacy/lang/ca/tag_map.py b/spacy/lang/ca/tag_map.py deleted file mode 100644 index 1ecbddc49..000000000 --- a/spacy/lang/ca/tag_map.py +++ /dev/null @@ -1,25 +0,0 @@ -from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ -from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ - - -TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "CCONJ": {POS: CCONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB}, - "PART": {POS: PART}, - "SP": {POS: SPACE}, -} diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 2828c014b..6d1e33986 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -4,7 +4,6 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words 
import STOP_WORDS from .lex_attrs import LEX_ATTRS from .morph_rules import MORPH_RULES -from ..tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -24,7 +23,6 @@ class DanishDefaults(Language.Defaults): morph_rules = MORPH_RULES infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES - tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/lang/el/tag_map.py b/spacy/lang/el/tag_map.py index adfacd025..f37f84c57 100644 --- a/spacy/lang/el/tag_map.py +++ b/spacy/lang/el/tag_map.py @@ -656,7 +656,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Plur", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFePlGe": { POS: DET, @@ -664,7 +664,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Plur", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFePlNm": { POS: DET, @@ -672,7 +672,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Plur", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFeSgAc": { POS: DET, @@ -680,7 +680,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFeSgDa": { POS: DET, @@ -688,7 +688,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Dat", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFeSgGe": { POS: DET, @@ -696,7 +696,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfFeSgNm": { POS: DET, @@ -704,7 +704,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaPlAc": { POS: DET, @@ -712,7 +712,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Plur", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaPlGe": { POS: DET, @@ -720,7 +720,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Plur", "Case": "Gen", - "Other": {"Definite": "Def"}, + 
"Definite": "Def", }, "AtDfMaPlNm": { POS: DET, @@ -728,7 +728,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Plur", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaSgAc": { POS: DET, @@ -736,7 +736,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaSgDa": { POS: DET, @@ -744,7 +744,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Dat", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaSgGe": { POS: DET, @@ -752,7 +752,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfMaSgNm": { POS: DET, @@ -760,7 +760,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNePlAc": { POS: DET, @@ -768,7 +768,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Plur", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNePlDa": { POS: DET, @@ -776,7 +776,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Plur", "Case": "Dat", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNePlGe": { POS: DET, @@ -784,7 +784,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Plur", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNePlNm": { POS: DET, @@ -792,7 +792,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Plur", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNeSgAc": { POS: DET, @@ -800,7 +800,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNeSgDa": { POS: DET, @@ -808,7 +808,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Dat", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtDfNeSgGe": { POS: DET, @@ -816,7 +816,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Def"}, + "Definite": "Def", 
}, "AtDfNeSgNm": { POS: DET, @@ -824,7 +824,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Def"}, + "Definite": "Def", }, "AtIdFeSgAc": { POS: DET, @@ -832,7 +832,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdFeSgDa": { POS: DET, @@ -840,7 +840,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Dat", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdFeSgGe": { POS: DET, @@ -848,7 +848,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdFeSgNm": { POS: DET, @@ -856,7 +856,7 @@ TAG_MAP = { "Gender": "Fem", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdMaSgAc": { POS: DET, @@ -864,7 +864,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdMaSgGe": { POS: DET, @@ -872,7 +872,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdMaSgNm": { POS: DET, @@ -880,7 +880,7 @@ TAG_MAP = { "Gender": "Masc", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdNeSgAc": { POS: DET, @@ -888,7 +888,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Acc", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdNeSgGe": { POS: DET, @@ -896,7 +896,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Gen", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "AtIdNeSgNm": { POS: DET, @@ -904,7 +904,7 @@ TAG_MAP = { "Gender": "Neut", "Number": "Sing", "Case": "Nom", - "Other": {"Definite": "Ind"}, + "Definite": "Ind", }, "CjCo": {POS: CCONJ}, "CjSb": {POS: SCONJ}, diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index 84e55d509..fe128df1f 100644 --- a/spacy/lang/fr/lemmatizer.py 
+++ b/spacy/lang/fr/lemmatizer.py @@ -1,7 +1,6 @@ from ...lemmatizer import Lemmatizer from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP from ...symbols import SCONJ, CCONJ -from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos class FrenchLemmatizer(Lemmatizer): @@ -82,13 +81,13 @@ class FrenchLemmatizer(Lemmatizer): return True elif univ_pos == "adj" and morphology.get("Degree") == "pos": return True - elif VerbForm_inf in morphology: + elif "VerbForm=inf" in morphology: return True - elif VerbForm_none in morphology: + elif "VerbForm=none" in morphology: return True - elif Number_sing in morphology: + elif "Number=sing" in morphology: return True - elif Degree_pos in morphology: + elif "Degree=pos" in morphology: return True else: return False diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index cea7c0e94..4c3d219c7 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -1,5 +1,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS +from .tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language @@ -13,6 +14,7 @@ class IrishDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) + tag_map = TAG_MAP class Irish(Language): diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py index baf64c1b8..efcaf5d1f 100644 --- a/spacy/lang/ga/tag_map.py +++ b/spacy/lang/ga/tag_map.py @@ -1,26 +1,26 @@ # fmt: off TAG_MAP = { - "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "len"}, "ADJ__Case=Gen|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "fem", "Number": "sing"}, 
"ADJ__Case=Gen|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing"}, - "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "strong"}}, - "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "weak"}}, - "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "NounType": "strong"}, + "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "NounType": "weak"}, + "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "len"}, + "ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "len"}, "ADJ__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, "ADJ__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, "ADJ__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, "ADJ__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, - "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "notslender"}}, - "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "slender"}}, - "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Other": 
{"Form": "len"}}, + "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "NounType": "notslender"}, + "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "NounType": "slender"}, + "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Form": "len"}, "ADJ__Degree=Cmp,Sup": {"pos": "ADJ", "Degree": "cmp|sup"}, - "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "ecl"}}, - "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "hpref"}}, - "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "len"}}, + "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Form": "ecl"}, + "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Form": "hpref"}, + "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Form": "len"}, "ADJ__Degree=Pos": {"pos": "ADJ", "Degree": "pos"}, "ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"}, - "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}}, + "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Form": "len"}, "ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, "ADJ__Gender=Masc|Number=Sing|Case=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"}, "ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"}, @@ -29,9 +29,9 @@ TAG_MAP = { "ADJ___": {"pos": "ADJ"}, "ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"}, "ADP__Foreign=Yes": {"pos": "ADP", "Foreign": "yes"}, - "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Other": {"Form": "len"}}, - "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, - "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, 
"Other": {"Form": "len"}}, + "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Form": "len"}, + "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Form": "len"}, + "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, "Form": "len"}, "ADP__Gender=Fem|Number=Sing|Person=3": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3}, "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"}, @@ -57,41 +57,41 @@ TAG_MAP = { "ADP__Person=3|Poss=Yes": {"pos": "ADP", "Person": 3, "Poss": "yes"}, "ADP___": {"pos": "ADP"}, "ADP__Poss=Yes": {"pos": "ADP", "Poss": "yes"}, - "ADP__PrepForm=Cmpd": {"pos": "ADP", "Other": {"PrepForm": "cmpd"}}, + "ADP__PrepForm=Cmpd": {"pos": "ADP", "PrepForm": "cmpd"}, "ADP__PronType=Art": {"pos": "ADP", "PronType": "art"}, - "ADV__Form=Len": {"pos": "ADV", "Other": {"Form": "len"}}, + "ADV__Form=Len": {"pos": "ADV", "Form": "len"}, "ADV___": {"pos": "ADV"}, "ADV__PronType=Int": {"pos": "ADV", "PronType": "int"}, - "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, - "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, - "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, - "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}}, - "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Form": "vf", "VerbForm": 
"cop"}}, - "AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"VerbForm": "cop"}}, - "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Other": {"Mood": "int", "VerbForm": "cop"}}, - "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Mood": "int", "VerbForm": "cop"}}, - "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, - "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}}, - "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"PartType": "comp", "VerbForm": "cop"}}, - "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, - "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"VerbForm": "cop"}}, + "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Form": "vf", "VerbForm": "cop"}, + "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Form": "vf", "VerbForm": "cop"}, + "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Form": "vf", "VerbForm": "cop"}, + "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Form": "vf", 
"VerbForm": "cop"}, + "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Form": "vf", "VerbForm": "cop"}, + "AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "VerbForm": "cop"}, + "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Mood": "int", "VerbForm": "cop"}, + "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Mood": "int", "VerbForm": "cop"}, + "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Mood": "int", "VerbForm": "cop"}, + "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Mood": "int", "VerbForm": "cop"}, + "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "PartType": "comp", "VerbForm": "cop"}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "VerbForm": "cop"}, + "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "VerbForm": "cop"}, + "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "VerbForm": "cop"}, + "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "VerbForm": "cop"}, "AUX___": {"pos": "AUX"}, - "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "Other": {"VerbForm": "cop"}}, - "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}}, - "AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", 
"Tense": "pres", "Other": {"VerbForm": "cop"}}, - "AUX__VerbForm=Cop": {"pos": "AUX", "Other": {"VerbForm": "cop"}}, + "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "VerbForm": "cop"}, + "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "VerbForm": "cop"}, + "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "VerbForm": "cop"}, + "AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "VerbForm": "cop"}, + "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "VerbForm": "cop"}, + "AUX__VerbForm=Cop": {"pos": "AUX", "VerbForm": "cop"}, "CCONJ___": {"pos": "CCONJ"}, "DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, - "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Other": {"Form": "ecl"}}, + "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Form": "ecl"}, "DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"}, "DET__Definite=Def|Number=Plur|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "plur", "PronType": "art"}, "DET__Definite=Def|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "sing", "PronType": "art"}, "DET__Definite=Def": {"pos": "DET", "Definite": "def"}, - "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Other": {"Form": "hpref"}}, + "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Form": "hpref"}, "DET__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"}, "DET__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"}, "DET__Number=Plur|Person=1|Poss=Yes": {"pos": "DET", "Number": "plur", 
"Person": 1, "Poss": "yes"}, @@ -103,33 +103,33 @@ TAG_MAP = { "DET__PronType=Dem": {"pos": "DET", "PronType": "dem"}, "DET__PronType=Ind": {"pos": "DET", "PronType": "ind"}, "NOUN__Case=Dat|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Definite": "ind", "Gender": "fem", "Number": "sing"}, - "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Form": "len"}, "NOUN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing"}, "NOUN__Case=Dat|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "masc", "Number": "sing"}, - "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, + "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "NounType": "strong"}, "NOUN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, - "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, - "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + 
"NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "NounType": "strong"}, + "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "NounType": "weak"}, "NOUN__Case=Gen|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "sing"}, "NOUN__Case=Gen|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "ind", "Gender": "fem", "Number": "sing"}, - "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, - "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}}, - "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "weak"}}, - "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, - "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "strong"}}, - 
"NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "weak"}}, - "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, - "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Other": {"Form": "len"}}, - "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}}, - "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Form": "ecl", "NounType": "strong"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "ecl", "NounType": "strong"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "ecl", "NounType": "weak"}, + "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "hpref"}, + "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "len"}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "len", "NounType": "strong"}, + 
"NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "len", "NounType": "weak"}, + "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "len"}, + "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Form": "len"}, + "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "NounType": "strong"}, + "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "NounType": "weak"}, "NOUN__Case=Gen|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur"}, "NOUN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing"}, - "NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}}, - "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "NounType": "strong"}, + "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "NounType": "weak"}, "NOUN__Case=Gen|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur"}, "NOUN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing"}, "NOUN__Case=Gen|Number=Sing": {"pos": "NOUN", "Case": "gen", "Number": "sing"}, @@ -140,79 +140,79 @@ TAG_MAP = { "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", 
"Number": "plur"}, "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, "NOUN__Case=NomAcc|Definite=Ind|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "ind", "Gender": "masc", "Number": "plur"}, - "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, - "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}}, - "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "emp"}}, - "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "hpref"}}, - "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, - "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "hpref"}}, - "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, - "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "len"}}, - "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": 
{"Form": "len"}}, - "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, - "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Form": "ecl"}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Form": "ecl"}, + "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "ecl"}, + "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "emp"}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Form": "hpref"}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "hpref"}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Form": "hpref"}, + "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "hpref"}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Form": "len"}, + "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "len"}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", 
"Gender": "masc", "Number": "plur", "Form": "len"}, + "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "len"}, "NOUN__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur"}, "NOUN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, "NOUN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, "NOUN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, "NOUN__Case=Voc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Definite": "def", "Gender": "masc", "Number": "plur"}, - "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}}, - "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Form": "len"}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Form": "len"}, + "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Form": "len"}, "NOUN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing"}, "NOUN__Degree=Pos": {"pos": "NOUN", "Degree": "pos"}, "NOUN__Foreign=Yes": {"pos": "NOUN", "Foreign": "yes"}, - "NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "ecl"}}, - "NOUN__Form=Ecl|VerbForm=Inf": {"pos": 
"NOUN", "VerbForm": "inf", "Other": {"Form": "ecl"}}, - "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Other": {"Form": "ecl"}}, - "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "hpref"}}, - "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "len"}}, - "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "len"}}, + "NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Form": "ecl"}, + "NOUN__Form=Ecl|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Form": "ecl"}, + "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Form": "ecl"}, + "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Form": "hpref"}, + "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Form": "len"}, + "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Form": "len"}, "NOUN__Gender=Fem|Number=Sing": {"pos": "NOUN", "Gender": "fem", "Number": "sing"}, - "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "Other": {"PartType": "comp"}}, + "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "PartType": "comp"}, "NOUN__Number=Sing": {"pos": "NOUN", "Number": "sing"}, "NOUN___": {"pos": "NOUN"}, "NOUN__Reflex=Yes": {"pos": "NOUN", "Reflex": "yes"}, "NOUN__VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf"}, "NOUN__VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun"}, "NUM__Definite=Def|NumType=Card": {"pos": "NUM", "Definite": "def", "NumType": "card"}, - "NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "ecl"}}, - "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "ecl"}}, - "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "hpref"}}, - "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "len"}}, - "NUM__Form=Len|NumType=Ord": {"pos": "NUM", 
"NumType": "ord", "Other": {"Form": "len"}}, + "NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Form": "ecl"}, + "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Form": "ecl"}, + "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Form": "hpref"}, + "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Form": "len"}, + "NUM__Form=Len|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Form": "len"}, "NUM__NumType=Card": {"pos": "NUM", "NumType": "card"}, "NUM__NumType=Ord": {"pos": "NUM", "NumType": "ord"}, "NUM___": {"pos": "NUM"}, - "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"Form": "ecl", "PartType": "vb"}}, - "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "Other": {"PartType": "vb"}}, - "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "Other": {"PartType": "vb"}}, - "PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"Mood": "int", "PartType": "vb"}}, - "PART__PartType=Ad": {"pos": "PART", "Other": {"PartType": "ad"}}, - "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "cmpl"}}, - "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "cmpl"}}, - "PART__PartType=Cmpl": {"pos": "PART", "Other": {"PartType": "cmpl"}}, - "PART__PartType=Comp": {"pos": "PART", "Other": {"PartType": "comp"}}, - "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "cop"}}, - "PART__PartType=Deg": {"pos": "PART", "Other": {"PartType": "deg"}}, + "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Form": "ecl", "PartType": "vb"}, + "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "PartType": "vb"}, + "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "PartType": "vb"}, + 
"PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Mood": "int", "PartType": "vb"}, + "PART__PartType=Ad": {"pos": "PART", "PartType": "ad"}, + "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "PartType": "cmpl"}, + "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "PartType": "cmpl"}, + "PART__PartType=Cmpl": {"pos": "PART", "PartType": "cmpl"}, + "PART__PartType=Comp": {"pos": "PART", "PartType": "comp"}, + "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "PartType": "cop"}, + "PART__PartType=Deg": {"pos": "PART", "PartType": "deg"}, "PART__PartType=Inf": {"pos": "PART", "PartType": "inf"}, - "PART__PartType=Num": {"pos": "PART", "Other": {"PartType": "num"}}, - "PART__PartType=Pat": {"pos": "PART", "Other": {"PartType": "pat"}}, - "PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb": {"pos": "PART", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}}, - "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "Other": {"PartType": "vb"}}, - "PART__PartType=Voc": {"pos": "PART", "Other": {"PartType": "voc"}}, + "PART__PartType=Num": {"pos": "PART", "PartType": "num"}, + "PART__PartType=Pat": {"pos": "PART", "PartType": "pat"}, + 
"PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "PartType": "vb"}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "PartType": "vb"}, + "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "PartType": "vb"}, + "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "PartType": "vb"}, + "PART__PartType=Vb": {"pos": "PART", "PartType": "vb"}, + "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "PartType": "vb"}, + "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "PartType": "vb"}, + "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "PartType": "vb"}, + "PART__PartType=Voc": {"pos": "PART", "PartType": "voc"}, "PART___": {"pos": "PART"}, "PART__PronType=Rel": {"pos": "PART", "PronType": "rel"}, - "PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, - "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Other": {"Form": "len"}}, + "PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Form": "len"}, + "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Form": "len"}, "PRON__Gender=Fem|Number=Sing|Person=3": {"pos": "PRON", "Gender": "fem", "Number": "sing", "Person": 3}, "PRON__Gender=Masc|Number=Sing|Person=3": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3}, "PRON__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"}, @@ -232,103 +232,103 @@ TAG_MAP = { "PRON__PronType=Ind": {"pos": "PRON", "PronType": "ind"}, "PRON__PronType=Int": {"pos": "PRON", "PronType": "int"}, "PRON__Reflex=Yes": {"pos": "PRON", "Reflex": "yes"}, - "PROPN__Abbr=Yes": {"pos": "PROPN", "Other": {"Abbr": "yes"}}, 
+ "PROPN__Abbr=Yes": {"pos": "PROPN", "Abbr": "yes"}, "PROPN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "dat", "Gender": "fem", "Number": "sing"}, "PROPN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"}, - "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}}, - "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}}, - "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}}, - "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Other": {"Form": "len"}}, - "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, - "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Other": {"Form": "len"}}, + "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Form": "ecl"}, + "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "ecl"}, + "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "hpref"}, + "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "len"}, + "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Form": "len"}, + "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": 
{"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "len"}, + "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Form": "len"}, "PROPN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing"}, "PROPN__Case=Gen|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem"}, - "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}}, + "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "NounType": "weak"}, "PROPN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing"}, "PROPN__Case=Gen|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc"}, "PROPN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"}, "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"}, "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"}, - "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}}, - "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}}, - "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}}, - "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}}, - 
"PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "ecl"}, + "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "ecl"}, + "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "hpref"}, + "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "len"}, + "PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "len"}, "PROPN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"}, "PROPN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"}, "PROPN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"}, "PROPN__Case=NomAcc|Gender=Masc": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc"}, - "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Other": {"Form": "len"}}, + "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Form": "len"}, "PROPN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "voc", "Gender": "masc", "Number": "sing"}, "PROPN__Gender=Masc|Number=Sing": {"pos": "PROPN", "Gender": "masc", "Number": "sing"}, "PROPN___": {"pos": "PROPN"}, "PUNCT___": {"pos": "PUNCT"}, "SCONJ___": {"pos": "SCONJ"}, - "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", "Tense": "past", "Other": {"VerbForm": "cop"}}, - "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "Other": 
{"VerbForm": "cop"}}, - "SYM__Abbr=Yes": {"pos": "SYM", "Other": {"Abbr": "yes"}}, + "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", "Tense": "past", "VerbForm": "cop"}, + "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "VerbForm": "cop"}, + "SYM__Abbr=Yes": {"pos": "SYM", "Abbr": "yes"}, "VERB__Case=NomAcc|Gender=Masc|Mood=Ind|Number=Sing|Tense=Pres": {"pos": "VERB", "Case": "nom|acc", "Gender": "masc", "Mood": "ind", "Number": "sing", "Tense": "pres"}, - "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, + "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Dialect": "munster", "Form": "len"}, "VERB__Foreign=Yes": {"pos": "VERB", "Foreign": "yes"}, - "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl", "Voice": "auto"}}, - "VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", 
"Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, - "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl", "Voice": "auto"}}, - "VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Other": {"Form": "ecl"}}, - "VERB__Form=Ecl": {"pos": "VERB", "Other": {"Form": "ecl"}}, - "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}}, - "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Other": {"Form": "emp"}}, - "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": 
"emp"}}, - "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, - 
"VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len"}}, - "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len"}}, - 
"VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}}, - "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}}, - "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Other": {"Form": "len"}}, - "VERB__Form=Len": {"pos": "VERB", "Other": {"Form": "len"}}, + "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Form": "ecl", "Voice": "auto"}, + "VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "ecl", "Voice": "auto"}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": 
"past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "ecl", "Voice": "auto"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "ecl", "Voice": "auto"}, + "VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Form": "ecl"}, + "VERB__Form=Ecl": {"pos": "VERB", "Form": "ecl"}, + "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Form": "emp"}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Form": "emp"}, + "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Form": "emp"}, + "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Form": "len"}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Form": "len"}, + "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Form": "len"}, + "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Form": "len"}, + "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Form": "len"}, + "VERB__Form=Len|Mood=Cnd|Voice=Auto": 
{"pos": "VERB", "Mood": "cnd", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Form": "len"}, + 
"VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "len"}, + "VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "len", "Voice": "auto"}, + "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Form": "len"}, + "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Form": "len"}, + "VERB__Form=Len": {"pos": "VERB", "Form": "len"}, "VERB__Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3}, "VERB__Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1}, "VERB__Mood=Cnd": {"pos": "VERB", "Mood": "cnd"}, - "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Voice": "auto"}}, + "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Voice": "auto"}, "VERB__Mood=Imp|Number=Plur|Person=1|Polarity=Neg": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1, "Polarity": "neg"}, 
"VERB__Mood=Imp|Number=Plur|Person=1": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1}, "VERB__Mood=Imp|Number=Plur|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 2}, @@ -338,28 +338,28 @@ TAG_MAP = { "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres"}, "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past"}, "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres"}, - "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Voice": "auto"}, "VERB__Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres"}, "VERB__Mood=Ind|PronType=Rel|Tense=Fut": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "fut"}, "VERB__Mood=Ind|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "pres"}, "VERB__Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut"}, - "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Voice": "auto"}, "VERB__Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past"}, - "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Voice": "auto"}, "VERB__Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres"}, - "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": 
"ind", "Tense": "pres", "Other": {"Voice": "auto"}}, + "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Voice": "auto"}, "VERB___": {"pos": "VERB"}, - "X__Abbr=Yes": {"pos": "X", "Other": {"Abbr": "yes"}}, + "X__Abbr=Yes": {"pos": "X", "Abbr": "yes"}, "X__Case=NomAcc|Foreign=Yes|Gender=Fem|Number=Sing": {"pos": "X", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Foreign": "yes"}, - "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Other": {"Dialect": "ulster"}}, - "X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}}, - "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", "Number": "sing", "Person": 2, "Polarity": "neg", "Other": {"Dialect": "munster"}}, - "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Voice": "auto"}}, - "X__Dialect=Munster": {"pos": "X", "Other": {"Dialect": "munster"}}, - "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Other": {"Dialect": "munster"}}, - "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"Dialect": "ulster"}}, - "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Other": {"Dialect": "ulster", "PartType": "vb"}}, - "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}}, + "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Dialect": "ulster"}, + "X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Dialect": "munster", "Form": "len"}, + "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", 
"Number": "sing", "Person": 2, "Polarity": "neg", "Dialect": "munster"}, + "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Dialect": "munster", "Voice": "auto"}, + "X__Dialect=Munster": {"pos": "X", "Dialect": "munster"}, + "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Dialect": "munster"}, + "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Dialect": "ulster"}, + "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Dialect": "ulster", "PartType": "vb"}, + "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Dialect": "ulster", "VerbForm": "cop"}, "X__Foreign=Yes": {"pos": "X", "Foreign": "yes"}, "X___": {"pos": "X"} } diff --git a/spacy/lang/nb/morph_rules.py b/spacy/lang/nb/morph_rules.py index b1799fca8..e96b9fd6b 100644 --- a/spacy/lang/nb/morph_rules.py +++ b/spacy/lang/nb/morph_rules.py @@ -195,7 +195,7 @@ MORPH_RULES = { "seg": { LEMMA: PRON_LEMMA, "Person": "Three", - "Number": ("Sing", "Plur"), + "Number": "Sing,Plur", "Reflex": "Yes", } }, @@ -248,7 +248,7 @@ MORPH_RULES = { }, "deres": { LEMMA: "deres", - "Person": ("Two", "Three"), + "Person": "Two,Three", "Number": "Sing", "Poss": "Yes", "Gender": "Masc", @@ -309,7 +309,7 @@ MORPH_RULES = { }, "deres": { LEMMA: "deres", - "Person": ("Two", "Three"), + "Person": "Two,Three", "Number": "Sing", "Poss": "Yes", "Gender": "Fem", @@ -370,7 +370,7 @@ MORPH_RULES = { }, "deres": { LEMMA: "deres", - "Person": ("Two", "Three"), + "Person": "Two,Three", "Number": "Sing", "Poss": "Yes", "Gender": "Neut", @@ -400,7 +400,7 @@ MORPH_RULES = { "våre": {LEMMA: "vår", "Person": "One", "Number": "Plur", "Poss": "Yes"}, "deres": { LEMMA: "deres", - "Person": ("Two", "Three"), + "Person": "Two,Three", "Number": "Plur", "Poss": "Yes", }, @@ -448,21 +448,21 @@ MORPH_RULES = { "PronType": "Prs", "Number": "Sing", "Person": "Three", - "Gender": ("Fem", "Masc"), + 
"Gender": "Fem,Masc", }, "den": { LEMMA: PRON_LEMMA, "PronType": "Prs", "Number": "Sing", "Person": "Three", - "Gender": ("Fem", "Masc"), + "Gender": "Fem,Masc", }, "ingen": { LEMMA: PRON_LEMMA, "PronType": "Prs", "Number": "Sing", "Person": "Three", - "Gender": ("Fem", "Masc"), + "Gender": "Fem,Masc", "Polarity": "Neg", }, }, @@ -475,7 +475,7 @@ MORPH_RULES = { LEMMA: PRON_LEMMA, "PronType": "Prs", "Number": "Sing", - "Case": ("Gen", "Nom"), + "Case": "Gen,Nom", } }, "PRON__Animacy=Anim|Case=Gen|Number=Sing|PronType=Prs": { diff --git a/spacy/lang/sv/morph_rules.py b/spacy/lang/sv/morph_rules.py index 8fca20a49..3ef6aedc5 100644 --- a/spacy/lang/sv/morph_rules.py +++ b/spacy/lang/sv/morph_rules.py @@ -105,7 +105,7 @@ MORPH_RULES = { "PronType": "Prs", "Person": "Three", "Number": "Plur", - "Case": ("Nom", "Acc"), + "Case": "Nom,Acc", }, "dem": { LEMMA: PRON_LEMMA, @@ -166,7 +166,7 @@ MORPH_RULES = { LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", - "Number": ("Sing", "Plur"), + "Number": "Sing,Plur", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes", @@ -175,7 +175,7 @@ MORPH_RULES = { LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", - "Number": ("Sing", "Plur"), + "Number": "Sing,Plur", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes", @@ -184,7 +184,7 @@ MORPH_RULES = { LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", - "Number": ("Sing", "Plur"), + "Number": "Sing,Plur", "Poss": "Yes", "Reflex": "Yes", }, @@ -272,7 +272,7 @@ MORPH_RULES = { "VBZ": { "är": { "VerbForm": "Fin", - "Person": ("One", "Two", "Three"), + "Person": "One,Two,Three", "Tense": "Pres", "Mood": "Ind", } diff --git a/spacy/lang/uk/tag_map.py b/spacy/lang/uk/tag_map.py deleted file mode 100644 index 1ecbddc49..000000000 --- a/spacy/lang/uk/tag_map.py +++ /dev/null @@ -1,25 +0,0 @@ -from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ -from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ - - -TAG_MAP = { - "ADV": {POS: ADV}, - 
"NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "CCONJ": {POS: CCONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB}, - "PART": {POS: PART}, - "SP": {POS: SPACE}, -} diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py index e0940edb7..d990fd46a 100644 --- a/spacy/lang/ur/tag_map.py +++ b/spacy/lang/ur/tag_map.py @@ -10,8 +10,8 @@ TAG_MAP = { '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, ":": {POS: PUNCT}, - "$": {POS: SYM, "Other": {"SymType": "currency"}}, - "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "$": {POS: SYM, "SymType": "currency"}, + "#": {POS: SYM, "SymType": "numbersign"}, "AFX": {POS: ADJ, "Hyph": "yes"}, "CC": {POS: CCONJ, "ConjType": "coor"}, "CD": {POS: NUM, "NumType": "card"}, @@ -52,7 +52,7 @@ TAG_MAP = { "VerbForm": "fin", "Tense": "pres", "Number": "sing", - "Person": 3, + "Person": "3", }, "WDT": {POS: ADJ, "PronType": "int|rel"}, "WP": {POS: NOUN, "PronType": "int|rel"}, diff --git a/spacy/lang/vi/tag_map.py b/spacy/lang/vi/tag_map.py deleted file mode 100644 index 1ecbddc49..000000000 --- a/spacy/lang/vi/tag_map.py +++ /dev/null @@ -1,25 +0,0 @@ -from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ -from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ - - -TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "CCONJ": {POS: CCONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB}, - "PART": {POS: PART}, 
- "SP": {POS: SPACE}, -} diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 1a3cedf97..1e8c255b8 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -2,6 +2,7 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap, PreshMapArray from libc.stdint cimport uint64_t from murmurhash cimport mrmr +cimport numpy as np from .structs cimport TokenC, MorphAnalysisC from .strings cimport StringStore @@ -20,12 +21,11 @@ cdef class Morphology: cdef readonly object tag_names cdef readonly object reverse_index cdef readonly object exc - cdef readonly object _feat_map cdef readonly PreshMapArray _cache cdef readonly int n_tags - cpdef update(self, hash_t morph, features) - cdef hash_t insert(self, MorphAnalysisC tag) except 0 + cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except * + cdef int insert(self, MorphAnalysisC tag) except -1 cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1 @@ -34,8 +34,7 @@ cdef class Morphology: cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 -cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil -cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil -cdef list list_features(const MorphAnalysisC* tag) - -cdef tag_to_json(const MorphAnalysisC* tag) +cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil +cdef list list_features(const MorphAnalysisC* morph) +cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field) +cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 8030a9a28..3003d118f 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -2,6 +2,7 @@ from libc.string cimport memset import srsly from collections import Counter +import numpy from .strings import get_string_id from . 
import symbols @@ -10,130 +11,38 @@ from .attrs import LEMMA, intify_attrs from .parts_of_speech cimport SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme -from .errors import Errors +from .errors import Errors, Warnings, user_warning from .util import ensure_path -cdef enum univ_field_t: - Field_POS - Field_Abbr - Field_AdpType - Field_AdvType - Field_Animacy - Field_Aspect - Field_Case - Field_ConjType - Field_Connegative - Field_Definite - Field_Degree - Field_Derivation - Field_Echo - Field_Foreign - Field_Gender - Field_Hyph - Field_InfForm - Field_Mood - Field_NameType - Field_Negative - Field_NounType - Field_Number - Field_NumForm - Field_NumType - Field_NumValue - Field_PartForm - Field_PartType - Field_Person - Field_Polarity - Field_Polite - Field_Poss - Field_Prefix - Field_PrepCase - Field_PronType - Field_PunctSide - Field_PunctType - Field_Reflex - Field_Style - Field_StyleVariant - Field_Tense - Field_Typo - Field_VerbForm - Field_VerbType - Field_Voice - - def _normalize_props(props): - """Transform deprecated string keys to correct names.""" + """Convert attrs dict so that POS is always by ID, other features are left + as is as long as they are strings or IDs. 
+ """ out = {} props = dict(props) - for key in FIELDS: - if key in props: - value = str(props[key]).lower() - # We don't have support for disjunctive int|rel features, so - # just take the first one :( - if "|" in value: - value = value.split("|")[0] - attr = f"{key}_{value}" - if attr in FEATURES: - props.pop(key) - props[attr] = True for key, value in props.items(): + # convert POS value to ID if key == POS: if hasattr(value, 'upper'): value = value.upper() if value in POS_IDS: value = POS_IDS[value] out[key] = value - elif isinstance(key, int): - out[key] = value - elif value is True: - out[key] = value - elif key.lower() == 'pos': + elif isinstance(key, str) and key.lower() == 'pos': out[POS] = POS_IDS[value.upper()] - elif key.lower() != 'morph': + # sort values + elif isinstance(value, str) and Morphology.VALUE_SEP in value: + out[key] = Morphology.VALUE_SEP.join( + sorted(value.split(Morphology.VALUE_SEP))) + # accept any string or ID fields and values + elif isinstance(key, (int, str)) and isinstance(value, (int, str)): out[key] = value + else: + user_warning(Warnings.W028.format(feature={key: value})) return out -class MorphologyClassMap(object): - def __init__(self, features): - self.features = tuple(features) - self.fields = [] - self.feat2field = {} - seen_fields = set() - for feature in features: - field = feature.split("_", 1)[0] - if field not in seen_fields: - self.fields.append(field) - seen_fields.add(field) - self.feat2field[feature] = FIELDS[field] - self.id2feat = {get_string_id(name): name for name in features} - self.field2feats = {"POS": []} - self.col2info = [] - self.attr2field = dict(LOWER_FIELDS.items()) - self.feat2offset = {} - self.field2col = {} - self.field2id = dict(FIELDS.items()) - self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()} - for feature in features: - field = self.fields[self.feat2field[feature]] - if field not in self.field2col: - self.field2col[field] = len(self.col2info) - if field != "POS" 
and field not in self.field2feats: - self.col2info.append((field, 0, "NIL")) - self.field2feats.setdefault(field, ["NIL"]) - offset = len(self.field2feats[field]) - self.field2feats[field].append(feature) - self.col2info.append((field, offset, feature)) - self.feat2offset[feature] = offset - - @property - def field_sizes(self): - return [len(self.field2feats[field]) for field in self.fields] - - def get_field_offset(self, field): - return self.field2col[field] - - cdef class Morphology: '''Store the possible morphological analyses for a language, and index them by hash. @@ -142,9 +51,15 @@ cdef class Morphology: analysis, so queries of morphological attributes are delegated to this class. ''' - def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): + + FEATURE_SEP = "|" + FIELD_SEP = "=" + VALUE_SEP = "," + EMPTY_MORPH = "_" + + def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None): self.mem = Pool() - self.strings = string_store + self.strings = strings self.tags = PreshMap() # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. @@ -158,7 +73,6 @@ cdef class Morphology: self.lemmatizer = lemmatizer self.n_tags = len(tag_map) self.reverse_index = {} - self._feat_map = MorphologyClassMap(FEATURES) self._load_from_tag_map(tag_map) self._cache = PreshMapArray(self.n_tags) @@ -172,8 +86,7 @@ cdef class Morphology: def _load_from_tag_map(self, tag_map): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): attrs = _normalize_props(attrs) - self.add({self._feat_map.id2feat[feat] for feat in attrs - if feat in self._feat_map.id2feat}) + self.add(attrs) self.tag_map[tag_str] = dict(attrs) self.reverse_index[self.strings.add(tag_str)] = i @@ -182,40 +95,78 @@ cdef class Morphology: self.exc), None, None) def add(self, features): - """Insert a morphological analysis in the morphology table, if not already - present. Returns the hash of the new analysis. 
+ """Insert a morphological analysis in the morphology table, if not + already present. The morphological analysis may be provided in the UD + FEATS format as a string or in the tag map dict format. + Returns the hash of the new analysis. + """ + cdef MorphAnalysisC* tag_ptr + if features == self.EMPTY_MORPH: + features = "" + if isinstance(features, str): + tag_ptr = self.tags.get(self.strings[features]) + if tag_ptr != NULL: + return tag_ptr.key + features = self.feats_to_dict(features) + if not isinstance(features, dict): + user_warning(Warnings.W028.format(feature=features)) + features = {} + features = _normalize_props(features) + string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} + # normalized UFEATS string with sorted fields and values + norm_feats_string = self.FEATURE_SEP.join(sorted([ + self.FIELD_SEP.join([field, values]) + for field, values in string_features.items() + ])) + # intified ("Field", "Field=Value") pairs + field_feature_pairs = [] + for field in sorted(string_features): + values = string_features[field] + for value in values.split(self.VALUE_SEP): + field_feature_pairs.append(( + self.strings.add(field), + self.strings.add(field + self.FIELD_SEP + value), + )) + cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs) + # the hash key for the tag is either the hash of the normalized UFEATS + # string or the hash of an empty placeholder (using the empty string + # would give a hash key of 0, which is not good for PreshMap) + if norm_feats_string: + tag.key = self.strings.add(norm_feats_string) + else: + tag.key = self.strings.add(self.EMPTY_MORPH) + self.insert(tag) + return tag.key + + cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *: + """Creates a MorphAnalysisC from a list of intified + ("Field", "Field=Value") tuples where fields with multiple values have + been split into individual tuples, e.g.: + [("Field1", "Field1=Value1"), 
("Field1", "Field1=Value2"), + ("Field2", "Field2=Value3")] """ - for f in features: - if isinstance(f, str): - self.strings.add(f) - string_features = features - features = intify_features(features) - cdef attr_t feature - for feature in features: - if feature != 0 and feature not in self._feat_map.id2feat: - raise ValueError(Errors.E167.format(feat=self.strings[feature], feat_id=feature)) cdef MorphAnalysisC tag - tag = create_rich_tag(features) - cdef hash_t key = self.insert(tag) - return key + tag.length = len(field_feature_pairs) + tag.fields = self.mem.alloc(tag.length, sizeof(attr_t)) + tag.features = self.mem.alloc(tag.length, sizeof(attr_t)) + for i, (field, feature) in enumerate(field_feature_pairs): + tag.fields[i] = field + tag.features[i] = feature + return tag + + cdef int insert(self, MorphAnalysisC tag) except -1: + cdef hash_t key = tag.key + if self.tags.get(key) == NULL: + tag_ptr = self.mem.alloc(1, sizeof(MorphAnalysisC)) + tag_ptr[0] = tag + self.tags.set(key, tag_ptr) def get(self, hash_t morph): tag = self.tags.get(morph) if tag == NULL: return [] else: - return tag_to_json(tag) - - cpdef update(self, hash_t morph, features): - """Update a morphological analysis with new feature values.""" - tag = (self.tags.get(morph))[0] - features = intify_features(features) - cdef attr_t feature - for feature in features: - field = FEATURE_FIELDS[FEATURE_NAMES[feature]] - set_feature(&tag, field, feature, 1) - morph = self.insert(tag) - return morph + return self.strings[tag.key] def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): if orth not in self.strings: @@ -249,19 +200,10 @@ cdef class Morphology: """ attrs = dict(attrs) attrs = _normalize_props(attrs) - self.add({self._feat_map.id2feat[feat] for feat in attrs - if feat in self._feat_map.id2feat}) + self.add(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self.exc[(tag_str, self.strings.add(orth_str))] = attrs - cdef hash_t insert(self, MorphAnalysisC 
tag) except 0: - cdef hash_t key = hash_tag(tag) - if self.tags.get(key) == NULL: - tag_ptr = self.mem.alloc(1, sizeof(MorphAnalysisC)) - tag_ptr[0] = tag - self.tags.set(key, tag_ptr) - return key - cdef int assign_untagged(self, TokenC* token) except -1: """Set morphological attributes on a token without a POS tag. Uses the lemmatizer's lookup() method, which looks up the string in the @@ -322,782 +264,60 @@ cdef class Morphology: for form_str, attrs in entries.items(): self.add_special_case(tag_str, form_str, attrs) - @classmethod - def create_class_map(cls): - return MorphologyClassMap(FEATURES) + @staticmethod + def feats_to_dict(feats): + if not feats: + return {} + return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in + [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} + + @staticmethod + def dict_to_feats(feats_dict): + if len(feats_dict) == 0: + return "" + return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()])) + + @staticmethod + def list_to_feats(feats_list): + if len(feats_list) == 0: + return "" + feats_dict = {} + for feat in feats_list: + field, value = feat.split(Morphology.FIELD_SEP) + if field not in feats_dict: + feats_dict[field] = set() + feats_dict[field].add(value) + feats_dict = {field: Morphology.VALUE_SEP.join(sorted(values)) for field, values in feats_dict.items()} + return Morphology.dict_to_feats(feats_dict) -cpdef univ_pos_t get_int_tag(pos_): - return 0 - -cpdef intify_features(features): - return {get_string_id(feature) for feature in features} - -cdef hash_t hash_tag(MorphAnalysisC tag) nogil: - return mrmr.hash64(&tag, sizeof(tag), 0) +cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil: + cdef int i + for i in range(morph.length): + if morph.features[i] == feature: + return True + return 
False -cdef MorphAnalysisC create_rich_tag(features) except *: - cdef MorphAnalysisC tag - cdef attr_t feature - memset(&tag, 0, sizeof(tag)) - for feature in features: - field = FEATURE_FIELDS[FEATURE_NAMES[feature]] - set_feature(&tag, field, feature, 1) - return tag +cdef list list_features(const MorphAnalysisC* morph): + cdef int i + features = [] + for i in range(morph.length): + features.append(morph.features[i]) + return features -cdef tag_to_json(const MorphAnalysisC* tag): - return [FEATURE_NAMES[f] for f in list_features(tag)] +cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field): + cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64") + n = get_n_by_field(results.data, morph, field) + return results[:n] -cdef MorphAnalysisC tag_from_json(json_tag): - raise NotImplementedError - - -cdef list list_features(const MorphAnalysisC* tag): - output = [] - if tag.abbr != 0: - output.append(tag.abbr) - if tag.adp_type != 0: - output.append(tag.adp_type) - if tag.adv_type != 0: - output.append(tag.adv_type) - if tag.animacy != 0: - output.append(tag.animacy) - if tag.aspect != 0: - output.append(tag.aspect) - if tag.case != 0: - output.append(tag.case) - if tag.conj_type != 0: - output.append(tag.conj_type) - if tag.connegative != 0: - output.append(tag.connegative) - if tag.definite != 0: - output.append(tag.definite) - if tag.degree != 0: - output.append(tag.degree) - if tag.derivation != 0: - output.append(tag.derivation) - if tag.echo != 0: - output.append(tag.echo) - if tag.foreign != 0: - output.append(tag.foreign) - if tag.gender != 0: - output.append(tag.gender) - if tag.hyph != 0: - output.append(tag.hyph) - if tag.inf_form != 0: - output.append(tag.inf_form) - if tag.mood != 0: - output.append(tag.mood) - if tag.negative != 0: - output.append(tag.negative) - if tag.number != 0: - output.append(tag.number) - if tag.name_type != 0: - output.append(tag.name_type) - if tag.noun_type != 0: - output.append(tag.noun_type) - if 
tag.part_form != 0: - output.append(tag.part_form) - if tag.part_type != 0: - output.append(tag.part_type) - if tag.person != 0: - output.append(tag.person) - if tag.polite != 0: - output.append(tag.polite) - if tag.polarity != 0: - output.append(tag.polarity) - if tag.poss != 0: - output.append(tag.poss) - if tag.prefix != 0: - output.append(tag.prefix) - if tag.prep_case != 0: - output.append(tag.prep_case) - if tag.pron_type != 0: - output.append(tag.pron_type) - if tag.punct_type != 0: - output.append(tag.punct_type) - if tag.reflex != 0: - output.append(tag.reflex) - if tag.style != 0: - output.append(tag.style) - if tag.style_variant != 0: - output.append(tag.style_variant) - if tag.typo != 0: - output.append(tag.typo) - if tag.verb_form != 0: - output.append(tag.verb_form) - if tag.voice != 0: - output.append(tag.voice) - if tag.verb_type != 0: - output.append(tag.verb_type) - return output - - -cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil: - field = field_id - if field == Field_POS: - return tag.pos - if field == Field_Abbr: - return tag.abbr - elif field == Field_AdpType: - return tag.adp_type - elif field == Field_AdvType: - return tag.adv_type - elif field == Field_Animacy: - return tag.animacy - elif field == Field_Aspect: - return tag.aspect - elif field == Field_Case: - return tag.case - elif field == Field_ConjType: - return tag.conj_type - elif field == Field_Connegative: - return tag.connegative - elif field == Field_Definite: - return tag.definite - elif field == Field_Degree: - return tag.degree - elif field == Field_Derivation: - return tag.derivation - elif field == Field_Echo: - return tag.echo - elif field == Field_Foreign: - return tag.foreign - elif field == Field_Gender: - return tag.gender - elif field == Field_Hyph: - return tag.hyph - elif field == Field_InfForm: - return tag.inf_form - elif field == Field_Mood: - return tag.mood - elif field == Field_Negative: - return tag.negative - elif field == Field_Number: 
- return tag.number - elif field == Field_NameType: - return tag.name_type - elif field == Field_NounType: - return tag.noun_type - elif field == Field_NumForm: - return tag.num_form - elif field == Field_NumType: - return tag.num_type - elif field == Field_NumValue: - return tag.num_value - elif field == Field_PartForm: - return tag.part_form - elif field == Field_PartType: - return tag.part_type - elif field == Field_Person: - return tag.person - elif field == Field_Polite: - return tag.polite - elif field == Field_Polarity: - return tag.polarity - elif field == Field_Poss: - return tag.poss - elif field == Field_Prefix: - return tag.prefix - elif field == Field_PrepCase: - return tag.prep_case - elif field == Field_PronType: - return tag.pron_type - elif field == Field_PunctSide: - return tag.punct_side - elif field == Field_PunctType: - return tag.punct_type - elif field == Field_Reflex: - return tag.reflex - elif field == Field_Style: - return tag.style - elif field == Field_StyleVariant: - return tag.style_variant - elif field == Field_Tense: - return tag.tense - elif field == Field_Typo: - return tag.typo - elif field == Field_VerbForm: - return tag.verb_form - elif field == Field_Voice: - return tag.voice - elif field == Field_VerbType: - return tag.verb_type - else: - raise ValueError(Errors.E168.format(field=field_id)) - - -cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil: - if tag.abbr == feature: - return 1 - elif tag.adp_type == feature: - return 1 - elif tag.adv_type == feature: - return 1 - elif tag.animacy == feature: - return 1 - elif tag.aspect == feature: - return 1 - elif tag.case == feature: - return 1 - elif tag.conj_type == feature: - return 1 - elif tag.connegative == feature: - return 1 - elif tag.definite == feature: - return 1 - elif tag.degree == feature: - return 1 - elif tag.derivation == feature: - return 1 - elif tag.echo == feature: - return 1 - elif tag.foreign == feature: - return 1 - elif tag.gender == 
feature: - return 1 - elif tag.hyph == feature: - return 1 - elif tag.inf_form == feature: - return 1 - elif tag.mood == feature: - return 1 - elif tag.negative == feature: - return 1 - elif tag.number == feature: - return 1 - elif tag.name_type == feature: - return 1 - elif tag.noun_type == feature: - return 1 - elif tag.num_form == feature: - return 1 - elif tag.num_type == feature: - return 1 - elif tag.num_value == feature: - return 1 - elif tag.part_form == feature: - return 1 - elif tag.part_type == feature: - return 1 - elif tag.person == feature: - return 1 - elif tag.polite == feature: - return 1 - elif tag.polarity == feature: - return 1 - elif tag.poss == feature: - return 1 - elif tag.prefix == feature: - return 1 - elif tag.prep_case == feature: - return 1 - elif tag.pron_type == feature: - return 1 - elif tag.punct_side == feature: - return 1 - elif tag.punct_type == feature: - return 1 - elif tag.reflex == feature: - return 1 - elif tag.style == feature: - return 1 - elif tag.style_variant == feature: - return 1 - elif tag.tense == feature: - return 1 - elif tag.typo == feature: - return 1 - elif tag.verb_form == feature: - return 1 - elif tag.voice == feature: - return 1 - elif tag.verb_type == feature: - return 1 - else: - return 0 - -cdef int set_feature(MorphAnalysisC* tag, - univ_field_t field, attr_t feature, int value) except -1: - if value == True: - value_ = feature - else: - value_ = 0 - prev_value = get_field(tag, field) - if prev_value != 0 and value_ == 0 and field != Field_POS: - tag.length -= 1 - elif prev_value == 0 and value_ != 0 and field != Field_POS: - tag.length += 1 - if feature == 0: - pass - elif field == Field_POS: - tag.pos = get_string_id(FEATURE_NAMES[value_].split('_')[1]) - elif field == Field_Abbr: - tag.abbr = value_ - elif field == Field_AdpType: - tag.adp_type = value_ - elif field == Field_AdvType: - tag.adv_type = value_ - elif field == Field_Animacy: - tag.animacy = value_ - elif field == Field_Aspect: - 
tag.aspect = value_ - elif field == Field_Case: - tag.case = value_ - elif field == Field_ConjType: - tag.conj_type = value_ - elif field == Field_Connegative: - tag.connegative = value_ - elif field == Field_Definite: - tag.definite = value_ - elif field == Field_Degree: - tag.degree = value_ - elif field == Field_Derivation: - tag.derivation = value_ - elif field == Field_Echo: - tag.echo = value_ - elif field == Field_Foreign: - tag.foreign = value_ - elif field == Field_Gender: - tag.gender = value_ - elif field == Field_Hyph: - tag.hyph = value_ - elif field == Field_InfForm: - tag.inf_form = value_ - elif field == Field_Mood: - tag.mood = value_ - elif field == Field_Negative: - tag.negative = value_ - elif field == Field_Number: - tag.number = value_ - elif field == Field_NameType: - tag.name_type = value_ - elif field == Field_NounType: - tag.noun_type = value_ - elif field == Field_NumForm: - tag.num_form = value_ - elif field == Field_NumType: - tag.num_type = value_ - elif field == Field_NumValue: - tag.num_value = value_ - elif field == Field_PartForm: - tag.part_form = value_ - elif field == Field_PartType: - tag.part_type = value_ - elif field == Field_Person: - tag.person = value_ - elif field == Field_Polite: - tag.polite = value_ - elif field == Field_Polarity: - tag.polarity = value_ - elif field == Field_Poss: - tag.poss = value_ - elif field == Field_Prefix: - tag.prefix = value_ - elif field == Field_PrepCase: - tag.prep_case = value_ - elif field == Field_PronType: - tag.pron_type = value_ - elif field == Field_PunctSide: - tag.punct_side = value_ - elif field == Field_PunctType: - tag.punct_type = value_ - elif field == Field_Reflex: - tag.reflex = value_ - elif field == Field_Style: - tag.style = value_ - elif field == Field_StyleVariant: - tag.style_variant = value_ - elif field == Field_Tense: - tag.tense = value_ - elif field == Field_Typo: - tag.typo = value_ - elif field == Field_VerbForm: - tag.verb_form = value_ - elif field == 
Field_Voice: - tag.voice = value_ - elif field == Field_VerbType: - tag.verb_type = value_ - else: - raise ValueError(Errors.E167.format(field=FEATURE_NAMES.get(feature), field_id=feature)) - - -FIELDS = { - 'POS': Field_POS, - 'Abbr': Field_Abbr, - 'AdpType': Field_AdpType, - 'AdvType': Field_AdvType, - 'Animacy': Field_Animacy, - 'Aspect': Field_Aspect, - 'Case': Field_Case, - 'ConjType': Field_ConjType, - 'Connegative': Field_Connegative, - 'Definite': Field_Definite, - 'Degree': Field_Degree, - 'Derivation': Field_Derivation, - 'Echo': Field_Echo, - 'Foreign': Field_Foreign, - 'Gender': Field_Gender, - 'Hyph': Field_Hyph, - 'InfForm': Field_InfForm, - 'Mood': Field_Mood, - 'NameType': Field_NameType, - 'Negative': Field_Negative, - 'NounType': Field_NounType, - 'Number': Field_Number, - 'NumForm': Field_NumForm, - 'NumType': Field_NumType, - 'NumValue': Field_NumValue, - 'PartForm': Field_PartForm, - 'PartType': Field_PartType, - 'Person': Field_Person, - 'Polite': Field_Polite, - 'Polarity': Field_Polarity, - 'Poss': Field_Poss, - 'Prefix': Field_Prefix, - 'PrepCase': Field_PrepCase, - 'PronType': Field_PronType, - 'PunctSide': Field_PunctSide, - 'PunctType': Field_PunctType, - 'Reflex': Field_Reflex, - 'Style': Field_Style, - 'StyleVariant': Field_StyleVariant, - 'Tense': Field_Tense, - 'Typo': Field_Typo, - 'VerbForm': Field_VerbForm, - 'VerbType': Field_VerbType, - 'Voice': Field_Voice, -} - -LOWER_FIELDS = { - 'pos': Field_POS, - 'abbr': Field_Abbr, - 'adp_type': Field_AdpType, - 'adv_type': Field_AdvType, - 'animacy': Field_Animacy, - 'aspect': Field_Aspect, - 'case': Field_Case, - 'conj_type': Field_ConjType, - 'connegative': Field_Connegative, - 'definite': Field_Definite, - 'degree': Field_Degree, - 'derivation': Field_Derivation, - 'echo': Field_Echo, - 'foreign': Field_Foreign, - 'gender': Field_Gender, - 'hyph': Field_Hyph, - 'inf_form': Field_InfForm, - 'mood': Field_Mood, - 'name_type': Field_NameType, - 'negative': Field_Negative, - 'noun_type': 
Field_NounType, - 'number': Field_Number, - 'num_form': Field_NumForm, - 'num_type': Field_NumType, - 'num_value': Field_NumValue, - 'part_form': Field_PartForm, - 'part_type': Field_PartType, - 'person': Field_Person, - 'polarity': Field_Polarity, - 'polite': Field_Polite, - 'poss': Field_Poss, - 'prefix': Field_Prefix, - 'prep_case': Field_PrepCase, - 'pron_type': Field_PronType, - 'punct_side': Field_PunctSide, - 'punct_type': Field_PunctType, - 'reflex': Field_Reflex, - 'style': Field_Style, - 'style_variant': Field_StyleVariant, - 'tense': Field_Tense, - 'typo': Field_Typo, - 'verb_form': Field_VerbForm, - 'verb_type': Field_VerbType, - 'voice': Field_Voice, -} - - -FEATURES = [ - "POS_ADJ", - "POS_ADP", - "POS_ADV", - "POS_AUX", - "POS_CONJ", - "POS_CCONJ", - "POS_DET", - "POS_INTJ", - "POS_NOUN", - "POS_NUM", - "POS_PART", - "POS_PRON", - "POS_PROPN", - "POS_PUNCT", - "POS_SCONJ", - "POS_SYM", - "POS_VERB", - "POS_X", - "POS_EOL", - "POS_SPACE", - "Abbr_yes", - "AdpType_circ", - "AdpType_comprep", - "AdpType_prep", - "AdpType_post", - "AdpType_voc", - "AdvType_adadj", - "AdvType_cau", - "AdvType_deg", - "AdvType_ex", - "AdvType_loc", - "AdvType_man", - "AdvType_mod", - "AdvType_sta", - "AdvType_tim", - "Animacy_anim", - "Animacy_hum", - "Animacy_inan", - "Animacy_nhum", - "Aspect_hab", - "Aspect_imp", - "Aspect_iter", - "Aspect_perf", - "Aspect_prog", - "Aspect_prosp", - "Aspect_none", - "Case_abe", - "Case_abl", - "Case_abs", - "Case_acc", - "Case_ade", - "Case_all", - "Case_cau", - "Case_com", - "Case_dat", - "Case_del", - "Case_dis", - "Case_ela", - "Case_ess", - "Case_gen", - "Case_ill", - "Case_ine", - "Case_ins", - "Case_loc", - "Case_lat", - "Case_nom", - "Case_par", - "Case_sub", - "Case_sup", - "Case_tem", - "Case_ter", - "Case_tra", - "Case_voc", - "ConjType_comp", - "ConjType_oper", - "Connegative_yes", - "Definite_cons", - "Definite_def", - "Definite_ind", - "Definite_red", - "Definite_two", - "Degree_abs", - "Degree_cmp", - "Degree_comp", - 
"Degree_none", - "Degree_pos", - "Degree_sup", - "Degree_com", - "Degree_dim", - "Derivation_minen", - "Derivation_sti", - "Derivation_inen", - "Derivation_lainen", - "Derivation_ja", - "Derivation_ton", - "Derivation_vs", - "Derivation_ttain", - "Derivation_ttaa", - "Echo_rdp", - "Echo_ech", - "Foreign_foreign", - "Foreign_fscript", - "Foreign_tscript", - "Foreign_yes", - "Gender_com", - "Gender_fem", - "Gender_masc", - "Gender_neut", - "Gender_dat_masc", - "Gender_dat_fem", - "Gender_erg_masc", - "Gender_erg_fem", - "Gender_psor_masc", - "Gender_psor_fem", - "Gender_psor_neut", - "Hyph_yes", - "InfForm_one", - "InfForm_two", - "InfForm_three", - "Mood_cnd", - "Mood_imp", - "Mood_ind", - "Mood_n", - "Mood_pot", - "Mood_sub", - "Mood_opt", - "NameType_geo", - "NameType_prs", - "NameType_giv", - "NameType_sur", - "NameType_nat", - "NameType_com", - "NameType_pro", - "NameType_oth", - "Negative_neg", - "Negative_pos", - "Negative_yes", - "NounType_com", - "NounType_prop", - "NounType_class", - "Number_com", - "Number_dual", - "Number_none", - "Number_plur", - "Number_sing", - "Number_ptan", - "Number_count", - "Number_abs_sing", - "Number_abs_plur", - "Number_dat_sing", - "Number_dat_plur", - "Number_erg_sing", - "Number_erg_plur", - "Number_psee_sing", - "Number_psee_plur", - "Number_psor_sing", - "Number_psor_plur", - "NumForm_digit", - "NumForm_roman", - "NumForm_word", - "NumForm_combi", - "NumType_card", - "NumType_dist", - "NumType_frac", - "NumType_gen", - "NumType_mult", - "NumType_none", - "NumType_ord", - "NumType_sets", - "NumType_dual", - "NumValue_one", - "NumValue_two", - "NumValue_three", - "PartForm_pres", - "PartForm_past", - "PartForm_agt", - "PartForm_neg", - "PartType_mod", - "PartType_emp", - "PartType_res", - "PartType_inf", - "PartType_vbp", - "Person_one", - "Person_two", - "Person_three", - "Person_none", - "Person_abs_one", - "Person_abs_two", - "Person_abs_three", - "Person_dat_one", - "Person_dat_two", - "Person_dat_three", - 
"Person_erg_one", - "Person_erg_two", - "Person_erg_three", - "Person_psor_one", - "Person_psor_two", - "Person_psor_three", - "Polarity_neg", - "Polarity_pos", - "Polite_inf", - "Polite_pol", - "Polite_abs_inf", - "Polite_abs_pol", - "Polite_erg_inf", - "Polite_erg_pol", - "Polite_dat_inf", - "Polite_dat_pol", - "Poss_yes", - "Prefix_yes", - "PrepCase_npr", - "PrepCase_pre", - "PronType_advPart", - "PronType_art", - "PronType_default", - "PronType_dem", - "PronType_ind", - "PronType_int", - "PronType_neg", - "PronType_prs", - "PronType_rcp", - "PronType_rel", - "PronType_tot", - "PronType_clit", - "PronType_exc", - "PunctSide_ini", - "PunctSide_fin", - "PunctType_peri", - "PunctType_qest", - "PunctType_excl", - "PunctType_quot", - "PunctType_brck", - "PunctType_comm", - "PunctType_colo", - "PunctType_semi", - "PunctType_dash", - "Reflex_yes", - "Style_arch", - "Style_rare", - "Style_poet", - "Style_norm", - "Style_coll", - "Style_vrnc", - "Style_sing", - "Style_expr", - "Style_derg", - "Style_vulg", - "Style_yes", - "StyleVariant_styleShort", - "StyleVariant_styleBound", - "Tense_fut", - "Tense_imp", - "Tense_past", - "Tense_pres", - "Typo_yes", - "VerbForm_fin", - "VerbForm_ger", - "VerbForm_inf", - "VerbForm_none", - "VerbForm_part", - "VerbForm_partFut", - "VerbForm_partPast", - "VerbForm_partPres", - "VerbForm_sup", - "VerbForm_trans", - "VerbForm_conv", - "VerbForm_gdv", - "VerbType_aux", - "VerbType_cop", - "VerbType_mod", - "VerbType_light", - "Voice_act", - "Voice_cau", - "Voice_pass", - "Voice_mid", - "Voice_int", -] - -FEATURE_NAMES = {get_string_id(f): f for f in FEATURES} -FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES} +cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil: + cdef int n_results = 0 + cdef int i + for i in range(morph.length): + if morph.fields[i] == field: + results[n_results] = morph.features[i] + n_results += 1 + return n_results diff --git a/spacy/structs.pxd 
b/spacy/structs.pxd index b3878db3f..259fd657d 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -82,52 +82,11 @@ cdef struct TokenC: cdef struct MorphAnalysisC: - univ_pos_t pos + hash_t key int length - - attr_t abbr - attr_t adp_type - attr_t adv_type - attr_t animacy - attr_t aspect - attr_t case - attr_t conj_type - attr_t connegative - attr_t definite - attr_t degree - attr_t derivation - attr_t echo - attr_t foreign - attr_t gender - attr_t hyph - attr_t inf_form - attr_t mood - attr_t negative - attr_t number - attr_t name_type - attr_t noun_type - attr_t num_form - attr_t num_type - attr_t num_value - attr_t part_form - attr_t part_type - attr_t person - attr_t polite - attr_t polarity - attr_t poss - attr_t prefix - attr_t prep_case - attr_t pron_type - attr_t punct_side - attr_t punct_type - attr_t reflex - attr_t style - attr_t style_variant - attr_t tense - attr_t typo - attr_t verb_form - attr_t voice - attr_t verb_type + attr_t* fields + attr_t* features + # Internal struct, for storage and disambiguation of entities. 
cdef struct KBEntryC: diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 5922ee588..b95b4b805 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -108,282 +108,282 @@ cdef enum symbol_t: EOL SPACE - Animacy_anim - Animacy_inan - Animacy_hum # U20 - Animacy_nhum - Aspect_freq - Aspect_imp - Aspect_mod - Aspect_none - Aspect_perf - Aspect_iter # U20 - Aspect_hab # U20 - Case_abe - Case_abl - Case_abs - Case_acc - Case_ade - Case_all - Case_cau - Case_com - Case_cmp # U20 - Case_dat - Case_del - Case_dis - Case_ela - Case_equ # U20 - Case_ess - Case_gen - Case_ill - Case_ine - Case_ins - Case_loc - Case_lat - Case_nom - Case_par - Case_sub - Case_sup - Case_tem - Case_ter - Case_tra - Case_voc - Definite_two - Definite_def - Definite_red - Definite_cons # U20 - Definite_ind - Definite_spec # U20 - Degree_cmp - Degree_comp - Degree_none - Degree_pos - Degree_sup - Degree_abs - Degree_com - Degree_dim # du - Degree_equ # U20 - Evident_nfh # U20 - Gender_com - Gender_fem - Gender_masc - Gender_neut - Mood_cnd - Mood_imp - Mood_ind - Mood_n - Mood_pot - Mood_sub - Mood_opt - Mood_prp # U20 - Mood_adm # U20 - Negative_neg - Negative_pos - Negative_yes - Polarity_neg # U20 - Polarity_pos # U20 - Number_com - Number_dual - Number_none - Number_plur - Number_sing - Number_ptan # bg - Number_count # bg, U20 - Number_tri # U20 - NumType_card - NumType_dist - NumType_frac - NumType_gen - NumType_mult - NumType_none - NumType_ord - NumType_sets - Person_one - Person_two - Person_three - Person_none - Poss_yes - PronType_advPart - PronType_art - PronType_default - PronType_dem - PronType_ind - PronType_int - PronType_neg - PronType_prs - PronType_rcp - PronType_rel - PronType_tot - PronType_clit - PronType_exc # es, ca, it, fa, U20 - PronType_emp # U20 - Reflex_yes - Tense_fut - Tense_imp - Tense_past - Tense_pres - VerbForm_fin - VerbForm_ger - VerbForm_inf - VerbForm_none - VerbForm_part - VerbForm_partFut - VerbForm_partPast - VerbForm_partPres - VerbForm_sup - 
VerbForm_trans - VerbForm_conv # U20 - VerbForm_gdv # la - VerbForm_vnoun # U20 - Voice_act - Voice_cau - Voice_pass - Voice_mid # gkc, U20 - Voice_int # hb - Voice_antip # U20 - Voice_dir # U20 - Voice_inv # U20 - Abbr_yes # cz, fi, sl, U - AdpType_prep # cz, U - AdpType_post # U - AdpType_voc # cz - AdpType_comprep # cz - AdpType_circ # U - AdvType_man - AdvType_loc - AdvType_tim - AdvType_deg - AdvType_cau - AdvType_mod - AdvType_sta - AdvType_ex - AdvType_adadj - ConjType_oper # cz, U - ConjType_comp # cz, U - Connegative_yes # fi - Derivation_minen # fi - Derivation_sti # fi - Derivation_inen # fi - Derivation_lainen # fi - Derivation_ja # fi - Derivation_ton # fi - Derivation_vs # fi - Derivation_ttain # fi - Derivation_ttaa # fi - Echo_rdp # U - Echo_ech # U - Foreign_foreign # cz, fi, U - Foreign_fscript # cz, fi, U - Foreign_tscript # cz, U - Foreign_yes # sl - Gender_dat_masc # bq, U - Gender_dat_fem # bq, U - Gender_erg_masc # bq - Gender_erg_fem # bq - Gender_psor_masc # cz, sl, U - Gender_psor_fem # cz, sl, U - Gender_psor_neut # sl - Hyph_yes # cz, U - InfForm_one # fi - InfForm_two # fi - InfForm_three # fi - NameType_geo # U, cz - NameType_prs # U, cz - NameType_giv # U, cz - NameType_sur # U, cz - NameType_nat # U, cz - NameType_com # U, cz - NameType_pro # U, cz - NameType_oth # U, cz - NounType_com # U - NounType_prop # U - NounType_class # U - Number_abs_sing # bq, U - Number_abs_plur # bq, U - Number_dat_sing # bq, U - Number_dat_plur # bq, U - Number_erg_sing # bq, U - Number_erg_plur # bq, U - Number_psee_sing # U - Number_psee_plur # U - Number_psor_sing # cz, fi, sl, U - Number_psor_plur # cz, fi, sl, U - Number_pauc # U20 - Number_grpa # U20 - Number_grpl # U20 - Number_inv # U20 - NumForm_digit # cz, sl, U - NumForm_roman # cz, sl, U - NumForm_word # cz, sl, U - NumValue_one # cz, U - NumValue_two # cz, U - NumValue_three # cz, U - PartForm_pres # fi - PartForm_past # fi - PartForm_agt # fi - PartForm_neg # fi - PartType_mod # U - 
PartType_emp # U - PartType_res # U - PartType_inf # U - PartType_vbp # U - Person_abs_one # bq, U - Person_abs_two # bq, U - Person_abs_three # bq, U - Person_dat_one # bq, U - Person_dat_two # bq, U - Person_dat_three # bq, U - Person_erg_one # bq, U - Person_erg_two # bq, U - Person_erg_three # bq, U - Person_psor_one # fi, U - Person_psor_two # fi, U - Person_psor_three # fi, U - Person_zero # U20 - Person_four # U20 - Polite_inf # bq, U - Polite_pol # bq, U - Polite_abs_inf # bq, U - Polite_abs_pol # bq, U - Polite_erg_inf # bq, U - Polite_erg_pol # bq, U - Polite_dat_inf # bq, U - Polite_dat_pol # bq, U - Polite_infm # U20 - Polite_form # U20 - Polite_form_elev # U20 - Polite_form_humb # U20 - Prefix_yes # U - PrepCase_npr # cz - PrepCase_pre # U - PunctSide_ini # U - PunctSide_fin # U - PunctType_peri # U - PunctType_qest # U - PunctType_excl # U - PunctType_quot # U - PunctType_brck # U - PunctType_comm # U - PunctType_colo # U - PunctType_semi # U - PunctType_dash # U - Style_arch # cz, fi, U - Style_rare # cz, fi, U - Style_poet # cz, U - Style_norm # cz, U - Style_coll # cz, U - Style_vrnc # cz, U - Style_sing # cz, U - Style_expr # cz, U - Style_derg # cz, U - Style_vulg # cz, U - Style_yes # fi, U - StyleVariant_styleShort # cz - StyleVariant_styleBound # cz, sl - VerbType_aux # U - VerbType_cop # U - VerbType_mod # U - VerbType_light # U + DEPRECATED001 + DEPRECATED002 + DEPRECATED003 + DEPRECATED004 + DEPRECATED005 + DEPRECATED006 + DEPRECATED007 + DEPRECATED008 + DEPRECATED009 + DEPRECATED010 + DEPRECATED011 + DEPRECATED012 + DEPRECATED013 + DEPRECATED014 + DEPRECATED015 + DEPRECATED016 + DEPRECATED017 + DEPRECATED018 + DEPRECATED019 + DEPRECATED020 + DEPRECATED021 + DEPRECATED022 + DEPRECATED023 + DEPRECATED024 + DEPRECATED025 + DEPRECATED026 + DEPRECATED027 + DEPRECATED028 + DEPRECATED029 + DEPRECATED030 + DEPRECATED031 + DEPRECATED032 + DEPRECATED033 + DEPRECATED034 + DEPRECATED035 + DEPRECATED036 + DEPRECATED037 + DEPRECATED038 + DEPRECATED039 + 
DEPRECATED040 + DEPRECATED041 + DEPRECATED042 + DEPRECATED043 + DEPRECATED044 + DEPRECATED045 + DEPRECATED046 + DEPRECATED047 + DEPRECATED048 + DEPRECATED049 + DEPRECATED050 + DEPRECATED051 + DEPRECATED052 + DEPRECATED053 + DEPRECATED054 + DEPRECATED055 + DEPRECATED056 + DEPRECATED057 + DEPRECATED058 + DEPRECATED059 + DEPRECATED060 + DEPRECATED061 + DEPRECATED062 + DEPRECATED063 + DEPRECATED064 + DEPRECATED065 + DEPRECATED066 + DEPRECATED067 + DEPRECATED068 + DEPRECATED069 + DEPRECATED070 + DEPRECATED071 + DEPRECATED072 + DEPRECATED073 + DEPRECATED074 + DEPRECATED075 + DEPRECATED076 + DEPRECATED077 + DEPRECATED078 + DEPRECATED079 + DEPRECATED080 + DEPRECATED081 + DEPRECATED082 + DEPRECATED083 + DEPRECATED084 + DEPRECATED085 + DEPRECATED086 + DEPRECATED087 + DEPRECATED088 + DEPRECATED089 + DEPRECATED090 + DEPRECATED091 + DEPRECATED092 + DEPRECATED093 + DEPRECATED094 + DEPRECATED095 + DEPRECATED096 + DEPRECATED097 + DEPRECATED098 + DEPRECATED099 + DEPRECATED100 + DEPRECATED101 + DEPRECATED102 + DEPRECATED103 + DEPRECATED104 + DEPRECATED105 + DEPRECATED106 + DEPRECATED107 + DEPRECATED108 + DEPRECATED109 + DEPRECATED110 + DEPRECATED111 + DEPRECATED112 + DEPRECATED113 + DEPRECATED114 + DEPRECATED115 + DEPRECATED116 + DEPRECATED117 + DEPRECATED118 + DEPRECATED119 + DEPRECATED120 + DEPRECATED121 + DEPRECATED122 + DEPRECATED123 + DEPRECATED124 + DEPRECATED125 + DEPRECATED126 + DEPRECATED127 + DEPRECATED128 + DEPRECATED129 + DEPRECATED130 + DEPRECATED131 + DEPRECATED132 + DEPRECATED133 + DEPRECATED134 + DEPRECATED135 + DEPRECATED136 + DEPRECATED137 + DEPRECATED138 + DEPRECATED139 + DEPRECATED140 + DEPRECATED141 + DEPRECATED142 + DEPRECATED143 + DEPRECATED144 + DEPRECATED145 + DEPRECATED146 + DEPRECATED147 + DEPRECATED148 + DEPRECATED149 + DEPRECATED150 + DEPRECATED151 + DEPRECATED152 + DEPRECATED153 + DEPRECATED154 + DEPRECATED155 + DEPRECATED156 + DEPRECATED157 + DEPRECATED158 + DEPRECATED159 + DEPRECATED160 + DEPRECATED161 + DEPRECATED162 + DEPRECATED163 + DEPRECATED164 + 
DEPRECATED165 + DEPRECATED166 + DEPRECATED167 + DEPRECATED168 + DEPRECATED169 + DEPRECATED170 + DEPRECATED171 + DEPRECATED172 + DEPRECATED173 + DEPRECATED174 + DEPRECATED175 + DEPRECATED176 + DEPRECATED177 + DEPRECATED178 + DEPRECATED179 + DEPRECATED180 + DEPRECATED181 + DEPRECATED182 + DEPRECATED183 + DEPRECATED184 + DEPRECATED185 + DEPRECATED186 + DEPRECATED187 + DEPRECATED188 + DEPRECATED189 + DEPRECATED190 + DEPRECATED191 + DEPRECATED192 + DEPRECATED193 + DEPRECATED194 + DEPRECATED195 + DEPRECATED196 + DEPRECATED197 + DEPRECATED198 + DEPRECATED199 + DEPRECATED200 + DEPRECATED201 + DEPRECATED202 + DEPRECATED203 + DEPRECATED204 + DEPRECATED205 + DEPRECATED206 + DEPRECATED207 + DEPRECATED208 + DEPRECATED209 + DEPRECATED210 + DEPRECATED211 + DEPRECATED212 + DEPRECATED213 + DEPRECATED214 + DEPRECATED215 + DEPRECATED216 + DEPRECATED217 + DEPRECATED218 + DEPRECATED219 + DEPRECATED220 + DEPRECATED221 + DEPRECATED222 + DEPRECATED223 + DEPRECATED224 + DEPRECATED225 + DEPRECATED226 + DEPRECATED227 + DEPRECATED228 + DEPRECATED229 + DEPRECATED230 + DEPRECATED231 + DEPRECATED232 + DEPRECATED233 + DEPRECATED234 + DEPRECATED235 + DEPRECATED236 + DEPRECATED237 + DEPRECATED238 + DEPRECATED239 + DEPRECATED240 + DEPRECATED241 + DEPRECATED242 + DEPRECATED243 + DEPRECATED244 + DEPRECATED245 + DEPRECATED246 + DEPRECATED247 + DEPRECATED248 + DEPRECATED249 + DEPRECATED250 + DEPRECATED251 + DEPRECATED252 + DEPRECATED253 + DEPRECATED254 + DEPRECATED255 + DEPRECATED256 + DEPRECATED257 + DEPRECATED258 + DEPRECATED259 + DEPRECATED260 + DEPRECATED261 + DEPRECATED262 + DEPRECATED263 + DEPRECATED264 + DEPRECATED265 + DEPRECATED266 + DEPRECATED267 + DEPRECATED268 + DEPRECATED269 + DEPRECATED270 + DEPRECATED271 + DEPRECATED272 + DEPRECATED273 + DEPRECATED274 + DEPRECATED275 + DEPRECATED276 PERSON NORP diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 85f23ccbc..36b9ffa67 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -110,282 +110,282 @@ IDS = { "EOL": EOL, "SPACE": SPACE, - 
"Animacy_anim": Animacy_anim, - "Animacy_inam": Animacy_inan, - "Animacy_hum": Animacy_hum, # U20 - "Animacy_nhum": Animacy_nhum, - "Aspect_freq": Aspect_freq, - "Aspect_imp": Aspect_imp, - "Aspect_mod": Aspect_mod, - "Aspect_none": Aspect_none, - "Aspect_perf": Aspect_perf, - "Aspect_iter": Aspect_iter, # U20 - "Aspect_hab": Aspect_hab, # U20 - "Case_abe": Case_abe, - "Case_abl": Case_abl, - "Case_abs": Case_abs, - "Case_acc": Case_acc, - "Case_ade": Case_ade, - "Case_all": Case_all, - "Case_cau": Case_cau, - "Case_com": Case_com, - "Case_cmp": Case_cmp, # U20 - "Case_dat": Case_dat, - "Case_del": Case_del, - "Case_dis": Case_dis, - "Case_ela": Case_ela, - "Case_equ": Case_equ, # U20 - "Case_ess": Case_ess, - "Case_gen": Case_gen, - "Case_ill": Case_ill, - "Case_ine": Case_ine, - "Case_ins": Case_ins, - "Case_loc": Case_loc, - "Case_lat": Case_lat, - "Case_nom": Case_nom, - "Case_par": Case_par, - "Case_sub": Case_sub, - "Case_sup": Case_sup, - "Case_tem": Case_tem, - "Case_ter": Case_ter, - "Case_tra": Case_tra, - "Case_voc": Case_voc, - "Definite_two": Definite_two, - "Definite_def": Definite_def, - "Definite_red": Definite_red, - "Definite_cons": Definite_cons, # U20 - "Definite_ind": Definite_ind, - "Definite_spec": Definite_spec, # U20 - "Degree_cmp": Degree_cmp, - "Degree_comp": Degree_comp, - "Degree_none": Degree_none, - "Degree_pos": Degree_pos, - "Degree_sup": Degree_sup, - "Degree_abs": Degree_abs, - "Degree_com": Degree_com, - "Degree_dim": Degree_dim, # du - "Degree_equ": Degree_equ, # U20 - "Evident_nfh": Evident_nfh, # U20 - "Gender_com": Gender_com, - "Gender_fem": Gender_fem, - "Gender_masc": Gender_masc, - "Gender_neut": Gender_neut, - "Mood_cnd": Mood_cnd, - "Mood_imp": Mood_imp, - "Mood_ind": Mood_ind, - "Mood_n": Mood_n, - "Mood_pot": Mood_pot, - "Mood_sub": Mood_sub, - "Mood_opt": Mood_opt, - "Mood_prp": Mood_prp, # U20 - "Mood_adm": Mood_adm, # U20 - "Negative_neg": Negative_neg, - "Negative_pos": Negative_pos, - "Negative_yes": 
Negative_yes, - "Polarity_neg": Polarity_neg, # U20 - "Polarity_pos": Polarity_pos, # U20 - "Number_com": Number_com, - "Number_dual": Number_dual, - "Number_none": Number_none, - "Number_plur": Number_plur, - "Number_sing": Number_sing, - "Number_ptan": Number_ptan, # bg - "Number_count": Number_count, # bg, U20 - "Number_tri": Number_tri, # U20 - "NumType_card": NumType_card, - "NumType_dist": NumType_dist, - "NumType_frac": NumType_frac, - "NumType_gen": NumType_gen, - "NumType_mult": NumType_mult, - "NumType_none": NumType_none, - "NumType_ord": NumType_ord, - "NumType_sets": NumType_sets, - "Person_one": Person_one, - "Person_two": Person_two, - "Person_three": Person_three, - "Person_none": Person_none, - "Poss_yes": Poss_yes, - "PronType_advPart": PronType_advPart, - "PronType_art": PronType_art, - "PronType_default": PronType_default, - "PronType_dem": PronType_dem, - "PronType_ind": PronType_ind, - "PronType_int": PronType_int, - "PronType_neg": PronType_neg, - "PronType_prs": PronType_prs, - "PronType_rcp": PronType_rcp, - "PronType_rel": PronType_rel, - "PronType_tot": PronType_tot, - "PronType_clit": PronType_clit, - "PronType_exc": PronType_exc, # es, ca, it, fa, U20 - "PronType_emp": PronType_emp, # U20 - "Reflex_yes": Reflex_yes, - "Tense_fut": Tense_fut, - "Tense_imp": Tense_imp, - "Tense_past": Tense_past, - "Tense_pres": Tense_pres, - "VerbForm_fin": VerbForm_fin, - "VerbForm_ger": VerbForm_ger, - "VerbForm_inf": VerbForm_inf, - "VerbForm_none": VerbForm_none, - "VerbForm_part": VerbForm_part, - "VerbForm_partFut": VerbForm_partFut, - "VerbForm_partPast": VerbForm_partPast, - "VerbForm_partPres": VerbForm_partPres, - "VerbForm_sup": VerbForm_sup, - "VerbForm_trans": VerbForm_trans, - "VerbForm_conv": VerbForm_conv, # U20 - "VerbForm_gdv": VerbForm_gdv, # la, - "VerbForm_vnoun": VerbForm_vnoun, # U20 - "Voice_act": Voice_act, - "Voice_cau": Voice_cau, - "Voice_pass": Voice_pass, - "Voice_mid": Voice_mid, # gkc, U20 - "Voice_int": Voice_int, # hb, - 
"Voice_antip": Voice_antip, # U20 - "Voice_dir": Voice_dir, # U20 - "Voice_inv": Voice_inv, # U20 - "Abbr_yes": Abbr_yes, # cz, fi, sl, U, - "AdpType_prep": AdpType_prep, # cz, U, - "AdpType_post": AdpType_post, # U, - "AdpType_voc": AdpType_voc, # cz, - "AdpType_comprep": AdpType_comprep, # cz, - "AdpType_circ": AdpType_circ, # U, - "AdvType_man": AdvType_man, - "AdvType_loc": AdvType_loc, - "AdvType_tim": AdvType_tim, - "AdvType_deg": AdvType_deg, - "AdvType_cau": AdvType_cau, - "AdvType_mod": AdvType_mod, - "AdvType_sta": AdvType_sta, - "AdvType_ex": AdvType_ex, - "AdvType_adadj": AdvType_adadj, - "ConjType_oper": ConjType_oper, # cz, U, - "ConjType_comp": ConjType_comp, # cz, U, - "Connegative_yes": Connegative_yes, # fi, - "Derivation_minen": Derivation_minen, # fi, - "Derivation_sti": Derivation_sti, # fi, - "Derivation_inen": Derivation_inen, # fi, - "Derivation_lainen": Derivation_lainen, # fi, - "Derivation_ja": Derivation_ja, # fi, - "Derivation_ton": Derivation_ton, # fi, - "Derivation_vs": Derivation_vs, # fi, - "Derivation_ttain": Derivation_ttain, # fi, - "Derivation_ttaa": Derivation_ttaa, # fi, - "Echo_rdp": Echo_rdp, # U, - "Echo_ech": Echo_ech, # U, - "Foreign_foreign": Foreign_foreign, # cz, fi, U, - "Foreign_fscript": Foreign_fscript, # cz, fi, U, - "Foreign_tscript": Foreign_tscript, # cz, U, - "Foreign_yes": Foreign_yes, # sl, - "Gender_dat_masc": Gender_dat_masc, # bq, U, - "Gender_dat_fem": Gender_dat_fem, # bq, U, - "Gender_erg_masc": Gender_erg_masc, # bq, - "Gender_erg_fem": Gender_erg_fem, # bq, - "Gender_psor_masc": Gender_psor_masc, # cz, sl, U, - "Gender_psor_fem": Gender_psor_fem, # cz, sl, U, - "Gender_psor_neut": Gender_psor_neut, # sl, - "Hyph_yes": Hyph_yes, # cz, U, - "InfForm_one": InfForm_one, # fi, - "InfForm_two": InfForm_two, # fi, - "InfForm_three": InfForm_three, # fi, - "NameType_geo": NameType_geo, # U, cz, - "NameType_prs": NameType_prs, # U, cz, - "NameType_giv": NameType_giv, # U, cz, - "NameType_sur": NameType_sur, 
# U, cz, - "NameType_nat": NameType_nat, # U, cz, - "NameType_com": NameType_com, # U, cz, - "NameType_pro": NameType_pro, # U, cz, - "NameType_oth": NameType_oth, # U, cz, - "NounType_com": NounType_com, # U, - "NounType_prop": NounType_prop, # U, - "NounType_class": NounType_class, # U, - "Number_abs_sing": Number_abs_sing, # bq, U, - "Number_abs_plur": Number_abs_plur, # bq, U, - "Number_dat_sing": Number_dat_sing, # bq, U, - "Number_dat_plur": Number_dat_plur, # bq, U, - "Number_erg_sing": Number_erg_sing, # bq, U, - "Number_erg_plur": Number_erg_plur, # bq, U, - "Number_psee_sing": Number_psee_sing, # U, - "Number_psee_plur": Number_psee_plur, # U, - "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U, - "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U, - "Number_pauc": Number_pauc, # U20 - "Number_grpa": Number_grpa, # U20 - "Number_grpl": Number_grpl, # U20 - "Number_inv": Number_inv, # U20 - "NumForm_digit": NumForm_digit, # cz, sl, U, - "NumForm_roman": NumForm_roman, # cz, sl, U, - "NumForm_word": NumForm_word, # cz, sl, U, - "NumValue_one": NumValue_one, # cz, U, - "NumValue_two": NumValue_two, # cz, U, - "NumValue_three": NumValue_three, # cz, U, - "PartForm_pres": PartForm_pres, # fi, - "PartForm_past": PartForm_past, # fi, - "PartForm_agt": PartForm_agt, # fi, - "PartForm_neg": PartForm_neg, # fi, - "PartType_mod": PartType_mod, # U, - "PartType_emp": PartType_emp, # U, - "PartType_res": PartType_res, # U, - "PartType_inf": PartType_inf, # U, - "PartType_vbp": PartType_vbp, # U, - "Person_abs_one": Person_abs_one, # bq, U, - "Person_abs_two": Person_abs_two, # bq, U, - "Person_abs_three": Person_abs_three, # bq, U, - "Person_dat_one": Person_dat_one, # bq, U, - "Person_dat_two": Person_dat_two, # bq, U, - "Person_dat_three": Person_dat_three, # bq, U, - "Person_erg_one": Person_erg_one, # bq, U, - "Person_erg_two": Person_erg_two, # bq, U, - "Person_erg_three": Person_erg_three, # bq, U, - "Person_psor_one": Person_psor_one, # fi, U, - 
"Person_psor_two": Person_psor_two, # fi, U, - "Person_psor_three": Person_psor_three, # fi, U, - "Person_zero": Person_zero, # U20 - "Person_four": Person_four, # U20 - "Polite_inf": Polite_inf, # bq, U, - "Polite_pol": Polite_pol, # bq, U, - "Polite_abs_inf": Polite_abs_inf, # bq, U, - "Polite_abs_pol": Polite_abs_pol, # bq, U, - "Polite_erg_inf": Polite_erg_inf, # bq, U, - "Polite_erg_pol": Polite_erg_pol, # bq, U, - "Polite_dat_inf": Polite_dat_inf, # bq, U, - "Polite_dat_pol": Polite_dat_pol, # bq, U, - "Polite_infm": Polite_infm, # U20 - "Polite_form": Polite_form, # U20 - "Polite_form_elev": Polite_form_elev, # U20 - "Polite_form_humb": Polite_form_humb, # U20 - "Prefix_yes": Prefix_yes, # U, - "PrepCase_npr": PrepCase_npr, # cz, - "PrepCase_pre": PrepCase_pre, # U, - "PunctSide_ini": PunctSide_ini, # U, - "PunctSide_fin": PunctSide_fin, # U, - "PunctType_peri": PunctType_peri, # U, - "PunctType_qest": PunctType_qest, # U, - "PunctType_excl": PunctType_excl, # U, - "PunctType_quot": PunctType_quot, # U, - "PunctType_brck": PunctType_brck, # U, - "PunctType_comm": PunctType_comm, # U, - "PunctType_colo": PunctType_colo, # U, - "PunctType_semi": PunctType_semi, # U, - "PunctType_dash": PunctType_dash, # U, - "Style_arch": Style_arch, # cz, fi, U, - "Style_rare": Style_rare, # cz, fi, U, - "Style_poet": Style_poet, # cz, U, - "Style_norm": Style_norm, # cz, U, - "Style_coll": Style_coll, # cz, U, - "Style_vrnc": Style_vrnc, # cz, U, - "Style_sing": Style_sing, # cz, U, - "Style_expr": Style_expr, # cz, U, - "Style_derg": Style_derg, # cz, U, - "Style_vulg": Style_vulg, # cz, U, - "Style_yes": Style_yes, # fi, U, - "StyleVariant_styleShort": StyleVariant_styleShort, # cz, - "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl, - "VerbType_aux": VerbType_aux, # U, - "VerbType_cop": VerbType_cop, # U, - "VerbType_mod": VerbType_mod, # U, - "VerbType_light": VerbType_light, # U, + "DEPRECATED001": DEPRECATED001, + "DEPRECATED002": DEPRECATED002, + 
"DEPRECATED003": DEPRECATED003, + "DEPRECATED004": DEPRECATED004, + "DEPRECATED005": DEPRECATED005, + "DEPRECATED006": DEPRECATED006, + "DEPRECATED007": DEPRECATED007, + "DEPRECATED008": DEPRECATED008, + "DEPRECATED009": DEPRECATED009, + "DEPRECATED010": DEPRECATED010, + "DEPRECATED011": DEPRECATED011, + "DEPRECATED012": DEPRECATED012, + "DEPRECATED013": DEPRECATED013, + "DEPRECATED014": DEPRECATED014, + "DEPRECATED015": DEPRECATED015, + "DEPRECATED016": DEPRECATED016, + "DEPRECATED017": DEPRECATED017, + "DEPRECATED018": DEPRECATED018, + "DEPRECATED019": DEPRECATED019, + "DEPRECATED020": DEPRECATED020, + "DEPRECATED021": DEPRECATED021, + "DEPRECATED022": DEPRECATED022, + "DEPRECATED023": DEPRECATED023, + "DEPRECATED024": DEPRECATED024, + "DEPRECATED025": DEPRECATED025, + "DEPRECATED026": DEPRECATED026, + "DEPRECATED027": DEPRECATED027, + "DEPRECATED028": DEPRECATED028, + "DEPRECATED029": DEPRECATED029, + "DEPRECATED030": DEPRECATED030, + "DEPRECATED031": DEPRECATED031, + "DEPRECATED032": DEPRECATED032, + "DEPRECATED033": DEPRECATED033, + "DEPRECATED034": DEPRECATED034, + "DEPRECATED035": DEPRECATED035, + "DEPRECATED036": DEPRECATED036, + "DEPRECATED037": DEPRECATED037, + "DEPRECATED038": DEPRECATED038, + "DEPRECATED039": DEPRECATED039, + "DEPRECATED040": DEPRECATED040, + "DEPRECATED041": DEPRECATED041, + "DEPRECATED042": DEPRECATED042, + "DEPRECATED043": DEPRECATED043, + "DEPRECATED044": DEPRECATED044, + "DEPRECATED045": DEPRECATED045, + "DEPRECATED046": DEPRECATED046, + "DEPRECATED047": DEPRECATED047, + "DEPRECATED048": DEPRECATED048, + "DEPRECATED049": DEPRECATED049, + "DEPRECATED050": DEPRECATED050, + "DEPRECATED051": DEPRECATED051, + "DEPRECATED052": DEPRECATED052, + "DEPRECATED053": DEPRECATED053, + "DEPRECATED054": DEPRECATED054, + "DEPRECATED055": DEPRECATED055, + "DEPRECATED056": DEPRECATED056, + "DEPRECATED057": DEPRECATED057, + "DEPRECATED058": DEPRECATED058, + "DEPRECATED059": DEPRECATED059, + "DEPRECATED060": DEPRECATED060, + "DEPRECATED061": 
DEPRECATED061, + "DEPRECATED062": DEPRECATED062, + "DEPRECATED063": DEPRECATED063, + "DEPRECATED064": DEPRECATED064, + "DEPRECATED065": DEPRECATED065, + "DEPRECATED066": DEPRECATED066, + "DEPRECATED067": DEPRECATED067, + "DEPRECATED068": DEPRECATED068, + "DEPRECATED069": DEPRECATED069, + "DEPRECATED070": DEPRECATED070, + "DEPRECATED071": DEPRECATED071, + "DEPRECATED072": DEPRECATED072, + "DEPRECATED073": DEPRECATED073, + "DEPRECATED074": DEPRECATED074, + "DEPRECATED075": DEPRECATED075, + "DEPRECATED076": DEPRECATED076, + "DEPRECATED077": DEPRECATED077, + "DEPRECATED078": DEPRECATED078, + "DEPRECATED079": DEPRECATED079, + "DEPRECATED080": DEPRECATED080, + "DEPRECATED081": DEPRECATED081, + "DEPRECATED082": DEPRECATED082, + "DEPRECATED083": DEPRECATED083, + "DEPRECATED084": DEPRECATED084, + "DEPRECATED085": DEPRECATED085, + "DEPRECATED086": DEPRECATED086, + "DEPRECATED087": DEPRECATED087, + "DEPRECATED088": DEPRECATED088, + "DEPRECATED089": DEPRECATED089, + "DEPRECATED090": DEPRECATED090, + "DEPRECATED091": DEPRECATED091, + "DEPRECATED092": DEPRECATED092, + "DEPRECATED093": DEPRECATED093, + "DEPRECATED094": DEPRECATED094, + "DEPRECATED095": DEPRECATED095, + "DEPRECATED096": DEPRECATED096, + "DEPRECATED097": DEPRECATED097, + "DEPRECATED098": DEPRECATED098, + "DEPRECATED099": DEPRECATED099, + "DEPRECATED100": DEPRECATED100, + "DEPRECATED101": DEPRECATED101, + "DEPRECATED102": DEPRECATED102, + "DEPRECATED103": DEPRECATED103, + "DEPRECATED104": DEPRECATED104, + "DEPRECATED105": DEPRECATED105, + "DEPRECATED106": DEPRECATED106, + "DEPRECATED107": DEPRECATED107, + "DEPRECATED108": DEPRECATED108, + "DEPRECATED109": DEPRECATED109, + "DEPRECATED110": DEPRECATED110, + "DEPRECATED111": DEPRECATED111, + "DEPRECATED112": DEPRECATED112, + "DEPRECATED113": DEPRECATED113, + "DEPRECATED114": DEPRECATED114, + "DEPRECATED115": DEPRECATED115, + "DEPRECATED116": DEPRECATED116, + "DEPRECATED117": DEPRECATED117, + "DEPRECATED118": DEPRECATED118, + "DEPRECATED119": DEPRECATED119, + 
"DEPRECATED120": DEPRECATED120, + "DEPRECATED121": DEPRECATED121, + "DEPRECATED122": DEPRECATED122, + "DEPRECATED123": DEPRECATED123, + "DEPRECATED124": DEPRECATED124, + "DEPRECATED125": DEPRECATED125, + "DEPRECATED126": DEPRECATED126, + "DEPRECATED127": DEPRECATED127, + "DEPRECATED128": DEPRECATED128, + "DEPRECATED129": DEPRECATED129, + "DEPRECATED130": DEPRECATED130, + "DEPRECATED131": DEPRECATED131, + "DEPRECATED132": DEPRECATED132, + "DEPRECATED133": DEPRECATED133, + "DEPRECATED134": DEPRECATED134, + "DEPRECATED135": DEPRECATED135, + "DEPRECATED136": DEPRECATED136, + "DEPRECATED137": DEPRECATED137, + "DEPRECATED138": DEPRECATED138, + "DEPRECATED139": DEPRECATED139, + "DEPRECATED140": DEPRECATED140, + "DEPRECATED141": DEPRECATED141, + "DEPRECATED142": DEPRECATED142, + "DEPRECATED143": DEPRECATED143, + "DEPRECATED144": DEPRECATED144, + "DEPRECATED145": DEPRECATED145, + "DEPRECATED146": DEPRECATED146, + "DEPRECATED147": DEPRECATED147, + "DEPRECATED148": DEPRECATED148, + "DEPRECATED149": DEPRECATED149, + "DEPRECATED150": DEPRECATED150, + "DEPRECATED151": DEPRECATED151, + "DEPRECATED152": DEPRECATED152, + "DEPRECATED153": DEPRECATED153, + "DEPRECATED154": DEPRECATED154, + "DEPRECATED155": DEPRECATED155, + "DEPRECATED156": DEPRECATED156, + "DEPRECATED157": DEPRECATED157, + "DEPRECATED158": DEPRECATED158, + "DEPRECATED159": DEPRECATED159, + "DEPRECATED160": DEPRECATED160, + "DEPRECATED161": DEPRECATED161, + "DEPRECATED162": DEPRECATED162, + "DEPRECATED163": DEPRECATED163, + "DEPRECATED164": DEPRECATED164, + "DEPRECATED165": DEPRECATED165, + "DEPRECATED166": DEPRECATED166, + "DEPRECATED167": DEPRECATED167, + "DEPRECATED168": DEPRECATED168, + "DEPRECATED169": DEPRECATED169, + "DEPRECATED170": DEPRECATED170, + "DEPRECATED171": DEPRECATED171, + "DEPRECATED172": DEPRECATED172, + "DEPRECATED173": DEPRECATED173, + "DEPRECATED174": DEPRECATED174, + "DEPRECATED175": DEPRECATED175, + "DEPRECATED176": DEPRECATED176, + "DEPRECATED177": DEPRECATED177, + "DEPRECATED178": 
DEPRECATED178, + "DEPRECATED179": DEPRECATED179, + "DEPRECATED180": DEPRECATED180, + "DEPRECATED181": DEPRECATED181, + "DEPRECATED182": DEPRECATED182, + "DEPRECATED183": DEPRECATED183, + "DEPRECATED184": DEPRECATED184, + "DEPRECATED185": DEPRECATED185, + "DEPRECATED186": DEPRECATED186, + "DEPRECATED187": DEPRECATED187, + "DEPRECATED188": DEPRECATED188, + "DEPRECATED189": DEPRECATED189, + "DEPRECATED190": DEPRECATED190, + "DEPRECATED191": DEPRECATED191, + "DEPRECATED192": DEPRECATED192, + "DEPRECATED193": DEPRECATED193, + "DEPRECATED194": DEPRECATED194, + "DEPRECATED195": DEPRECATED195, + "DEPRECATED196": DEPRECATED196, + "DEPRECATED197": DEPRECATED197, + "DEPRECATED198": DEPRECATED198, + "DEPRECATED199": DEPRECATED199, + "DEPRECATED200": DEPRECATED200, + "DEPRECATED201": DEPRECATED201, + "DEPRECATED202": DEPRECATED202, + "DEPRECATED203": DEPRECATED203, + "DEPRECATED204": DEPRECATED204, + "DEPRECATED205": DEPRECATED205, + "DEPRECATED206": DEPRECATED206, + "DEPRECATED207": DEPRECATED207, + "DEPRECATED208": DEPRECATED208, + "DEPRECATED209": DEPRECATED209, + "DEPRECATED210": DEPRECATED210, + "DEPRECATED211": DEPRECATED211, + "DEPRECATED212": DEPRECATED212, + "DEPRECATED213": DEPRECATED213, + "DEPRECATED214": DEPRECATED214, + "DEPRECATED215": DEPRECATED215, + "DEPRECATED216": DEPRECATED216, + "DEPRECATED217": DEPRECATED217, + "DEPRECATED218": DEPRECATED218, + "DEPRECATED219": DEPRECATED219, + "DEPRECATED220": DEPRECATED220, + "DEPRECATED221": DEPRECATED221, + "DEPRECATED222": DEPRECATED222, + "DEPRECATED223": DEPRECATED223, + "DEPRECATED224": DEPRECATED224, + "DEPRECATED225": DEPRECATED225, + "DEPRECATED226": DEPRECATED226, + "DEPRECATED227": DEPRECATED227, + "DEPRECATED228": DEPRECATED228, + "DEPRECATED229": DEPRECATED229, + "DEPRECATED230": DEPRECATED230, + "DEPRECATED231": DEPRECATED231, + "DEPRECATED232": DEPRECATED232, + "DEPRECATED233": DEPRECATED233, + "DEPRECATED234": DEPRECATED234, + "DEPRECATED235": DEPRECATED235, + "DEPRECATED236": DEPRECATED236, + 
"DEPRECATED237": DEPRECATED237, + "DEPRECATED238": DEPRECATED238, + "DEPRECATED239": DEPRECATED239, + "DEPRECATED240": DEPRECATED240, + "DEPRECATED241": DEPRECATED241, + "DEPRECATED242": DEPRECATED242, + "DEPRECATED243": DEPRECATED243, + "DEPRECATED244": DEPRECATED244, + "DEPRECATED245": DEPRECATED245, + "DEPRECATED246": DEPRECATED246, + "DEPRECATED247": DEPRECATED247, + "DEPRECATED248": DEPRECATED248, + "DEPRECATED249": DEPRECATED249, + "DEPRECATED250": DEPRECATED250, + "DEPRECATED251": DEPRECATED251, + "DEPRECATED252": DEPRECATED252, + "DEPRECATED253": DEPRECATED253, + "DEPRECATED254": DEPRECATED254, + "DEPRECATED255": DEPRECATED255, + "DEPRECATED256": DEPRECATED256, + "DEPRECATED257": DEPRECATED257, + "DEPRECATED258": DEPRECATED258, + "DEPRECATED259": DEPRECATED259, + "DEPRECATED260": DEPRECATED260, + "DEPRECATED261": DEPRECATED261, + "DEPRECATED262": DEPRECATED262, + "DEPRECATED263": DEPRECATED263, + "DEPRECATED264": DEPRECATED264, + "DEPRECATED265": DEPRECATED265, + "DEPRECATED266": DEPRECATED266, + "DEPRECATED267": DEPRECATED267, + "DEPRECATED268": DEPRECATED268, + "DEPRECATED269": DEPRECATED269, + "DEPRECATED270": DEPRECATED270, + "DEPRECATED271": DEPRECATED271, + "DEPRECATED272": DEPRECATED272, + "DEPRECATED273": DEPRECATED273, + "DEPRECATED274": DEPRECATED274, + "DEPRECATED275": DEPRECATED275, + "DEPRECATED276": DEPRECATED276, "PERSON": PERSON, "NORP": NORP, diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 67ebc06d6..82fb549ba 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -9,22 +9,52 @@ def i_has(en_tokenizer): return doc -def test_token_morph_id(i_has): - assert i_has[0].morph.id - assert i_has[1].morph.id != 0 - assert i_has[0].morph.id != i_has[1].morph.id +def test_token_morph_eq(i_has): + assert i_has[0].morph is not i_has[0].morph + assert i_has[0].morph == i_has[0].morph + assert i_has[0].morph != i_has[1].morph + + +def test_token_morph_key(i_has): + 
assert i_has[0].morph.key != 0 + assert i_has[1].morph.key != 0 + assert i_has[0].morph.key == i_has[0].morph.key + assert i_has[0].morph.key != i_has[1].morph.key def test_morph_props(i_has): - assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"] - assert i_has[0].morph.pron_type_ == "PronType_prs" - assert i_has[1].morph.pron_type == 0 + assert i_has[0].morph.get("PronType") == ["PronType=prs"] + assert i_has[1].morph.get("PronType") == [] def test_morph_iter(i_has): - assert list(i_has[0].morph) == ["PronType_prs"] - assert list(i_has[1].morph) == ["Number_sing", "Person_three", "VerbForm_fin"] + assert set(i_has[0].morph) == set(["PronType=prs"]) + assert set(i_has[1].morph) == set(["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"]) def test_morph_get(i_has): - assert i_has[0].morph.get("pron_type") == "PronType_prs" + assert i_has[0].morph.get("PronType") == ["PronType=prs"] + + +def test_morph_set(i_has): + assert i_has[0].morph.get("PronType") == ["PronType=prs"] + # set by string + i_has[0].morph_ = "PronType=unk" + assert i_has[0].morph.get("PronType") == ["PronType=unk"] + # set by string, fields are alphabetized + i_has[0].morph_ = "PronType=123|NounType=unk" + assert i_has[0].morph_ == "NounType=unk|PronType=123" + # set by dict + i_has[0].morph_ = {"AType": "123", "BType": "unk", "POS": "ADJ"} + assert i_has[0].morph_ == "AType=123|BType=unk|POS=ADJ" + # set by string with multiple values, fields and values are alphabetized + i_has[0].morph_ = "BType=c|AType=b,a" + assert i_has[0].morph_ == "AType=a,b|BType=c" + # set by dict with multiple values, fields and values are alphabetized + i_has[0].morph_ = {"AType": "b,a", "BType": "c"} + assert i_has[0].morph_ == "AType=a,b|BType=c" + + +def test_morph_str(i_has): + assert str(i_has[0].morph) == "PronType=prs" + assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin" diff --git a/spacy/tests/morphology/test_morph_converters.py 
b/spacy/tests/morphology/test_morph_converters.py new file mode 100644 index 000000000..3bff4f924 --- /dev/null +++ b/spacy/tests/morphology/test_morph_converters.py @@ -0,0 +1,26 @@ +import pytest +from spacy.morphology import Morphology + + +def test_feats_converters(): + feats = "Case=dat,gen|Number=sing" + feats_dict = {"Case": "dat,gen", "Number": "sing"} + feats_list = feats.split(Morphology.FEATURE_SEP) + + # simple conversions + assert Morphology.list_to_feats(feats_list) == feats + assert Morphology.dict_to_feats(feats_dict) == feats + assert Morphology.feats_to_dict(feats) == feats_dict + + # roundtrips + assert Morphology.dict_to_feats(Morphology.feats_to_dict(feats)) == feats + assert Morphology.feats_to_dict(Morphology.dict_to_feats(feats_dict)) == feats_dict + + # unsorted input is normalized + unsorted_feats = "Number=sing|Case=gen,dat" + unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"} + unsorted_feats_list = feats.split(Morphology.FEATURE_SEP) + assert Morphology.feats_to_dict(unsorted_feats) == feats_dict + assert Morphology.dict_to_feats(unsorted_feats_dict) == feats + assert Morphology.list_to_feats(unsorted_feats_list) == feats + assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index 4cf6b1206..0d8d7dea9 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -16,32 +16,30 @@ def test_init(morphology): def test_add_morphology_with_string_names(morphology): - morphology.add({"Case_gen", "Number_sing"}) + morphology.add({"Case": "gen", "Number": "sing"}) def test_add_morphology_with_int_ids(morphology): - morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")}) + morphology.strings.add("Case") + morphology.strings.add("gen") + morphology.strings.add("Number") + morphology.strings.add("sing") + 
morphology.add({get_string_id("Case"): get_string_id("gen"), get_string_id("Number"): get_string_id("sing")}) def test_add_morphology_with_mix_strings_and_ints(morphology): - morphology.add({get_string_id("PunctSide_ini"), "VerbType_aux"}) + morphology.strings.add("PunctSide") + morphology.strings.add("ini") + morphology.add({get_string_id("PunctSide"): get_string_id("ini"), "VerbType": "aux"}) def test_morphology_tags_hash_distinctly(morphology): - tag1 = morphology.add({"PunctSide_ini", "VerbType_aux"}) - tag2 = morphology.add({"Case_gen", "Number_sing"}) + tag1 = morphology.add({"PunctSide": "ini", "VerbType": "aux"}) + tag2 = morphology.add({"Case": "gen", "Number": "sing"}) assert tag1 != tag2 def test_morphology_tags_hash_independent_of_order(morphology): - tag1 = morphology.add({"Case_gen", "Number_sing"}) - tag2 = morphology.add({"Number_sing", "Case_gen"}) + tag1 = morphology.add({"Case": "gen", "Number": "sing"}) + tag2 = morphology.add({"Number": "sing", "Case": "gen"}) assert tag1 == tag2 - - -def test_update_morphology_tag(morphology): - tag1 = morphology.add({"Case_gen"}) - tag2 = morphology.update(tag1, {"Number_sing"}) - assert tag1 != tag2 - tag3 = morphology.add({"Number_sing", "Case_gen"}) - assert tag2 == tag3 diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index a3148aa90..bfca72853 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -2,7 +2,7 @@ import pytest import random from spacy.matcher import Matcher from spacy.attrs import IS_PUNCT, ORTH, LOWER -from spacy.symbols import POS, VERB, VerbForm_inf +from spacy.symbols import POS, VERB from spacy.vocab import Vocab from spacy.language import Language from spacy.lemmatizer import Lemmatizer @@ -164,7 +164,7 @@ def test_issue590(en_vocab): def test_issue595(): """Test lemmatization of base forms""" words = ["Do", "n't", "feed", "the", "dog"] - tag_map = {"VB": {POS: VERB, VerbForm_inf: 
True}} + tag_map = {"VB": {POS: VERB, "VerbForm": "inf"}} lookups = Lookups() lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) lookups.add_table("lemma_index", {"verb": {}}) diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index 7d81c3148..aaff951e5 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -8,7 +8,7 @@ from spacy.matcher import Matcher from spacy.tokenizer import Tokenizer from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups -from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part +from spacy.symbols import ORTH, LEMMA, POS, VERB def test_issue1061(): @@ -88,7 +88,7 @@ def test_issue1375(): def test_issue1387(): - tag_map = {"VBG": {POS: VERB, VerbForm_part: True}} + tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}} lookups = Lookups() lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index 88428709b..1aefa2b7c 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -2,5 +2,6 @@ from .doc import Doc from .token import Token from .span import Span from ._serialize import DocBin +from .morphanalysis import MorphAnalysis -__all__ = ["Doc", "Token", "Span", "DocBin"] +__all__ = ["Doc", "Token", "Span", "DocBin", "MorphAnalysis"] diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd index 22844454a..9510875c9 100644 --- a/spacy/tokens/morphanalysis.pxd +++ b/spacy/tokens/morphanalysis.pxd @@ -5,5 +5,5 @@ from ..structs cimport MorphAnalysisC cdef class MorphAnalysis: cdef readonly Vocab vocab - cdef hash_t key + cdef readonly hash_t key cdef MorphAnalysisC c diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index e09870741..ed987f4e4 100644 --- a/spacy/tokens/morphanalysis.pyx +++ 
b/spacy/tokens/morphanalysis.pyx @@ -1,15 +1,14 @@ from libc.string cimport memset +cimport numpy as np from ..vocab cimport Vocab from ..typedefs cimport hash_t, attr_t -from ..morphology cimport list_features, check_feature, get_field, tag_to_json - -from ..strings import get_string_id +from ..morphology cimport list_features, check_feature, get_by_field cdef class MorphAnalysis: """Control access to morphological features for a token.""" - def __init__(self, Vocab vocab, features=tuple()): + def __init__(self, Vocab vocab, features=dict()): self.vocab = vocab self.key = self.vocab.morphology.add(features) analysis = self.vocab.morphology.tags.get(self.key) @@ -33,7 +32,7 @@ cdef class MorphAnalysis: def __contains__(self, feature): """Test whether the morphological analysis contains some feature.""" - cdef attr_t feat_id = get_string_id(feature) + cdef attr_t feat_id = self.vocab.strings.as_int(feature) return check_feature(&self.c, feat_id) def __iter__(self): @@ -55,369 +54,28 @@ cdef class MorphAnalysis: def __hash__(self): return self.key - def get(self, unicode field): + def __eq__(self, other): + return self.key == other.key + + def __ne__(self, other): + return self.key != other.key + + def get(self, field): """Retrieve a feature by field.""" - cdef int field_id = self.vocab.morphology._feat_map.attr2field[field] - return self.vocab.strings[get_field(&self.c, field_id)] + cdef attr_t field_id = self.vocab.strings.as_int(field) + cdef np.ndarray results = get_by_field(&self.c, field_id) + return [self.vocab.strings[result] for result in results] def to_json(self): - """Produce a json serializable representation, which will be a list of - strings. + """Produce a json serializable representation as a UD FEATS-style + string. 
""" - return tag_to_json(&self.c) - - @property - def is_base_form(self): - raise NotImplementedError - - @property - def pos(self): - return self.c.pos - - @property - def pos_(self): - return self.vocab.strings[self.c.pos] - - property id: - def __get__(self): - return self.key - - property abbr: - def __get__(self): - return self.c.abbr - - property adp_type: - def __get__(self): - return self.c.adp_type - - property adv_type: - def __get__(self): - return self.c.adv_type - - property animacy: - def __get__(self): - return self.c.animacy - - property aspect: - def __get__(self): - return self.c.aspect - - property case: - def __get__(self): - return self.c.case - - property conj_type: - def __get__(self): - return self.c.conj_type - - property connegative: - def __get__(self): - return self.c.connegative - - property definite: - def __get__(self): - return self.c.definite - - property degree: - def __get__(self): - return self.c.degree - - property derivation: - def __get__(self): - return self.c.derivation - - property echo: - def __get__(self): - return self.c.echo - - property foreign: - def __get__(self): - return self.c.foreign - - property gender: - def __get__(self): - return self.c.gender - - property hyph: - def __get__(self): - return self.c.hyph - - property inf_form: - def __get__(self): - return self.c.inf_form - - property mood: - def __get__(self): - return self.c.mood - - property name_type: - def __get__(self): - return self.c.name_type - - property negative: - def __get__(self): - return self.c.negative - - property noun_type: - def __get__(self): - return self.c.noun_type - - property number: - def __get__(self): - return self.c.number - - property num_form: - def __get__(self): - return self.c.num_form - - property num_type: - def __get__(self): - return self.c.num_type - - property num_value: - def __get__(self): - return self.c.num_value - - property part_form: - def __get__(self): - return self.c.part_form - - property part_type: - def 
__get__(self): - return self.c.part_type - - property person: - def __get__(self): - return self.c.person - - property polite: - def __get__(self): - return self.c.polite - - property polarity: - def __get__(self): - return self.c.polarity - - property poss: - def __get__(self): - return self.c.poss - - property prefix: - def __get__(self): - return self.c.prefix - - property prep_case: - def __get__(self): - return self.c.prep_case - - property pron_type: - def __get__(self): - return self.c.pron_type - - property punct_side: - def __get__(self): - return self.c.punct_side - - property punct_type: - def __get__(self): - return self.c.punct_type - - property reflex: - def __get__(self): - return self.c.reflex - - property style: - def __get__(self): - return self.c.style - - property style_variant: - def __get__(self): - return self.c.style_variant - - property tense: - def __get__(self): - return self.c.tense - - property typo: - def __get__(self): - return self.c.typo - - property verb_form: - def __get__(self): - return self.c.verb_form - - property voice: - def __get__(self): - return self.c.voice - - property verb_type: - def __get__(self): - return self.c.verb_type - - property abbr_: - def __get__(self): - return self.vocab.strings[self.c.abbr] - - property adp_type_: - def __get__(self): - return self.vocab.strings[self.c.adp_type] - - property adv_type_: - def __get__(self): - return self.vocab.strings[self.c.adv_type] - - property animacy_: - def __get__(self): - return self.vocab.strings[self.c.animacy] - - property aspect_: - def __get__(self): - return self.vocab.strings[self.c.aspect] - - property case_: - def __get__(self): - return self.vocab.strings[self.c.case] - - property conj_type_: - def __get__(self): - return self.vocab.strings[self.c.conj_type] - - property connegative_: - def __get__(self): - return self.vocab.strings[self.c.connegative] - - property definite_: - def __get__(self): - return self.vocab.strings[self.c.definite] - - property 
degree_: - def __get__(self): - return self.vocab.strings[self.c.degree] - - property derivation_: - def __get__(self): - return self.vocab.strings[self.c.derivation] - - property echo_: - def __get__(self): - return self.vocab.strings[self.c.echo] - - property foreign_: - def __get__(self): - return self.vocab.strings[self.c.foreign] - - property gender_: - def __get__(self): - return self.vocab.strings[self.c.gender] - - property hyph_: - def __get__(self): - return self.vocab.strings[self.c.hyph] - - property inf_form_: - def __get__(self): - return self.vocab.strings[self.c.inf_form] - - property name_type_: - def __get__(self): - return self.vocab.strings[self.c.name_type] - - property negative_: - def __get__(self): - return self.vocab.strings[self.c.negative] - - property mood_: - def __get__(self): - return self.vocab.strings[self.c.mood] - - property number_: - def __get__(self): - return self.vocab.strings[self.c.number] - - property num_form_: - def __get__(self): - return self.vocab.strings[self.c.num_form] - - property num_type_: - def __get__(self): - return self.vocab.strings[self.c.num_type] - - property num_value_: - def __get__(self): - return self.vocab.strings[self.c.num_value] - - property part_form_: - def __get__(self): - return self.vocab.strings[self.c.part_form] - - property part_type_: - def __get__(self): - return self.vocab.strings[self.c.part_type] - - property person_: - def __get__(self): - return self.vocab.strings[self.c.person] - - property polite_: - def __get__(self): - return self.vocab.strings[self.c.polite] - - property polarity_: - def __get__(self): - return self.vocab.strings[self.c.polarity] - - property poss_: - def __get__(self): - return self.vocab.strings[self.c.poss] - - property prefix_: - def __get__(self): - return self.vocab.strings[self.c.prefix] - - property prep_case_: - def __get__(self): - return self.vocab.strings[self.c.prep_case] - - property pron_type_: - def __get__(self): - return 
self.vocab.strings[self.c.pron_type] - - property punct_side_: - def __get__(self): - return self.vocab.strings[self.c.punct_side] - - property punct_type_: - def __get__(self): - return self.vocab.strings[self.c.punct_type] - - property reflex_: - def __get__(self): - return self.vocab.strings[self.c.reflex] - - property style_: - def __get__(self): - return self.vocab.strings[self.c.style] - - property style_variant_: - def __get__(self): - return self.vocab.strings[self.c.style_variant] - - property tense_: - def __get__(self): - return self.vocab.strings[self.c.tense] - - property typo_: - def __get__(self): - return self.vocab.strings[self.c.typo] - - property verb_form_: - def __get__(self): - return self.vocab.strings[self.c.verb_form] - - property voice_: - def __get__(self): - return self.vocab.strings[self.c.voice] - - property verb_type_: - def __get__(self): - return self.vocab.strings[self.c.verb_type] + morph_string = self.vocab.strings[self.c.key] + if morph_string == self.vocab.morphology.EMPTY_MORPH: + return "" + return morph_string + + def to_dict(self): + """Produce a dict representation. 
+ """ + return self.vocab.morphology.feats_to_dict(self.to_json()) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8e6290187..b159fffc1 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -217,6 +217,14 @@ cdef class Token: def morph(self): return MorphAnalysis.from_id(self.vocab, self.c.morph) + property morph_: + def __get__(self): + return str(MorphAnalysis.from_id(self.vocab, self.c.morph)) + + def __set__(self, features): + cdef hash_t key = self.vocab.morphology.add(features) + self.c.morph = key + @property def lex_id(self): """RETURNS (int): Sequential ID of the token's lexical type.""" From 06b251dd1e5aa5fa7c6025d11448ccea3b875d91 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 28 Jan 2020 11:36:29 +0100 Subject: [PATCH 030/187] Add support for pos/morphs/lemmas in training data (#4941) Add support for pos/morphs/lemmas throughout `GoldParse`, `Example`, and `docs_to_json()`. --- spacy/gold.pxd | 6 +- spacy/gold.pyx | 135 +++++++++++++++++++++++---------------- spacy/tests/test_gold.py | 51 +++++++++++---- 3 files changed, 124 insertions(+), 68 deletions(-) diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 5f0b49c9f..49dba16df 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -25,6 +25,7 @@ cdef class GoldParse: cdef public int loss cdef public list words cdef public list tags + cdef public list pos cdef public list morphs cdef public list lemmas cdef public list sent_starts @@ -44,11 +45,12 @@ cdef class TokenAnnotation: cdef public list ids cdef public list words cdef public list tags + cdef public list pos + cdef public list morphs + cdef public list lemmas cdef public list heads cdef public list deps cdef public list entities - cdef public list morphs - cdef public list lemmas cdef public list sent_starts cdef public list brackets diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 0dfa32c84..eca801176 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -485,11 +485,12 @@ def json_to_examples(doc): words = [] 
ids = [] tags = [] + pos = [] + morphs = [] + lemmas = [] heads = [] labels = [] ner = [] - morphs = [] - lemmas = [] sent_starts = [] brackets = [] for sent in paragraph["sentences"]: @@ -498,14 +499,15 @@ def json_to_examples(doc): words.append(token["orth"]) ids.append(token.get('id', sent_start_i + i)) tags.append(token.get('tag', "-")) + pos.append(token.get("pos", "")) + morphs.append(token.get("morph", "")) + lemmas.append(token.get("lemma", "")) heads.append(token.get("head", 0) + sent_start_i + i) labels.append(token.get("dep", "")) # Ensure ROOT label is case-insensitive if labels[-1].lower() == "root": labels[-1] = "ROOT" ner.append(token.get("ner", "-")) - morphs.append(token.get("morph", {})) - lemmas.append(token.get("lemma", "")) if i == 0: sent_starts.append(1) else: @@ -518,8 +520,9 @@ def json_to_examples(doc): for cat in paragraph.get("cats", {}): cats[cat["label"]] = cat["value"] example.set_token_annotation(ids=ids, words=words, tags=tags, - heads=heads, deps=labels, entities=ner, morphs=morphs, - lemmas=lemmas, sent_starts=sent_starts, brackets=brackets) + pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, + deps=labels, entities=ner, sent_starts=sent_starts, + brackets=brackets) example.set_doc_annotation(cats=cats) yield example @@ -632,17 +635,18 @@ def _consume_ent(tags): cdef class TokenAnnotation: - def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, - entities=None, morphs=None, lemmas=None, sent_starts=None, + def __init__(self, ids=None, words=None, tags=None, pos=None, morphs=None, + lemmas=None, heads=None, deps=None, entities=None, sent_starts=None, brackets=None): self.ids = ids if ids else [] self.words = words if words else [] self.tags = tags if tags else [] + self.pos = pos if pos else [] + self.morphs = morphs if morphs else [] + self.lemmas = lemmas if lemmas else [] self.heads = heads if heads else [] self.deps = deps if deps else [] self.entities = entities if entities else [] - self.morphs = morphs 
if morphs else [] - self.lemmas = lemmas if lemmas else [] self.sent_starts = sent_starts if sent_starts else [] self.brackets = brackets if brackets else [] @@ -651,11 +655,12 @@ cdef class TokenAnnotation: return cls(ids=token_dict.get("ids", None), words=token_dict.get("words", None), tags=token_dict.get("tags", None), + pos=token_dict.get("pos", None), + morphs=token_dict.get("morphs", None), + lemmas=token_dict.get("lemmas", None), heads=token_dict.get("heads", None), deps=token_dict.get("deps", None), entities=token_dict.get("entities", None), - morphs=token_dict.get("morphs", None), - lemmas=token_dict.get("lemmas", None), sent_starts=token_dict.get("sent_starts", None), brackets=token_dict.get("brackets", None)) @@ -663,11 +668,12 @@ cdef class TokenAnnotation: return {"ids": self.ids, "words": self.words, "tags": self.tags, + "pos": self.pos, + "morphs": self.morphs, + "lemmas": self.lemmas, "heads": self.heads, "deps": self.deps, "entities": self.entities, - "morphs": self.morphs, - "lemmas": self.lemmas, "sent_starts": self.sent_starts, "brackets": self.brackets} @@ -680,6 +686,15 @@ cdef class TokenAnnotation: def get_tag(self, i): return self.tags[i] if i < len(self.tags) else "-" + def get_pos(self, i): + return self.pos[i] if i < len(self.pos) else "" + + def get_morph(self, i): + return self.morphs[i] if i < len(self.morphs) else "" + + def get_lemma(self, i): + return self.lemmas[i] if i < len(self.lemmas) else "" + def get_head(self, i): return self.heads[i] if i < len(self.heads) else i @@ -689,12 +704,6 @@ cdef class TokenAnnotation: def get_entity(self, i): return self.entities[i] if i < len(self.entities) else "-" - def get_morph(self, i): - return self.morphs[i] if i < len(self.morphs) else set() - - def get_lemma(self, i): - return self.lemmas[i] if i < len(self.lemmas) else "" - def get_sent_start(self, i): return self.sent_starts[i] if i < len(self.sent_starts) else None @@ -756,12 +765,12 @@ cdef class Example: self.goldparse = gold 
return self.goldparse - def set_token_annotation(self, ids=None, words=None, tags=None, heads=None, - deps=None, entities=None, morphs=None, lemmas=None, - sent_starts=None, brackets=None): + def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, + morphs=None, lemmas=None, heads=None, deps=None, + entities=None, sent_starts=None, brackets=None): self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, - heads=heads, deps=deps, entities=entities, - morphs=morphs, lemmas=lemmas, + pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, + deps=deps, entities=entities, sent_starts=sent_starts, brackets=brackets) def set_doc_annotation(self, cats=None, links=None): @@ -774,8 +783,8 @@ cdef class Example: """ Split the token annotations into multiple Examples based on sent_starts and return a list of the new Examples""" s_example = Example(doc=None, doc_annotation=self.doc_annotation) - s_ids, s_words, s_tags, s_heads = [], [], [], [] - s_deps, s_ents, s_morphs, s_lemmas, s_sent_starts = [], [], [], [], [] + s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] + s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] s_brackets = [] sent_start_i = 0 t = self.token_annotation @@ -783,31 +792,33 @@ cdef class Example: for i in range(len(t.words)): if i > 0 and t.sent_starts[i] == 1: s_example.set_token_annotation(ids=s_ids, - words=s_words, tags=s_tags, heads=s_heads, deps=s_deps, - entities=s_ents, morphs=s_morphs, lemmas=s_lemmas, - sent_starts=s_sent_starts, brackets=s_brackets) + words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, + lemmas=s_lemmas, heads=s_heads, deps=s_deps, + entities=s_ents, sent_starts=s_sent_starts, + brackets=s_brackets) split_examples.append(s_example) s_example = Example(doc=None, doc_annotation=self.doc_annotation) - s_ids, s_words, s_tags, s_heads = [], [], [], [] + s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] 
s_sent_starts, s_brackets = [], [] sent_start_i = i s_ids.append(t.get_id(i)) s_words.append(t.get_word(i)) s_tags.append(t.get_tag(i)) + s_pos.append(t.get_pos(i)) + s_morphs.append(t.get_morph(i)) + s_lemmas.append(t.get_lemma(i)) s_heads.append(t.get_head(i) - sent_start_i) s_deps.append(t.get_dep(i)) s_ents.append(t.get_entity(i)) - s_morphs.append(t.get_morph(i)) - s_lemmas.append(t.get_lemma(i)) s_sent_starts.append(t.get_sent_start(i)) s_brackets.extend((b[0] - sent_start_i, b[1] - sent_start_i, b[2]) for b in t.brackets if b[0] == i) i += 1 s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, - heads=s_heads, deps=s_deps, entities=s_ents, - morphs=s_morphs, lemmas=s_lemmas, sent_starts=s_sent_starts, + pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, + deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, brackets=s_brackets) split_examples.append(s_example) return split_examples @@ -911,11 +922,12 @@ cdef class GoldParse: def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): return cls(doc, words=token_annotation.words, tags=token_annotation.tags, + pos=token_annotation.pos, + morphs=token_annotation.morphs, + lemmas=token_annotation.lemmas, heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities, - morphs=token_annotation.morphs, - lemmas=token_annotation.lemmas, sent_starts=token_annotation.sent_starts, cats=doc_annotation.cats, links=doc_annotation.links, @@ -927,18 +939,25 @@ cdef class GoldParse: ids = list(range(len(self.words))) return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, - heads=self.heads, deps=self.labels, - entities=self.ner, morphs=self.morphs, - sent_starts=self.sent_starts, lemmas=self.lemmas) + pos=self.pos, morphs=self.morphs, + lemmas=self.lemmas, heads=self.heads, + deps=self.labels, entities=self.ner, + sent_starts=self.sent_starts) - def __init__(self, doc, words=None, tags=None, morphs=None, lemmas=None, - 
sent_starts=None, heads=None, deps=None, entities=None, - make_projective=False, cats=None, links=None): + def __init__(self, doc, words=None, tags=None, pos=None, morphs=None, + lemmas=None, heads=None, deps=None, entities=None, + sent_starts=None, make_projective=False, cats=None, + links=None): """Create a GoldParse. The fields will not be initialized if len(doc) is zero. doc (Doc): The document the annotations refer to. words (iterable): A sequence of unicode word strings. tags (iterable): A sequence of strings, representing tag annotations. + pos (iterable): A sequence of strings, representing UPOS annotations. + morphs (iterable): A sequence of strings, representing morph + annotations. + lemmas (iterable): A sequence of strings, representing lemma + annotations. heads (iterable): A sequence of integers, representing syntactic head offsets. deps (iterable): A sequence of strings, representing the syntactic @@ -978,14 +997,16 @@ cdef class GoldParse: words = [token.text for token in doc] if not tags: tags = [None for _ in words] - if not heads: - heads = [None for _ in words] - if not deps: - deps = [None for _ in words] + if not pos: + pos = [None for _ in words] if not morphs: morphs = [None for _ in words] if not lemmas: lemmas = [None for _ in words] + if not heads: + heads = [None for _ in words] + if not deps: + deps = [None for _ in words] if not sent_starts: sent_starts = [None for _ in words] if entities is None: @@ -1010,11 +1031,12 @@ cdef class GoldParse: self.words = [None] * len(doc) self.tags = [None] * len(doc) + self.pos = [None] * len(doc) + self.morphs = [None] * len(doc) + self.lemmas = [None] * len(doc) self.heads = [None] * len(doc) self.labels = [None] * len(doc) self.ner = [None] * len(doc) - self.morphs = [None] * len(doc) - self.lemmas = [None] * len(doc) self.sent_starts = [None] * len(doc) # This needs to be done before we align the words @@ -1034,24 +1056,26 @@ cdef class GoldParse: self.gold_to_cand = [(i if i >= 0 else None) for 
i in j2i] self.orig = TokenAnnotation(ids=list(range(len(words))), - words=words, tags=tags, heads=heads, deps=deps, - entities=entities, morphs=morphs, lemmas=lemmas, + words=words, tags=tags, pos=pos, morphs=morphs, + lemmas=lemmas, heads=heads, deps=deps, entities=entities, sent_starts=sent_starts, brackets=[]) for i, gold_i in enumerate(self.cand_to_gold): if doc[i].text.isspace(): self.words[i] = doc[i].text self.tags[i] = "_SP" + self.pos[i] = "SPACE" + self.morphs[i] = None + self.lemmas[i] = None self.heads[i] = None self.labels[i] = None self.ner[i] = None - self.morphs[i] = set() - self.lemmas[i] = None self.sent_starts[i] = 0 if gold_i is None: if i in i2j_multi: self.words[i] = words[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]] + self.pos[i] = pos[i2j_multi[i]] self.morphs[i] = morphs[i2j_multi[i]] self.lemmas[i] = lemmas[i2j_multi[i]] self.sent_starts[i] = sent_starts[i2j_multi[i]] @@ -1093,6 +1117,7 @@ cdef class GoldParse: else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] + self.pos[i] = pos[gold_i] self.morphs[i] = morphs[gold_i] self.lemmas[i] = lemmas[gold_i] self.sent_starts[i] = sent_starts[gold_i] @@ -1156,9 +1181,11 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"): json_sent = {"tokens": [], "brackets": []} for token in sent: json_token = {"id": token.i, "orth": token.text} - json_token["lemma"] = token.lemma_ if doc.is_tagged: json_token["tag"] = token.tag_ + json_token["pos"] = token.pos_ + json_token["morph"] = token.morph_ + json_token["lemma"] = token.lemma_ if doc.is_parsed: json_token["head"] = token.head.i-token.i json_token["dep"] = token.dep_ diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 46c54b879..7fe8aab73 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,12 +1,7 @@ from spacy.errors import AlignmentError -from spacy.gold import ( - biluo_tags_from_offsets, - offsets_from_biluo_tags, - Example, - DocAnnotation, -) -from spacy.gold import spans_from_biluo_tags, 
GoldParse, iob_to_biluo -from spacy.gold import GoldCorpus, docs_to_json, align +from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags +from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align +from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree from spacy.tokens import Doc @@ -20,6 +15,30 @@ import srsly def doc(): text = "Sarah's sister flew to Silicon Valley via London." tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] + pos = [ + "PROPN", + "PART", + "NOUN", + "VERB", + "ADP", + "PROPN", + "PROPN", + "ADP", + "PROPN", + "PUNCT", + ] + morphs = [ + "NounType=prop|Number=sing", + "Poss=yes", + "Number=sing", + "Tense=past|VerbForm=fin", + "", + "NounType=prop|Number=sing", + "NounType=prop|Number=sing", + "", + "NounType=prop|Number=sing", + "PunctType=peri", + ] # head of '.' is intentionally nonprojective for testing heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] deps = [ @@ -52,9 +71,11 @@ def doc(): doc = nlp(text) for i in range(len(tags)): doc[i].tag_ = tags[i] + doc[i].pos_ = pos[i] + doc[i].morph_ = morphs[i] + doc[i].lemma_ = lemmas[i] doc[i].dep_ = deps[i] doc[i].head = doc[heads[i]] - doc[i].lemma_ = lemmas[i] doc.ents = spans_from_biluo_tags(doc, biluo_tags) doc.cats = cats doc.is_tagged = True @@ -162,9 +183,11 @@ def test_roundtrip_docs_to_json(doc): nlp = English() text = doc.text tags = [t.tag_ for t in doc] + pos = [t.pos_ for t in doc] + morphs = [t.morph_ for t in doc] + lemmas = [t.lemma_ for t in doc] deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] - lemmas = [t.lemma_ for t in doc] biluo_tags = iob_to_biluo( [t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc] ) @@ -182,9 +205,11 @@ def test_roundtrip_docs_to_json(doc): assert len(doc) == goldcorpus.count_train() assert text == reloaded_example.text assert tags == goldparse.tags + assert pos == goldparse.pos 
+ assert morphs == goldparse.morphs + assert lemmas == goldparse.lemmas assert deps == goldparse.labels assert heads == goldparse.heads - assert lemmas == goldparse.lemmas assert biluo_tags == goldparse.ner assert "TRAVEL" in goldparse.cats assert "BAKING" in goldparse.cats @@ -203,9 +228,11 @@ def test_roundtrip_docs_to_json(doc): assert len(doc) == goldcorpus.count_train() assert text == reloaded_example.text assert tags == goldparse.tags + assert pos == goldparse.pos + assert morphs == goldparse.morphs + assert lemmas == goldparse.lemmas assert deps == goldparse.labels assert heads == goldparse.heads - assert lemmas == goldparse.lemmas assert biluo_tags == goldparse.ner assert "TRAVEL" in goldparse.cats assert "BAKING" in goldparse.cats From 569cc9898200772cc894b1663eb03e9fd017c1c9 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 29 Jan 2020 17:06:46 +0100 Subject: [PATCH 031/187] Update spaCy for thinc 8.0.0 (#4920) * Add load_from_config function * Add train_from_config script * Merge configs and expose via spacy.config * Fix script * Suggest create_evaluation_callback * Hard-code for NER * Fix errors * Register command * Add TODO * Update train-from-config todos * Fix imports * Allow delayed setting of parser model nr_class * Get train-from-config working * Tidy up and fix scores and printing * Hide traceback if cancelled * Fix weighted score formatting * Fix score formatting * Make output_path optional * Add Tok2Vec component * Tidy up and add tok2vec_tensors * Add option to copy docs in nlp.update * Copy docs in nlp.update * Adjust nlp.update() for set_annotations * Don't shuffle pipes in nlp.update, decruft * Support set_annotations arg in component update * Support set_annotations in parser update * Add get_gradients method * Add get_gradients to parser * Update errors.py * Fix problems caused by merge * Add _link_components method in nlp * Add concept of 'listeners' and ControlledModel * Support optional attributes arg in ControlledModel * 
Try having tok2vec component in pipeline * Fix tok2vec component * Fix config * Fix tok2vec * Update for Example * Update for Example * Update config * Add eg2doc util * Update and add schemas/types * Update schemas * Fix nlp.update * Fix tagger * Remove hacks from train-from-config * Remove hard-coded config str * Calculate loss in tok2vec component * Tidy up and use function signatures instead of models * Support union types for registry models * Minor cleaning in Language.update * Make ControlledModel specifically Tok2VecListener * Fix train_from_config * Fix tok2vec * Tidy up * Add function for bilstm tok2vec * Fix type * Fix syntax * Fix pytorch optimizer * Add example configs * Update for thinc describe changes * Update for Thinc changes * Update for dropout/sgd changes * Update for dropout/sgd changes * Unhack gradient update * Work on refactoring _ml * Remove _ml.py module * WIP upgrade cli scripts for thinc * Move some _ml stuff to util * Import link_vectors from util * Update train_from_config * Import from util * Import from util * Temporarily add ml.component_models module * Move ml methods * Move typedefs * Update load vectors * Update gitignore * Move imports * Add PrecomputableAffine * Fix imports * Fix imports * Fix imports * Fix missing imports * Update CLI scripts * Update spacy.language * Add stubs for building the models * Update model definition * Update create_default_optimizer * Fix import * Fix comment * Update imports in tests * Update imports in spacy.cli * Fix import * fix obsolete thinc imports * update srsly pin * from thinc to ml_datasets for example data such as imdb * update ml_datasets pin * using STATE.vectors * small fix * fix Sentencizer.pipe * black formatting * rename Affine to Linear as in thinc * set validate explicitly to True * rename with_square_sequences to with_list2padded * rename with_flatten to with_list2array * chaining layernorm * small fixes * revert Optimizer import * build_nel_encoder with new thinc style *
fixes using model's get and set methods * Tok2Vec in component models, various fixes * fix up legacy tok2vec code * add model initialize calls * add in build_tagger_model * small fixes * setting model dims * fixes for ParserModel * various small fixes * initialize thinc Models * fixes * consistent naming of window_size * fixes, removing set_dropout * work around Iterable issue * remove legacy tok2vec * util fix * fix forward function of tok2vec listener * more fixes * trying to fix PrecomputableAffine (not successful yet) * alloc instead of allocate * add morphologizer * rename residual * rename fixes * Fix predict function * Update parser and parser model * fixing few more tests * Fix precomputable affine * Update component model * Update parser model * Move backprop padding to own function, for test * Update test * Fix p. affine * Update NEL * build_bow_text_classifier and extract_ngrams * Fix parser init * Fix test add label * add build_simple_cnn_text_classifier * Fix parser init * Set gpu off by default in example * Fix tok2vec listener * Fix parser model * Small fixes * small fix for PyTorchLSTM parameters * revert my_compounding hack (iterable fixed now) * fix biLSTM * Fix uniqued * PyTorchRNNWrapper fix * small fixes * use helper function to calculate cosine loss * small fixes for build_simple_cnn_text_classifier * putting dropout default at 0.0 to ensure the layer gets built * using thinc util's set_dropout_rate * moving layer normalization inside of maxout definition to optimize dropout * temp debugging in NEL * fixed NEL model by using init defaults !
* fixing after set_dropout_rate refactor * proper fix * fix test_update_doc after refactoring optimizers in thinc * Add CharacterEmbed layer * Construct tagger Model * Add missing import * Remove unused stuff * Work on textcat * fix test (again :)) after optimizer refactor * fixes to allow reading Tagger from_disk without overwriting dimensions * don't build the tok2vec prematurely * fix CharacterEmbed init * CharacterEmbed fixes * Fix CharacterEmbed architecture * fix imports * renames from latest thinc update * one more rename * add initialize calls where appropriate * fix parser initialization * Update Thinc version * Fix errors, auto-format and tidy up imports * Fix validation * fix if bias is cupy array * revert for now * ensure it's a numpy array before running bp in ParserStepModel * no reason to call require_gpu twice * use CupyOps.to_numpy instead of cupy directly * fix initialize of ParserModel * remove unnecessary import * fixes for CosineDistance * fix device renaming * use refactored loss functions (Thinc PR 251) * overfitting test for tagger * experimental settings for the tagger: avoid zero-init and subword normalization * clean up tagger overfitting test * use previous default value for nP * remove toy config * bringing layernorm back (had a bug - fixed in thinc) * revert setting nP explicitly * remove setting default in constructor * restore values as they used to be * add overfitting test for NER * add overfitting test for dep parser * add overfitting test for textcat * fixing init for linear (previously affine) * larger eps window for textcat * ensure doc is not None * Require newer thinc * Make float check vaguer * Slop the textcat overfit test more * Fix textcat test * Fix exclusive classes for textcat * fix after renaming of alloc methods * fixing renames and mandatory arguments (staticvectors WIP) * upgrade to thinc==8.0.0.dev3 * refer to vocab.vectors directly instead of its name * rename alpha to learn_rate * adding hashembed and
staticvectors dropout * upgrade to thinc 8.0.0.dev4 * add name back to avoid warning W020 * thinc dev4 * update srsly * using thinc 8.0.0a0 ! Co-authored-by: Matthew Honnibal Co-authored-by: Ines Montani --- .gitignore | 4 + bin/wiki_entity_linking/train_descriptions.py | 29 +- .../wikidata_train_entity_linker.py | 2 +- examples/deep_learning_keras.py | 5 +- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 63 ++ .../ptb-joint-pos-dep/defaults.cfg | 65 ++ examples/pipeline/multi_processing.py | 5 +- examples/training/pretrain_textcat.py | 24 +- examples/training/rehearsal.py | 2 +- examples/training/train_entity_linker.py | 7 +- examples/training/train_textcat.py | 5 +- requirements.txt | 7 +- setup.cfg | 6 +- spacy/__init__.py | 5 +- spacy/__main__.py | 2 + spacy/_ml.py | 982 ------------------ spacy/cli/__init__.py | 1 + spacy/cli/pretrain.py | 33 +- spacy/cli/profile.py | 4 +- spacy/cli/train.py | 8 +- spacy/cli/train_from_config.py | 445 ++++++++ spacy/compat.py | 7 +- spacy/language.py | 75 +- spacy/lexeme.pyx | 2 +- spacy/ml/__init__.py | 2 - spacy/ml/_character_embed.py | 52 + spacy/ml/_layers.py | 165 +++ spacy/ml/_legacy_tok2vec.py | 129 --- spacy/ml/_wire.py | 41 - spacy/ml/common.py | 21 - spacy/ml/component_models.py | 222 ++++ spacy/ml/extract_ngrams.py | 39 + spacy/ml/tok2vec.py | 92 +- spacy/pipeline/__init__.py | 2 + spacy/pipeline/hooks.py | 14 +- spacy/pipeline/morphologizer.pyx | 25 +- spacy/pipeline/pipes.pyx | 324 ++++-- spacy/pipeline/tok2vec.py | 188 ++++ spacy/syntax/_beam_utils.pxd | 2 +- spacy/syntax/_beam_utils.pyx | 2 +- spacy/syntax/_parser_model.pxd | 2 +- spacy/syntax/_parser_model.pyx | 228 ++-- spacy/syntax/arc_eager.pxd | 2 +- spacy/syntax/ner.pyx | 2 +- spacy/syntax/nn_parser.pxd | 2 - spacy/syntax/nn_parser.pyx | 139 ++- spacy/syntax/transition_system.pxd | 3 +- spacy/syntax/transition_system.pyx | 2 +- spacy/tests/parser/test_add_label.py | 10 +- spacy/tests/parser/test_ner.py | 34 +- spacy/tests/parser/test_neural_parser.py | 13 +- 
spacy/tests/parser/test_parse.py | 42 + spacy/tests/parser/test_preset_sbd.py | 6 +- spacy/tests/pipeline/test_tagger.py | 33 + spacy/tests/pipeline/test_textcat.py | 28 + spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tests/regression/test_issue3611.py | 2 +- spacy/tests/test_architectures.py | 4 +- spacy/tests/test_misc.py | 31 +- spacy/tests/test_tok2vec.py | 6 +- spacy/tests/vocab_vectors/test_vectors.py | 5 +- spacy/tokens/_retokenize.pyx | 2 +- spacy/tokens/_serialize.py | 2 +- spacy/tokens/doc.pyx | 2 +- spacy/tokens/span.pyx | 2 +- spacy/tokens/token.pyx | 2 +- spacy/typedefs.pxd | 2 + spacy/util.py | 88 +- spacy/vectors.pyx | 8 +- spacy/vocab.pyx | 4 +- 70 files changed, 2141 insertions(+), 1675 deletions(-) create mode 100644 examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg create mode 100644 examples/experiments/ptb-joint-pos-dep/defaults.cfg create mode 100644 spacy/cli/train_from_config.py create mode 100644 spacy/ml/_character_embed.py create mode 100644 spacy/ml/_layers.py delete mode 100644 spacy/ml/_legacy_tok2vec.py delete mode 100644 spacy/ml/_wire.py delete mode 100644 spacy/ml/common.py create mode 100644 spacy/ml/component_models.py create mode 100644 spacy/ml/extract_ngrams.py create mode 100644 spacy/pipeline/tok2vec.py diff --git a/.gitignore b/.gitignore index c4ad59fc7..a0af6d4d2 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,7 @@ __pycache__/ .env* .~env/ .venv +env3.6/ venv/ .dev .denv @@ -111,3 +112,6 @@ Desktop.ini # Pycharm project files *.idea + +# IPython +.ipynb_checkpoints/ diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index af08d6b8f..d98bba565 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -4,12 +4,12 @@ from random import shuffle import logging import numpy as np -from spacy._ml import zero_init, create_default_optimizer -from spacy.cli.pretrain import get_cossim_loss - 
-from thinc.v2v import Model +from thinc.model import Model from thinc.api import chain -from thinc.neural._classes.affine import Affine +from thinc.loss import CosineDistance +from thinc.layers import Linear + +from spacy.util import create_default_optimizer logger = logging.getLogger(__name__) @@ -34,6 +34,7 @@ class EntityEncoder: self.input_dim = input_dim self.desc_width = desc_width self.epochs = epochs + self.distance = CosineDistance(ignore_zeros=True, normalize=False) def apply_encoder(self, description_list): if self.encoder is None: @@ -132,21 +133,17 @@ class EntityEncoder: def _build_network(self, orig_width, hidden_with): with Model.define_operators({">>": chain}): # very simple encoder-decoder model - self.encoder = Affine(hidden_with, orig_width) - self.model = self.encoder >> zero_init( - Affine(orig_width, hidden_with, drop_factor=0.0) - ) - self.sgd = create_default_optimizer(self.model.ops) + self.encoder = Linear(hidden_with, orig_width) + # TODO: removed the zero_init here - is oK? 
+ self.model = self.encoder >> Linear(orig_width, hidden_with) + self.sgd = create_default_optimizer() def _update(self, vectors): + truths = self.model.ops.asarray(vectors) predictions, bp_model = self.model.begin_update( - np.asarray(vectors), drop=self.DROP + truths, drop=self.DROP ) - loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors)) + d_scores, loss = self.distance(predictions, truths) bp_model(d_scores, sgd=self.sgd) return loss / len(vectors) - @staticmethod - def _get_loss(golds, scores): - loss, gradients = get_cossim_loss(scores, golds) - return loss, gradients diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py index 6b5f4c30d..f4a1b321d 100644 --- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py +++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py @@ -103,7 +103,7 @@ def main( logger.info("STEP 3: Creating and training an Entity Linking pipe") el_pipe = nlp.create_pipe( - name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name, + name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors, "labels_discard": labels_discard} ) el_pipe.set_kb(kb) diff --git a/examples/deep_learning_keras.py b/examples/deep_learning_keras.py index 049cc0be4..bf857b8b7 100644 --- a/examples/deep_learning_keras.py +++ b/examples/deep_learning_keras.py @@ -14,7 +14,7 @@ pip install keras==2.0.9 Compatible with: spaCy v2.0.0+ """ - +import ml_datasets import plac import random import pathlib @@ -24,7 +24,6 @@ from keras.models import Sequential, model_from_json from keras.layers import LSTM, Dense, Embedding, Bidirectional from keras.layers import TimeDistributed from keras.optimizers import Adam -import thinc.extra.datasets from spacy.compat import pickle import spacy @@ -224,7 +223,7 @@ def main( if model_dir is not None: model_dir = pathlib.Path(model_dir) if train_dir is None or dev_dir is None: - imdb_data = 
thinc.extra.datasets.imdb() + imdb_data = ml_datasets.imdb() if is_runtime: if dev_dir is None: dev_texts, dev_labels = zip(*imdb_data[1]) diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg new file mode 100644 index 000000000..8cd150868 --- /dev/null +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -0,0 +1,63 @@ +[training] +patience = 10000 +eval_frequency = 200 +dropout = 0.2 +init_tok2vec = null +vectors = null +max_epochs = 100 +orth_variant_level = 0.0 +gold_preproc = true +max_length = 0 +use_gpu = 0 +scores = ["tags_acc", "uas", "las"] +score_weights = {"las": 0.8, "tags_acc": 0.2} +limit = 0 + +[training.batch_size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.parser] +factory = "parser" + +[nlp.pipeline.tagger.model] +@architectures = "tagger_model.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.parser.model] +@architectures = "transition_based_parser.v1" +nr_feature_tokens = 8 +hidden_width = 64 +maxout_pieces = 3 + +[nlp.pipeline.parser.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "hash_embed_bilstm.v1" +pretrained_vectors = ${nlp:vectors} +width = 96 +depth = 4 +embed_size = 2000 diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg new file mode 100644 index 000000000..6735284a7 --- /dev/null +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -0,0 +1,65 @@ +[training] +patience = 10000 +eval_frequency = 200 
+dropout = 0.2 +init_tok2vec = null +vectors = null +max_epochs = 100 +orth_variant_level = 0.0 +gold_preproc = true +max_length = 0 +use_gpu = -1 +scores = ["tags_acc", "uas", "las"] +score_weights = {"las": 0.8, "tags_acc": 0.2} +limit = 0 + +[training.batch_size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.parser] +factory = "parser" + +[nlp.pipeline.tagger.model] +@architectures = "tagger_model.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.parser.model] +@architectures = "transition_based_parser.v1" +nr_feature_tokens = 8 +hidden_width = 64 +maxout_pieces = 3 + +[nlp.pipeline.parser.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "hash_embed_cnn.v1" +pretrained_vectors = ${nlp:vectors} +width = 96 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py index f0e437acf..e4aca7912 100644 --- a/examples/pipeline/multi_processing.py +++ b/examples/pipeline/multi_processing.py @@ -13,9 +13,10 @@ Prerequisites: pip install joblib from __future__ import print_function, unicode_literals from pathlib import Path + +import ml_datasets from joblib import Parallel, delayed from functools import partial -import thinc.extra.datasets import plac import spacy from spacy.util import minibatch @@ -35,7 +36,7 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10 output_dir.mkdir() # load and pre-process the IMBD dataset print("Loading IMDB data...") - data, _ = 
thinc.extra.datasets.imdb() + data, _ = ml_datasets.imdb() texts, _ = zip(*data[-limit:]) print("Processing texts...") partitions = minibatch(texts, size=batch_size) diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index 64f7002ef..f1cc2d3d2 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -16,16 +16,18 @@ the development labels, after all --- only the unlabelled text. import plac import tqdm import random + +import ml_datasets + import spacy -import thinc.extra.datasets from spacy.util import minibatch, use_gpu, compounding -from spacy._ml import Tok2Vec from spacy.pipeline import TextCategorizer +from spacy.ml.tok2vec import Tok2Vec import numpy def load_texts(limit=0): - train, dev = thinc.extra.datasets.imdb() + train, dev = ml_datasets.imdb() train_texts, train_labels = zip(*train) dev_texts, dev_labels = zip(*train) train_texts = list(train_texts) @@ -41,7 +43,7 @@ def load_texts(limit=0): def load_textcat_data(limit=0): """Load data from the IMDB dataset.""" # Partition off part of the train data for evaluation - train_data, eval_data = thinc.extra.datasets.imdb() + train_data, eval_data = ml_datasets.imdb() random.shuffle(train_data) train_data = train_data[-limit:] texts, labels = zip(*train_data) @@ -63,17 +65,15 @@ def prefer_gpu(): def build_textcat_model(tok2vec, nr_class, width): - from thinc.v2v import Model, Softmax, Maxout - from thinc.api import flatten_add_lengths, chain - from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool - from thinc.misc import Residual, LayerNorm - from spacy._ml import logistic, zero_init + from thinc.model import Model + from thinc.layers import Softmax, chain, reduce_mean + from thinc.layers import list2ragged with Model.define_operators({">>": chain}): model = ( tok2vec - >> flatten_add_lengths - >> Pooling(mean_pool) + >> list2ragged() + >> reduce_mean() >> Softmax(nr_class, width) ) model.tok2vec = tok2vec @@ -81,7 +81,7 
@@ def build_textcat_model(tok2vec, nr_class, width): def block_gradients(model): - from thinc.api import wrap + from thinc.api import wrap # TODO FIX def forward(X, drop=0.0): Y, _ = model.begin_update(X, drop=drop) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index b08ba9f9a..98459cf03 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -58,7 +58,7 @@ def main(model_name, unlabelled_loc): # yet, but I'm getting weird results from Adam. Try commenting out the # nlp.update(), and using Adam -- you'll find the models drift apart. # I guess Adam is losing precision, introducing gradient noise? - optimizer.alpha = 0.1 + optimizer.learn_rate = 0.1 optimizer.b1 = 0.0 optimizer.b2 = 0.0 diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index f44c3b9cc..6e19848d3 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -17,7 +17,7 @@ import plac import random from pathlib import Path -from spacy.symbols import PERSON +import srsly from spacy.vocab import Vocab import spacy @@ -68,7 +68,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): vocab = Vocab().from_disk(vocab_path) # create blank Language class with correct vocab nlp = spacy.blank("en", vocab=vocab) - nlp.vocab.vectors.name = "spacy_pretrained_vectors" + nlp.vocab.vectors.name = "nel_vectors" print("Created blank 'en' model with vocab from '%s'" % vocab_path) # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy. @@ -93,7 +93,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): nlp.add_pipe(entity_linker, last=True) # Convert the texts to docs to make sure we have doc.ents set for the training examples. - # Also ensure that the annotated examples correspond to known identifiers in the knowlege base. 
+ # Also ensure that the annotated examples correspond to known identifiers in the knowledge base. kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings() TRAIN_DOCS = [] for text, annotation in TRAIN_DATA: @@ -117,6 +117,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): with nlp.disable_pipes(*other_pipes): # only train entity linker # reset and initialize the weights randomly optimizer = nlp.begin_training() + for itn in range(n_iter): random.shuffle(TRAIN_DOCS) losses = {} diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 128773c0a..683ab1fc6 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -10,10 +10,11 @@ see the documentation: Compatible with: spaCy v2.0.0+ """ from __future__ import unicode_literals, print_function + +import ml_datasets import plac import random from pathlib import Path -import thinc.extra.datasets import spacy from spacy.util import minibatch, compounding @@ -115,7 +116,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None def load_data(limit=0, split=0.8): """Load data from the IMDB dataset.""" # Partition off part of the train data for evaluation - train_data, _ = thinc.extra.datasets.imdb() + train_data, _ = ml_datasets.imdb() random.shuffle(train_data) train_data = train_data[-limit:] texts, labels = zip(*train_data) diff --git a/requirements.txt b/requirements.txt index 79a05b2bd..bb6bf9804 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,20 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==7.4.0.dev0 +thinc==8.0.0a0 blis>=0.4.0,<0.5.0 +ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 -srsly>=0.1.0,<1.1.0 +srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 +# Optional dependencies +jsonschema>=2.6.0,<3.1.0 pydantic>=1.0.0,<2.0.0 # Development 
dependencies cython>=0.25 diff --git a/setup.cfg b/setup.cfg index 9516a3dda..9ea85e896 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,16 +35,16 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==7.4.0.dev0 + thinc==8.0.0a0 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==7.4.0.dev0 + thinc==8.0.0a0 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 - srsly>=0.1.0,<1.1.0 + srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 # Third-party dependencies setuptools diff --git a/spacy/__init__.py b/spacy/__init__.py index 49db0e3b5..4a311ec86 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed") warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # These are imported as part of the API -from thinc.neural.util import prefer_gpu, require_gpu +from thinc.util import prefer_gpu, require_gpu from . import pipeline from .cli.info import info as cli_info @@ -21,6 +21,9 @@ if sys.maxunicode == 65535: raise SystemError(Errors.E130) +config = registry + + def load(name, **overrides): depr_path = overrides.get("path") if depr_path not in (True, False, None): diff --git a/spacy/__main__.py b/spacy/__main__.py index 05e3d5e02..71ab1a91a 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -4,12 +4,14 @@ if __name__ == "__main__": from wasabi import msg from spacy.cli import download, link, info, package, train, pretrain, convert from spacy.cli import init_model, profile, evaluate, validate, debug_data + from spacy.cli import train_from_config_cli commands = { "download": download, "link": link, "info": info, "train": train, + "train-from-config": train_from_config_cli, "pretrain": pretrain, "debug-data": debug_data, "evaluate": evaluate, diff --git a/spacy/_ml.py b/spacy/_ml.py index 37cfff0b7..e69de29bb 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -1,982 +0,0 @@ -import numpy -from 
thinc.v2v import Model, Maxout, Softmax, Affine, ReLu -from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.t2v import Pooling, sum_pool, mean_pool -from thinc.i2v import HashEmbed -from thinc.misc import Residual, FeatureExtracter -from thinc.misc import LayerNorm as LN -from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.api import with_getitem, flatten_add_lengths -from thinc.api import uniqued, wrap, noop -from thinc.linear.linear import LinearModel -from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.util import get_array_module, copy_array -from thinc.neural.optimizers import Adam - -from thinc import describe -from thinc.describe import Dimension, Synapses, Biases, Gradient -from thinc.neural._classes.affine import _set_dimensions_if_needed -import thinc.extra.load_nlp - -from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE -from .errors import Errors, user_warning, Warnings -from . import util -from . import ml as new_ml -from .ml import _legacy_tok2vec - - -VECTORS_KEY = "spacy_pretrained_vectors" -# Backwards compatibility with <2.2.2 -USE_MODEL_REGISTRY_TOK2VEC = False - - -def cosine(vec1, vec2): - xp = get_array_module(vec1) - norm1 = xp.linalg.norm(vec1) - norm2 = xp.linalg.norm(vec2) - if norm1 == 0.0 or norm2 == 0.0: - return 0 - else: - return vec1.dot(vec2) / (norm1 * norm2) - - -def create_default_optimizer(ops, **cfg): - learn_rate = util.env_opt("learn_rate", 0.001) - beta1 = util.env_opt("optimizer_B1", 0.9) - beta2 = util.env_opt("optimizer_B2", 0.999) - eps = util.env_opt("optimizer_eps", 1e-8) - L2 = util.env_opt("L2_penalty", 1e-6) - max_grad_norm = util.env_opt("grad_norm_clip", 1.0) - optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps) - optimizer.max_grad_norm = max_grad_norm - optimizer.device = ops.device - return optimizer - - -@layerize -def _flatten_add_lengths(seqs, pad=0, drop=0.0): - ops = Model.ops - lengths = 
ops.asarray([len(seq) for seq in seqs], dtype="i") - - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths, pad=pad) - - X = ops.flatten(seqs, pad=pad) - return (X, lengths), finish_update - - -def _zero_init(model): - def _zero_init_impl(self, *args, **kwargs): - self.W.fill(0) - - model.on_init_hooks.append(_zero_init_impl) - if model.W is not None: - model.W.fill(0.0) - return model - - -def with_cpu(ops, model): - """Wrap a model that should run on CPU, transferring inputs and outputs - as necessary.""" - model.to_cpu() - - def with_cpu_forward(inputs, drop=0.0): - cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop) - gpu_outputs = _to_device(ops, cpu_outputs) - - def with_cpu_backprop(d_outputs, sgd=None): - cpu_d_outputs = _to_cpu(d_outputs) - return backprop(cpu_d_outputs, sgd=sgd) - - return gpu_outputs, with_cpu_backprop - - return wrap(with_cpu_forward, model) - - -def _to_cpu(X): - if isinstance(X, numpy.ndarray): - return X - elif isinstance(X, tuple): - return tuple([_to_cpu(x) for x in X]) - elif isinstance(X, list): - return [_to_cpu(x) for x in X] - elif hasattr(X, "get"): - return X.get() - else: - return X - - -def _to_device(ops, X): - if isinstance(X, tuple): - return tuple([_to_device(ops, x) for x in X]) - elif isinstance(X, list): - return [_to_device(ops, x) for x in X] - else: - return ops.asarray(X) - - -class extract_ngrams(Model): - def __init__(self, ngram_size, attr=LOWER): - Model.__init__(self) - self.ngram_size = ngram_size - self.attr = attr - - def begin_update(self, docs, drop=0.0): - batch_keys = [] - batch_vals = [] - for doc in docs: - unigrams = doc.to_array([self.attr]) - ngrams = [unigrams] - for n in range(2, self.ngram_size + 1): - ngrams.append(self.ops.ngrams(n, unigrams)) - keys = self.ops.xp.concatenate(ngrams) - keys, vals = self.ops.xp.unique(keys, return_counts=True) - batch_keys.append(keys) - batch_vals.append(vals) - # The dtype here matches what thinc is expecting -- which 
differs per - # platform (by int definition). This should be fixed once the problem - # is fixed on Thinc's side. - lengths = self.ops.asarray( - [arr.shape[0] for arr in batch_keys], dtype=numpy.int_ - ) - batch_keys = self.ops.xp.concatenate(batch_keys) - batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f") - return (batch_keys, batch_vals, lengths), None - - -@describe.on_data( - _set_dimensions_if_needed, lambda model, X, y: model.init_weights(model) -) -@describe.attributes( - nI=Dimension("Input size"), - nF=Dimension("Number of features"), - nO=Dimension("Output size"), - nP=Dimension("Maxout pieces"), - W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)), - b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)), - pad=Synapses( - "Pad", - lambda obj: (1, obj.nF, obj.nO, obj.nP), - lambda M, ops: ops.normal_init(M, 1.0), - ), - d_W=Gradient("W"), - d_pad=Gradient("pad"), - d_b=Gradient("b"), -) -class PrecomputableAffine(Model): - def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs): - Model.__init__(self, **kwargs) - self.nO = nO - self.nP = nP - self.nI = nI - self.nF = nF - - def begin_update(self, X, drop=0.0): - Yf = self.ops.gemm( - X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True - ) - Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP)) - Yf = self._add_padding(Yf) - - def backward(dY_ids, sgd=None): - dY, ids = dY_ids - dY, ids = self._backprop_padding(dY, ids) - Xf = X[ids] - Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI)) - - self.d_b += dY.sum(axis=0) - dY = dY.reshape((dY.shape[0], self.nO * self.nP)) - - Wopfi = self.W.transpose((1, 2, 0, 3)) - Wopfi = self.ops.xp.ascontiguousarray(Wopfi) - Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI)) - dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi) - - # Reuse the buffer - dWopfi = Wopfi - dWopfi.fill(0.0) - self.ops.gemm(dY, Xf, out=dWopfi, trans1=True) - dWopfi = 
dWopfi.reshape((self.nO, self.nP, self.nF, self.nI)) - # (o, p, f, i) --> (f, o, p, i) - self.d_W += dWopfi.transpose((2, 0, 1, 3)) - - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return dXf.reshape((dXf.shape[0], self.nF, self.nI)) - - return Yf, backward - - def _add_padding(self, Yf): - Yf_padded = self.ops.xp.vstack((self.pad, Yf)) - return Yf_padded - - def _backprop_padding(self, dY, ids): - # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0 - mask = ids < 0.0 - mask = mask.sum(axis=1) - d_pad = dY * mask.reshape((ids.shape[0], 1, 1)) - self.d_pad += d_pad.sum(axis=0) - return dY, ids - - @staticmethod - def init_weights(model): - """This is like the 'layer sequential unit variance', but instead - of taking the actual inputs, we randomly generate whitened data. - - Why's this all so complicated? We have a huge number of inputs, - and the maxout unit makes guessing the dynamics tricky. Instead - we set the maxout weights to values that empirically result in - whitened outputs given whitened inputs. - """ - if (model.W ** 2).sum() != 0.0: - return - ops = model.ops - xp = ops.xp - ops.normal_init(model.W, model.nF * model.nI, inplace=True) - - ids = ops.allocate((5000, model.nF), dtype="f") - ids += xp.random.uniform(0, 1000, ids.shape) - ids = ops.asarray(ids, dtype="i") - tokvecs = ops.allocate((5000, model.nI), dtype="f") - tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape( - tokvecs.shape - ) - - def predict(ids, tokvecs): - # nS ids. nW tokvecs. Exclude the padding array. 
- hiddens = model(tokvecs[:-1]) # (nW, f, o, p) - vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f") - # need nS vectors - hiddens = hiddens.reshape( - (hiddens.shape[0] * model.nF, model.nO * model.nP) - ) - model.ops.scatter_add(vectors, ids.flatten(), hiddens) - vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP)) - vectors += model.b - vectors = model.ops.asarray(vectors) - if model.nP >= 2: - return model.ops.maxout(vectors)[0] - else: - return vectors * (vectors >= 0) - - tol_var = 0.01 - tol_mean = 0.01 - t_max = 10 - t_i = 0 - for t_i in range(t_max): - acts1 = predict(ids, tokvecs) - var = model.ops.xp.var(acts1) - mean = model.ops.xp.mean(acts1) - if abs(var - 1.0) >= tol_var: - model.W /= model.ops.xp.sqrt(var) - elif abs(mean) >= tol_mean: - model.b -= mean - else: - break - - -def link_vectors_to_models(vocab): - vectors = vocab.vectors - if vectors.name is None: - vectors.name = VECTORS_KEY - if vectors.data.size != 0: - user_warning(Warnings.W020.format(shape=vectors.data.shape)) - ops = Model.ops - for word in vocab: - if word.orth in vectors.key2row: - word.rank = vectors.key2row[word.orth] - else: - word.rank = 0 - data = ops.asarray(vectors.data) - # Set an entry here, so that vectors are accessed by StaticVectors - # (unideal, I know) - key = (ops.device, vectors.name) - if key in thinc.extra.load_nlp.VECTORS: - if thinc.extra.load_nlp.VECTORS[key].shape != data.shape: - # This is a hack to avoid the problem in #3853. Maybe we should - # print a warning as well? 
- old_name = vectors.name - new_name = f"{vectors.name}_{data.shape[0]}" - user_warning(Warnings.W019.format(old=old_name, new=new_name)) - vectors.name = new_name - key = (ops.device, vectors.name) - thinc.extra.load_nlp.VECTORS[key] = data - - -def PyTorchBiLSTM(nO, nI, depth, dropout=0.2): - import torch.nn - from thinc.api import with_square_sequences - from thinc.extra.wrappers import PyTorchWrapperRNN - - if depth == 0: - return layerize(noop()) - model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout) - return with_square_sequences(PyTorchWrapperRNN(model)) - - -def Tok2Vec(width, embed_size, **kwargs): - if not USE_MODEL_REGISTRY_TOK2VEC: - # Preserve prior tok2vec for backwards compat, in v2.2.2 - return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs) - pretrained_vectors = kwargs.get("pretrained_vectors", None) - cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3) - subword_features = kwargs.get("subword_features", True) - char_embed = kwargs.get("char_embed", False) - conv_depth = kwargs.get("conv_depth", 4) - bilstm_depth = kwargs.get("bilstm_depth", 0) - conv_window = kwargs.get("conv_window", 1) - - cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] - - doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}} - if char_embed: - embed_cfg = { - "arch": "spacy.CharacterEmbed.v1", - "config": { - "width": 64, - "chars": 6, - "@mix": { - "arch": "spacy.LayerNormalizedMaxout.v1", - "config": {"width": width, "pieces": 3}, - }, - "@embed_features": None, - }, - } - else: - embed_cfg = { - "arch": "spacy.MultiHashEmbed.v1", - "config": { - "width": width, - "rows": embed_size, - "columns": cols, - "use_subwords": subword_features, - "@pretrained_vectors": None, - "@mix": { - "arch": "spacy.LayerNormalizedMaxout.v1", - "config": {"width": width, "pieces": 3}, - }, - }, - } - if pretrained_vectors: - embed_cfg["config"]["@pretrained_vectors"] = { - "arch": "spacy.PretrainedVectors.v1", - "config": { - 
"vectors_name": pretrained_vectors, - "width": width, - "column": cols.index("ID"), - }, - } - if cnn_maxout_pieces >= 2: - cnn_cfg = { - "arch": "spacy.MaxoutWindowEncoder.v1", - "config": { - "width": width, - "window_size": conv_window, - "pieces": cnn_maxout_pieces, - "depth": conv_depth, - }, - } - else: - cnn_cfg = { - "arch": "spacy.MishWindowEncoder.v1", - "config": {"width": width, "window_size": conv_window, "depth": conv_depth}, - } - bilstm_cfg = { - "arch": "spacy.TorchBiLSTMEncoder.v1", - "config": {"width": width, "depth": bilstm_depth}, - } - if conv_depth == 0 and bilstm_depth == 0: - encode_cfg = {} - elif conv_depth >= 1 and bilstm_depth >= 1: - encode_cfg = { - "arch": "thinc.FeedForward.v1", - "config": {"children": [cnn_cfg, bilstm_cfg]}, - } - elif conv_depth >= 1: - encode_cfg = cnn_cfg - else: - encode_cfg = bilstm_cfg - config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg} - return new_ml.Tok2Vec(config) - - -def reapply(layer, n_times): - def reapply_fwd(X, drop=0.0): - backprops = [] - for i in range(n_times): - Y, backprop = layer.begin_update(X, drop=drop) - X = Y - backprops.append(backprop) - - def reapply_bwd(dY, sgd=None): - dX = None - for backprop in reversed(backprops): - dY = backprop(dY, sgd=sgd) - if dX is None: - dX = dY - else: - dX += dY - return dX - - return Y, reapply_bwd - - return wrap(reapply_fwd, layer) - - -def asarray(ops, dtype): - def forward(X, drop=0.0): - return ops.asarray(X, dtype=dtype), None - - return layerize(forward) - - -def _divide_array(X, size): - parts = [] - index = 0 - while index < len(X): - parts.append(X[index : index + size]) - index += size - return parts - - -def get_col(idx): - if idx < 0: - raise IndexError(Errors.E066.format(value=idx)) - - def forward(X, drop=0.0): - if isinstance(X, numpy.ndarray): - ops = NumpyOps() - else: - ops = CupyOps() - output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) - - def backward(y, sgd=None): - dX = 
ops.allocate(X.shape) - dX[:, idx] += y - return dX - - return output, backward - - return layerize(forward) - - -def doc2feats(cols=None): - if cols is None: - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - - def forward(docs, drop=0.0): - feats = [] - for doc in docs: - feats.append(doc.to_array(cols)) - return feats, None - - model = layerize(forward) - model.cols = cols - return model - - -def print_shape(prefix): - def forward(X, drop=0.0): - return X, lambda dX, **kwargs: dX - - return layerize(forward) - - -@layerize -def get_token_vectors(tokens_attrs_vectors, drop=0.0): - tokens, attrs, vectors = tokens_attrs_vectors - - def backward(d_output, sgd=None): - return (tokens, d_output) - - return vectors, backward - - -@layerize -def logistic(X, drop=0.0): - xp = get_array_module(X) - if not isinstance(X, xp.ndarray): - X = xp.asarray(X) - # Clip to range (-10, 10) - X = xp.minimum(X, 10.0, X) - X = xp.maximum(X, -10.0, X) - Y = 1.0 / (1.0 + xp.exp(-X)) - - def logistic_bwd(dY, sgd=None): - dX = dY * (Y * (1 - Y)) - return dX - - return Y, logistic_bwd - - -def zero_init(model): - def _zero_init_impl(self, X, y): - self.W.fill(0) - - model.on_data_hooks.append(_zero_init_impl) - return model - - -def getitem(i): - def getitem_fwd(X, drop=0.0): - return X[i], None - - return layerize(getitem_fwd) - - -@describe.attributes( - W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None) -) -class MultiSoftmax(Affine): - """Neural network layer that predicts several multi-class attributes at once. - For instance, we might predict one class with 6 variables, and another with 5. - We predict the 11 neurons required for this, and then softmax them such - that columns 0-6 make a probability distribution and coumns 6-11 make another. 
- """ - - name = "multisoftmax" - - def __init__(self, out_sizes, nI=None, **kwargs): - Model.__init__(self, **kwargs) - self.out_sizes = out_sizes - self.nO = sum(out_sizes) - self.nI = nI - - def predict(self, input__BI): - output__BO = self.ops.affine(self.W, self.b, input__BI) - i = 0 - for out_size in self.out_sizes: - self.ops.softmax(output__BO[:, i : i + out_size], inplace=True) - i += out_size - return output__BO - - def begin_update(self, input__BI, drop=0.0): - output__BO = self.predict(input__BI) - - def finish_update(grad__BO, sgd=None): - self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True) - self.d_b += grad__BO.sum(axis=0) - grad__BI = self.ops.gemm(grad__BO, self.W) - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return grad__BI - - return output__BO, finish_update - - -def build_tagger_model(nr_class, **cfg): - embed_size = util.env_opt("embed_size", 2000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] - else: - token_vector_width = util.env_opt("token_vector_width", 96) - pretrained_vectors = cfg.get("pretrained_vectors") - subword_features = cfg.get("subword_features", True) - with Model.define_operators({">>": chain, "+": add}): - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - tok2vec = Tok2Vec( - token_vector_width, - embed_size, - subword_features=subword_features, - pretrained_vectors=pretrained_vectors, - ) - softmax = with_flatten(Softmax(nr_class, token_vector_width)) - model = tok2vec >> softmax - model.nI = None - model.tok2vec = tok2vec - model.softmax = softmax - return model - - -def build_morphologizer_model(class_nums, **cfg): - embed_size = util.env_opt("embed_size", 7000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] - else: - token_vector_width = util.env_opt("token_vector_width", 128) - pretrained_vectors = cfg.get("pretrained_vectors") - char_embed = cfg.get("char_embed", True) - with 
Model.define_operators({">>": chain, "+": add, "**": clone}): - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - tok2vec = Tok2Vec( - token_vector_width, - embed_size, - char_embed=char_embed, - pretrained_vectors=pretrained_vectors, - ) - softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) - softmax.out_sizes = class_nums - model = tok2vec >> softmax - model.nI = None - model.tok2vec = tok2vec - model.softmax = softmax - return model - - -@layerize -def SpacyVectors(docs, drop=0.0): - batch = [] - for doc in docs: - indices = numpy.zeros((len(doc),), dtype="i") - for i, word in enumerate(doc): - if word.orth in doc.vocab.vectors.key2row: - indices[i] = doc.vocab.vectors.key2row[word.orth] - else: - indices[i] = 0 - vectors = doc.vocab.vectors.data[indices] - batch.append(vectors) - return batch, None - - -def build_text_classifier(nr_class, width=64, **cfg): - depth = cfg.get("depth", 2) - nr_vector = cfg.get("nr_vector", 5000) - pretrained_dims = cfg.get("pretrained_dims", 0) - with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}): - if cfg.get("low_data") and pretrained_dims: - model = ( - SpacyVectors - >> flatten_add_lengths - >> with_getitem(0, Affine(width, pretrained_dims)) - >> ParametricAttention(width) - >> Pooling(sum_pool) - >> Residual(ReLu(width, width)) ** 2 - >> zero_init(Affine(nr_class, width, drop_factor=0.0)) - >> logistic - ) - return model - - lower = HashEmbed(width, nr_vector, column=1) - prefix = HashEmbed(width // 2, nr_vector, column=2) - suffix = HashEmbed(width // 2, nr_vector, column=3) - shape = HashEmbed(width // 2, nr_vector, column=4) - - trained_vectors = FeatureExtracter( - [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] - ) >> with_flatten( - uniqued( - (lower | prefix | suffix | shape) - >> LN(Maxout(width, width + (width // 2) * 3)), - column=0, - ) - ) - - if pretrained_dims: - static_vectors = SpacyVectors >> with_flatten( - Affine(width, pretrained_dims) - ) - # TODO Make 
concatenate support lists - vectors = concatenate_lists(trained_vectors, static_vectors) - vectors_width = width * 2 - else: - vectors = trained_vectors - vectors_width = width - static_vectors = None - tok2vec = vectors >> with_flatten( - LN(Maxout(width, vectors_width)) - >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth, - pad=depth, - ) - cnn_model = ( - tok2vec - >> flatten_add_lengths - >> ParametricAttention(width) - >> Pooling(sum_pool) - >> Residual(zero_init(Maxout(width, width))) - >> zero_init(Affine(nr_class, width, drop_factor=0.0)) - ) - - linear_model = build_bow_text_classifier( - nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False - ) - if cfg.get("exclusive_classes"): - output_layer = Softmax(nr_class, nr_class * 2) - else: - output_layer = ( - zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic - ) - model = (linear_model | cnn_model) >> output_layer - model.tok2vec = chain(tok2vec, flatten) - model.nO = nr_class - model.lsuv = False - return model - - -def build_bow_text_classifier( - nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg -): - with Model.define_operators({">>": chain}): - model = with_cpu( - Model.ops, extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class) - ) - if not no_output_layer: - model = model >> (cpu_softmax if exclusive_classes else logistic) - model.nO = nr_class - return model - - -@layerize -def cpu_softmax(X, drop=0.0): - ops = NumpyOps() - - def cpu_softmax_backward(dY, sgd=None): - return dY - - return ops.softmax(X), cpu_softmax_backward - - -def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg): - """ - Build a simple CNN text classifier, given a token-to-vector model as inputs. - If exclusive_classes=True, a softmax non-linearity is applied, so that the - outputs sum to 1. 
If exclusive_classes=False, a logistic non-linearity - is applied instead, so that outputs are in the range [0, 1]. - """ - with Model.define_operators({">>": chain}): - if exclusive_classes: - output_layer = Softmax(nr_class, tok2vec.nO) - else: - output_layer = ( - zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic - ) - model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer - model.tok2vec = chain(tok2vec, flatten) - model.nO = nr_class - return model - - -def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): - if "entity_width" not in cfg: - raise ValueError(Errors.E144.format(param="entity_width")) - - conv_depth = cfg.get("conv_depth", 2) - cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) - pretrained_vectors = cfg.get("pretrained_vectors", None) - context_width = cfg.get("entity_width") - - with Model.define_operators({">>": chain, "**": clone}): - # context encoder - tok2vec = Tok2Vec( - width=hidden_width, - embed_size=embed_width, - pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, - subword_features=True, - conv_depth=conv_depth, - bilstm_depth=0, - ) - - model = ( - tok2vec - >> flatten_add_lengths - >> Pooling(mean_pool) - >> Residual(zero_init(Maxout(hidden_width, hidden_width))) - >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0)) - ) - - model.tok2vec = tok2vec - model.nO = context_width - return model - - -@layerize -def flatten(seqs, drop=0.0): - ops = Model.ops - lengths = ops.asarray([len(seq) for seq in seqs], dtype="i") - - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths, pad=0) - - X = ops.flatten(seqs, pad=0) - return X, finish_update - - -def concatenate_lists(*layers, **kwargs): # pragma: no cover - """Compose two or more models `f`, `g`, etc, such that their outputs are - concatenated, i.e. 
`concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - """ - if not layers: - return noop() - drop_factor = kwargs.get("drop_factor", 1.0) - ops = layers[0].ops - layers = [chain(layer, flatten) for layer in layers] - concat = concatenate(*layers) - - def concatenate_lists_fwd(Xs, drop=0.0): - if drop is not None: - drop *= drop_factor - lengths = ops.asarray([len(X) for X in Xs], dtype="i") - flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) - ys = ops.unflatten(flat_y, lengths) - - def concatenate_lists_bwd(d_ys, sgd=None): - return bp_flat_y(ops.flatten(d_ys), sgd=sgd) - - return ys, concatenate_lists_bwd - - model = wrap(concatenate_lists_fwd, concat) - return model - - -def masked_language_model(vocab, model, mask_prob=0.15): - """Convert a model into a BERT-style masked language model""" - - random_words = _RandomWords(vocab) - - def mlm_forward(docs, drop=0.0): - mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) - mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) - output, backprop = model.begin_update(docs, drop=drop) - - def mlm_backward(d_output, sgd=None): - d_output *= 1 - mask - return backprop(d_output, sgd=sgd) - - return output, mlm_backward - - return wrap(mlm_forward, model) - - -class _RandomWords(object): - def __init__(self, vocab): - self.words = [lex.text for lex in vocab if lex.prob != 0.0] - self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] - self.words = self.words[:10000] - self.probs = self.probs[:10000] - self.probs = numpy.exp(numpy.array(self.probs, dtype="f")) - self.probs /= self.probs.sum() - self._cache = [] - - def next(self): - if not self._cache: - self._cache.extend( - numpy.random.choice(len(self.words), 10000, p=self.probs) - ) - index = self._cache.pop() - return self.words[index] - - -def _apply_mask(docs, random_words, mask_prob=0.15): - # This needs to be here to avoid circular imports - from .tokens.doc import Doc - - N = sum(len(doc) for doc in docs) - mask = 
numpy.random.uniform(0.0, 1.0, (N,)) - mask = mask >= mask_prob - i = 0 - masked_docs = [] - for doc in docs: - words = [] - for token in doc: - if not mask[i]: - word = _replace_word(token.text, random_words) - else: - word = token.text - words.append(word) - i += 1 - spaces = [bool(w.whitespace_) for w in doc] - # NB: If you change this implementation to instead modify - # the docs in place, take care that the IDs reflect the original - # words. Currently we use the original docs to make the vectors - # for the target, so we don't lose the original tokens. But if - # you modified the docs in place here, you would. - masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces)) - return mask, masked_docs - - -def _replace_word(word, random_words, mask="[MASK]"): - roll = numpy.random.random() - if roll < 0.8: - return mask - elif roll < 0.9: - return random_words.next() - else: - return word - - -def _uniform_init(lo, hi): - def wrapped(W, ops): - copy_array(W, ops.xp.random.uniform(lo, hi, W.shape)) - - return wrapped - - -@describe.attributes( - nM=Dimension("Vector dimensions"), - nC=Dimension("Number of characters per word"), - vectors=Synapses( - "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1) - ), - d_vectors=Gradient("vectors"), -) -class CharacterEmbed(Model): - def __init__(self, nM=None, nC=None, **kwargs): - Model.__init__(self, **kwargs) - self.nM = nM - self.nC = nC - - @property - def nO(self): - return self.nM * self.nC - - @property - def nV(self): - return 256 - - def begin_update(self, docs, drop=0.0): - if not docs: - return [] - ids = [] - output = [] - weights = self.vectors - # This assists in indexing; it's like looping over this dimension. - # Still consider this weird witch craft...But thanks to Mark Neumann - # for the tip. 
- nCv = self.ops.xp.arange(self.nC) - for doc in docs: - doc_ids = doc.to_utf8_array(nr_char=self.nC) - doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM)) - # Let's say I have a 2d array of indices, and a 3d table of data. What numpy - # incantation do I chant to get - # output[i, j, k] == data[j, ids[i, j], k]? - doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]] - output.append(doc_vectors.reshape((len(doc), self.nO))) - ids.append(doc_ids) - - def backprop_character_embed(d_vectors, sgd=None): - gradient = self.d_vectors - for doc_ids, d_doc_vectors in zip(ids, d_vectors): - d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM)) - gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv] - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return None - - return output, backprop_character_embed - - -def get_cossim_loss(yh, y, ignore_zeros=False): - xp = get_array_module(yh) - # Find the zero vectors - if ignore_zeros: - zero_indices = xp.abs(y).sum(axis=1) == 0 - # Add a small constant to avoid 0 vectors - yh = yh + 1e-8 - y = y + 1e-8 - # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity - norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) - norm_y = xp.linalg.norm(y, axis=1, keepdims=True) - mul_norms = norm_yh * norm_y - cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms - d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2)) - losses = xp.abs(cosine - 1) - if ignore_zeros: - # If the target was a zero vector, don't count it in the loss. 
- d_yh[zero_indices] = 0 - losses[zero_indices] = 0 - loss = losses.sum() - return loss, -d_yh diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 778453711..0f7677fd2 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,6 +4,7 @@ from .link import link # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 from .train import train # noqa: F401 +from .train_from_config import train_from_config_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 9e2fc5b1c..109b135b5 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -4,19 +4,21 @@ import time import re from collections import Counter from pathlib import Path -from thinc.v2v import Affine, Maxout -from thinc.misc import LayerNorm as LN -from thinc.neural.util import prefer_gpu +from thinc.layers import Linear, Maxout +from thinc.util import prefer_gpu from wasabi import msg import srsly +from thinc.layers import chain, list2array +from thinc.loss import CosineDistance, L2Distance from spacy.gold import Example from ..errors import Errors from ..tokens import Doc from ..attrs import ID, HEAD -from .._ml import Tok2Vec, flatten, chain, create_default_optimizer -from .._ml import masked_language_model, get_cossim_loss +from ..ml.component_models import Tok2Vec +from ..ml.component_models import masked_language_model from .. 
import util +from ..util import create_default_optimizer from .train import _load_pretrained_tok2vec @@ -99,7 +101,7 @@ def pretrain( with msg.loading(f"Loading model '{vectors_model}'..."): nlp = util.load_model(vectors_model) msg.good(f"Loaded model '{vectors_model}'") - pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name + pretrained_vectors = None if not use_vectors else nlp.vocab.vectors model = create_pretraining_model( nlp, Tok2Vec( @@ -136,7 +138,7 @@ def pretrain( # Without '--init-tok2vec' the '--epoch-start' argument is ignored epoch_start = 0 - optimizer = create_default_optimizer(model.ops) + optimizer = create_default_optimizer() tracker = ProgressTracker(frequency=10000) msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} @@ -251,13 +253,14 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): # and look them up all at once. This prevents data copying. ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] + # TODO: this code originally didn't normalize, but shouldn't normalize=True ? if objective == "L2": - d_target = prediction - target - loss = (d_target ** 2).sum() + distance = L2Distance(normalize=False) elif objective == "cosine": - loss, d_target = get_cossim_loss(prediction, target) + distance = CosineDistance(normalize=False) else: raise ValueError(Errors.E142.format(loss_func=objective)) + d_target, loss = distance(prediction, target) return loss, d_target @@ -269,18 +272,18 @@ def create_pretraining_model(nlp, tok2vec): """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( - LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0) + Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size) ) # This is annoying, but the parser etc have the flatten step after # the tok2vec. 
To load the weights in cleanly, we need to match # the shape of the models' components exactly. So what we cann # "tok2vec" has to be the same set of processes as what the components do. - tok2vec = chain(tok2vec, flatten) + tok2vec = chain(tok2vec, list2array()) model = chain(tok2vec, output_layer) model = masked_language_model(nlp.vocab, model) - model.tok2vec = tok2vec - model.output_layer = output_layer - model.begin_training([nlp.make_doc("Give it a doc to infer shapes")]) + model.set_ref("tok2vec", tok2vec) + model.set_ref("output_layer", output_layer) + model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) return model diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 44e59971a..5b7a02212 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -5,7 +5,7 @@ import cProfile import pstats import sys import itertools -import thinc.extra.datasets +import ml_datasets from wasabi import msg from ..util import load_model @@ -29,7 +29,7 @@ def profile( if inputs is None: n_inputs = 25000 with msg.loading("Loading IMDB dataset via Thinc..."): - imdb_train, _ = thinc.extra.datasets.imdb() + imdb_train, _ = ml_datasets.imdb() inputs, _ = zip(*imdb_train) msg.info(f"Loaded IMDB dataset and using {n_inputs} examples") inputs = inputs[:n_inputs] diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6ebf5d37d..a83ca158d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,7 +1,7 @@ import os import tqdm from pathlib import Path -from thinc.neural._classes.model import Model +from thinc.backends import use_ops from timeit import default_timer as timer import shutil import srsly @@ -9,7 +9,7 @@ from wasabi import msg import contextlib import random -from .._ml import create_default_optimizer +from ..util import create_default_optimizer from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import GoldCorpus from .. 
import util @@ -200,7 +200,7 @@ def train( if base_model: # Start with an existing model, use default optimizer - optimizer = create_default_optimizer(Model.ops) + optimizer = create_default_optimizer() else: # Start with a blank model, call begin_training optimizer = nlp.begin_training(lambda: corpus.train_examples, device=use_gpu) @@ -367,7 +367,7 @@ def train( cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) - with Model.use_device("cpu"): + with use_ops("numpy"): nlp_loaded = util.load_model_from_path(epoch_model_path) for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py new file mode 100644 index 000000000..0488dd04c --- /dev/null +++ b/spacy/cli/train_from_config.py @@ -0,0 +1,445 @@ +import plac +from thinc.util import require_gpu +from wasabi import msg +from pathlib import Path +import thinc +import thinc.schedules +from thinc.model import Model +from spacy.gold import GoldCorpus +import spacy +from spacy.pipeline.tok2vec import Tok2VecListener +from typing import Optional, Dict, List, Union, Sequence +from pydantic import BaseModel, FilePath, StrictInt +import tqdm + +from ..ml import component_models +from .. 
import util + +registry = util.registry + +CONFIG_STR = """ +[training] +patience = 10 +eval_frequency = 10 +dropout = 0.2 +init_tok2vec = null +vectors = null +max_epochs = 100 +orth_variant_level = 0.0 +gold_preproc = false +max_length = 0 +use_gpu = 0 +scores = ["ents_p", "ents_r", "ents_f"] +score_weights = {"ents_f": 1.0} +limit = 0 + +[training.batch_size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "transition_based_ner.v1" +nr_feature_tokens = 3 +hidden_width = 64 +maxout_pieces = 3 + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "tok2vec_tensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "hash_embed_cnn.v1" +pretrained_vectors = ${nlp:vectors} +width = 128 +depth = 4 +window_size = 1 +embed_size = 10000 +maxout_pieces = 3 +""" + + +class PipelineComponent(BaseModel): + factory: str + model: Model + + class Config: + arbitrary_types_allowed = True + + +class ConfigSchema(BaseModel): + optimizer: Optional["Optimizer"] + + class training(BaseModel): + patience: int = 10 + eval_frequency: int = 100 + dropout: float = 0.2 + init_tok2vec: Optional[FilePath] = None + vectors: Optional[str] = None + max_epochs: int = 100 + orth_variant_level: float = 0.0 + gold_preproc: bool = False + max_length: int = 0 + use_gpu: int = 0 + scores: List[str] = ["ents_p", "ents_r", "ents_f"] + score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0} + limit: int = 0 + batch_size: Union[Sequence[int], int] + + class nlp(BaseModel): + lang: str + vectors: Optional[str] + pipeline: Optional[Dict[str, PipelineComponent]] + + class Config: + extra = "allow" + + +# Of course, these would normally decorate the functions 
where they're defined. +# But for now... +@registry.architectures.register("hash_embed_cnn.v1") +def hash_embed_cnn( + pretrained_vectors, width, depth, embed_size, maxout_pieces, window_size +): + return component_models.Tok2Vec( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + conv_depth=depth, + cnn_maxout_pieces=maxout_pieces, + bilstm_depth=0, + window_size=window_size, + ) + + +@registry.architectures.register("hash_embed_bilstm.v1") +def hash_embed_bilstm_v1(pretrained_vectors, width, depth, embed_size): + return component_models.Tok2Vec( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + bilstm_depth=depth, + conv_depth=0, + cnn_maxout_pieces=0, + ) + + +@registry.architectures.register("tagger_model.v1") +def build_tagger_model_v1(tok2vec): + return component_models.build_tagger_model(nr_class=None, tok2vec=tok2vec) + + +@registry.architectures.register("transition_based_parser.v1") +def create_tb_parser_model( + tok2vec: Model, + nr_feature_tokens: StrictInt = 3, + hidden_width: StrictInt = 64, + maxout_pieces: StrictInt = 3, +): + from thinc.layers import Linear, chain, list2array + from spacy.ml._layers import PrecomputableAffine + from spacy.syntax._parser_model import ParserModel + from thinc.api import use_ops, zero_init + + token_vector_width = tok2vec.get_dim("nO") + tok2vec = chain(tok2vec, list2array()) + tok2vec.set_dim("nO", token_vector_width) + + lower = PrecomputableAffine( + hidden_width, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), nP=maxout_pieces + ) + lower.set_dim("nP", maxout_pieces) + with use_ops("numpy"): + # Initialize weights at zero, as it's a classification layer. 
+ upper = Linear(init_W=zero_init) + return ParserModel(tok2vec, lower, upper) + + +@plac.annotations( + # fmt: off + train_path=("Location of JSON-formatted training data", "positional", None, Path), + dev_path=("Location of JSON-formatted development data", "positional", None, Path), + config_path=("Path to config file", "positional", None, Path), + output_path=("Output directory to store model in", "option", "o", Path), + meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), + raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), + # fmt: on +) +def train_from_config_cli( + train_path, + dev_path, + config_path, + output_path=None, + meta_path=None, + raw_text=None, + debug=False, + verbose=False, +): + """ + Train or update a spaCy model. Requires data to be formatted in spaCy's + JSON format. To convert data from other formats, use the `spacy convert` + command. + """ + if not config_path or not config_path.exists(): + msg.fail("Config file not found", config_path, exits=1) + if not train_path or not train_path.exists(): + msg.fail("Training data not found", train_path, exits=1) + if not dev_path or not dev_path.exists(): + msg.fail("Development data not found", dev_path, exits=1) + if meta_path is not None and not meta_path.exists(): + msg.fail("Can't find model meta.json", meta_path, exits=1) + if output_path is not None and not output_path.exists(): + output_path.mkdir() + + try: + train_from_config( + config_path, + {"train": train_path, "dev": dev_path}, + output_path=output_path, + meta_path=meta_path, + raw_text=raw_text, + ) + except KeyboardInterrupt: + msg.warn("Cancelled.") + + +def train_from_config( + config_path, + data_paths, + raw_text=None, + meta_path=None, + output_path=None, +): + msg.info("Loading config from: {}".format(config_path)) + config = util.load_from_config(config_path, create_objects=True) + use_gpu = config["training"]["use_gpu"] + if use_gpu >= 0: + msg.info("Using 
GPU") + else: + msg.info("Using CPU") + msg.info("Creating nlp from config") + nlp = create_nlp_from_config(**config["nlp"]) + optimizer = config["optimizer"] + limit = config["training"]["limit"] + msg.info("Loading training corpus") + corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) + msg.info("Initializing the nlp pipeline") + nlp.begin_training( + lambda: corpus.train_examples, device=use_gpu + ) + + train_batches = create_train_batches(nlp, corpus, config["training"]) + evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"]) + + # Create iterator, which yields out info after each optimization step. + msg.info("Start training") + training_step_iterator = train_while_improving( + nlp, + optimizer, + train_batches, + evaluate, + config["training"]["dropout"], + config["training"]["patience"], + config["training"]["eval_frequency"], + ) + + msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate)) + print_row = setup_printer(config) + + try: + progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False) + for batch, info, is_best_checkpoint in training_step_iterator: + progress.update(1) + if is_best_checkpoint is not None: + progress.close() + print_row(info) + if is_best_checkpoint and output_path is not None: + nlp.to_disk(output_path) + progress = tqdm.tqdm( + total=config["training"]["eval_frequency"], leave=False + ) + finally: + if output_path is not None: + with nlp.use_params(optimizer.averages): + final_model_path = output_path / "model-final" + nlp.to_disk(final_model_path) + msg.good("Saved model to output directory", final_model_path) + # with msg.loading("Creating best model..."): + # best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) + # msg.good("Created best model", best_model_path) + + +def create_nlp_from_config(lang, vectors, pipeline): + lang_class = spacy.util.get_lang_class(lang) + nlp = lang_class() + if vectors is not None: + 
spacy.cli.train._load_vectors(nlp, vectors) + for name, component_cfg in pipeline.items(): + factory = component_cfg.pop("factory") + component = nlp.create_pipe(factory, config=component_cfg) + nlp.add_pipe(component, name=name) + return nlp + + +def create_train_batches(nlp, corpus, cfg): + while True: + train_examples = corpus.train_dataset( + nlp, + noise_level=0.0, + orth_variant_level=cfg["orth_variant_level"], + gold_preproc=cfg["gold_preproc"], + max_length=cfg["max_length"], + ignore_misaligned=True, + ) + for batch in util.minibatch_by_words(train_examples, size=cfg["batch_size"]): + yield batch + + +def create_evaluation_callback(nlp, optimizer, corpus, cfg): + def evaluate(): + with nlp.use_params(optimizer.averages): + dev_examples = list( + corpus.dev_dataset( + nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True + ) + ) + scorer = nlp.evaluate(dev_examples) + scores = scorer.scores + # Calculate a weighted sum based on score_weights for the main score + weights = cfg["score_weights"] + weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) + return weighted_score, scorer.scores + + return evaluate + + +def train_while_improving( + nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency +): + """Train until an evaluation stops improving. Works as a generator, + with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, + where info is a dict, and is_best_checkpoint is in [True, False, None] -- + None indicating that the iteration was not evaluated as a checkpoint. + The evaluation is conducted by calling the evaluate callback, which should + + Positional arguments: + nlp: The spaCy pipeline to evaluate. + train_data (Iterable[Batch]): A generator of batches, with the training + data. Each batch should be a Sized[Tuple[Input, Annot]]. The training + data iterable needs to take care of iterating over the epochs and + shuffling. 
+ evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation. + The callback should take no arguments and return a tuple + `(main_score, other_scores)`. The main_score should be a float where + higher is better. other_scores can be any object. + + Every iteration, the function yields out a tuple with: + + * batch: A zipped sequence of Tuple[Doc, GoldParse] pairs. + * info: A dict with various information about the last update (see below). + * is_best_checkpoint: A value in None, False, True, indicating whether this + was the best evaluation so far. You should use this to save the model + checkpoints during training. If None, evaluation was not conducted on + that iteration. False means evaluation was conducted, but a previous + evaluation was better. + + The info dict provides the following information: + + epoch (int): How many passes over the data have been completed. + step (int): How many steps have been completed. + score (float): The main score from the last evaluation. + other_scores: The other scores from the last evaluation. + losses: The accumulated losses throughout training. + checkpoints: A list of previous results, where each result is a + (score, step) tuple.
+ """ + if isinstance(dropout, float): + dropouts = thinc.schedules.constant(dropout) + else: + dropouts = dropout + results = [] + losses = {} + for step, batch in enumerate(train_data): + dropout = next(dropouts) + for subbatch in subdivide_batch(batch): + nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) + for name, proc in nlp.pipeline: + if hasattr(proc, "model"): + proc.model.finish_update(optimizer) + optimizer.step_schedules() + if not (step % eval_frequency): + score, other_scores = evaluate() + results.append((score, step)) + is_best_checkpoint = score == max(results)[0] + else: + score, other_scores = (None, None) + is_best_checkpoint = None + info = { + "step": step, + "score": score, + "other_scores": other_scores, + "losses": losses, + "checkpoints": results, + } + yield batch, info, is_best_checkpoint + if is_best_checkpoint is not None: + losses = {} + # Stop if no improvement in `patience` updates + best_score, best_step = max(results) + if (step - best_step) >= patience: + break + + +def subdivide_batch(batch): + return [batch] + + +def setup_printer(config): + score_cols = config["training"]["scores"] + score_widths = [max(len(col), 6) for col in score_cols] + loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]] + loss_widths = [max(len(col), 8) for col in loss_cols] + table_header = ["#"] + loss_cols + score_cols + ["Score"] + table_header = [col.upper() for col in table_header] + table_widths = [6] + loss_widths + score_widths + [6] + table_aligns = ["r" for _ in table_widths] + + msg.row(table_header, widths=table_widths) + msg.row(["-" * width for width in table_widths]) + + def print_row(info): + losses = [ + "{0:.2f}".format(info["losses"].get(col, 0.0)) + for col in config["nlp"]["pipeline"] + ] + scores = [ + "{0:.2f}".format(info["other_scores"].get(col, 0.0)) + for col in config["training"]["scores"] + ] + data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])] + msg.row(data, 
widths=table_widths, aligns=table_aligns) + + return print_row + + +@registry.architectures.register("tok2vec_tensors.v1") +def tok2vec_tensors_v1(width): + tok2vec = Tok2VecListener("tok2vec", width=width) + return tok2vec diff --git a/spacy/compat.py b/spacy/compat.py index 8cb08ae09..6fa49353e 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -8,7 +8,7 @@ DOCS: https://spacy.io/api/top-level#compat import os import sys -from thinc.neural.util import copy_array +from thinc.util import copy_array try: import cPickle as pickle @@ -30,10 +30,7 @@ try: except ImportError: cupy = None -try: - from thinc.neural.optimizers import Optimizer # noqa: F401 -except ImportError: - from thinc.neural.optimizers import Adam as Optimizer # noqa: F401 +from thinc.optimizers import Optimizer # noqa: F401 pickle = pickle copy_reg = copy_reg diff --git a/spacy/language.py b/spacy/language.py index b91903595..cde9c0164 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -4,7 +4,8 @@ import weakref import functools from contextlib import contextmanager from copy import copy, deepcopy -from thinc.neural import Model +from thinc.model import Model +from thinc.backends import get_current_ops import srsly import multiprocessing as mp from itertools import chain, cycle @@ -16,7 +17,7 @@ from .lookups import Lookups from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs from .gold import Example from .scorer import Scorer -from ._ml import link_vectors_to_models, create_default_optimizer +from .util import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES @@ -468,30 +469,27 @@ class Language(object): if sgd is None: if self._optimizer is None: - self._optimizer = create_default_optimizer(Model.ops) + self._optimizer = create_default_optimizer() sgd = self._optimizer - grads = {} - - def get_grads(W, dW, key=None): - 
grads[key] = (W, dW) - - get_grads.alpha = sgd.alpha - get_grads.b1 = sgd.b1 - get_grads.b2 = sgd.b2 - pipes = list(self.pipeline) - random.shuffle(pipes) if component_cfg is None: component_cfg = {} - for name, proc in pipes: + # Determine whether component should set annotations. In theory I guess + # we should do this by inspecting the meta? Or we could just always + # say "yes" + for name, proc in self.pipeline: + component_cfg.setdefault(name, {}) + component_cfg[name].setdefault("drop", drop) + component_cfg[name].setdefault("set_annotations", False) + grads = {} + for name, proc in self.pipeline: if not hasattr(proc, "update"): continue - grads = {} - kwargs = component_cfg.get(name, {}) - kwargs.setdefault("drop", drop) - proc.update(examples, sgd=get_grads, losses=losses, **kwargs) - for key, (W, dW) in grads.items(): - sgd(W, dW, key=key) + proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) + if sgd is not False: + for name, proc in self.pipeline: + if hasattr(proc, "model"): + proc.model.finish_update(sgd) def rehearse(self, examples, sgd=None, losses=None, config=None): """Make a "rehearsal" update to the models in the pipeline, to prevent @@ -518,7 +516,7 @@ class Language(object): examples = Example.to_example_objects(examples, make_doc=self.make_doc) if sgd is None: if self._optimizer is None: - self._optimizer = create_default_optimizer(Model.ops) + self._optimizer = create_default_optimizer() sgd = self._optimizer pipes = list(self.pipeline) random.shuffle(pipes) @@ -529,7 +527,7 @@ class Language(object): def get_grads(W, dW, key=None): grads[key] = (W, dW) - get_grads.alpha = sgd.alpha + get_grads.learn_rate = sgd.learn_rate get_grads.b1 = sgd.b1 get_grads.b2 = sgd.b2 for name, proc in pipes: @@ -537,8 +535,8 @@ class Language(object): continue grads = {} proc.rehearse(examples, sgd=get_grads, losses=losses, **config.get(name, {})) - for key, (W, dW) in grads.items(): - sgd(W, dW, key=key) + for key, (W, dW) in grads.items(): + 
sgd(W, dW, key=key) return losses def preprocess_gold(self, examples): @@ -577,12 +575,13 @@ class Language(object): if cfg.get("device", -1) >= 0: util.use_gpu(cfg["device"]) if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data) + ops = get_current_ops() + self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) link_vectors_to_models(self.vocab) if self.vocab.vectors.data.shape[1]: - cfg["pretrained_vectors"] = self.vocab.vectors.name + cfg["pretrained_vectors"] = self.vocab.vectors if sgd is None: - sgd = create_default_optimizer(Model.ops) + sgd = create_default_optimizer() self._optimizer = sgd if component_cfg is None: component_cfg = {} @@ -596,6 +595,7 @@ class Language(object): sgd=self._optimizer, **kwargs ) + self._link_components() return self._optimizer def resume_training(self, sgd=None, **cfg): @@ -609,13 +609,14 @@ class Language(object): """ if cfg.get("device", -1) >= 0: util.use_gpu(cfg["device"]) + ops = get_current_ops() if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data) + self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) link_vectors_to_models(self.vocab) if self.vocab.vectors.data.shape[1]: - cfg["pretrained_vectors"] = self.vocab.vectors.name + cfg["pretrained_vectors"] = self.vocab.vectors if sgd is None: - sgd = create_default_optimizer(Model.ops) + sgd = create_default_optimizer() self._optimizer = sgd for name, proc in self.pipeline: if hasattr(proc, "_rehearsal_model"): @@ -736,7 +737,7 @@ class Language(object): disable=disable, n_process=n_process, component_cfg=component_cfg, - as_example=False + as_example=False # TODO: shouldn't this be as_example=as_example ? 
) for doc, context in zip(docs, contexts): yield (doc, context) @@ -838,6 +839,16 @@ class Language(object): for proc in procs: proc.terminate() + def _link_components(self): + """Register 'listeners' within pipeline components, to allow them to + effectively share weights. + """ + for i, (name1, proc1) in enumerate(self.pipeline): + if hasattr(proc1, "find_listeners"): + for name2, proc2 in self.pipeline[i:]: + if hasattr(proc2, "model"): + proc1.find_listeners(proc2.model) + def to_disk(self, path, exclude=tuple(), disable=None): """Save the current state to a directory. If a model is loaded, this will include the model. @@ -906,6 +917,7 @@ class Language(object): exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) self._path = path + self._link_components() return self def to_bytes(self, exclude=tuple(), disable=None, **kwargs): @@ -962,6 +974,7 @@ class Language(object): ) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) util.from_bytes(bytes_data, deserializers, exclude) + self._link_components() return self diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 497e20516..1292a46bd 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -6,7 +6,7 @@ cimport numpy as np np.import_array() import numpy -from thinc.neural.util import get_array_module +from thinc.util import get_array_module from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index 8eebf0564..e69de29bb 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1,2 +0,0 @@ -from .tok2vec import Tok2Vec # noqa: F401 -from .common import FeedForward, LayerNormalizedMaxout # noqa: F401 diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py new file mode 100644 index 000000000..2ff67746f --- /dev/null +++ b/spacy/ml/_character_embed.py @@ -0,0 +1,52 @@ +from thinc.api import Model + + +def 
CharacterEmbed(nM, nC): + # nM: Number of dimensions per character. nC: Number of characters. + nO = nM*nC if (nM is not None and nC is not None) else None + return Model( + "charembed", + forward, + init=init, + dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256}, + params={"E": None} + ).initialize() + + +def init(model, X=None, Y=None): + vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")) + model.set_param("E", vectors_table) + + +def forward(model, docs, is_train): + if not docs: + return [] + ids = [] + output = [] + E = model.get_param("E") + nC = model.get_dim("nC") + nM = model.get_dim("nM") + nO = model.get_dim("nO") + # This assists in indexing; it's like looping over this dimension. + # Still consider this weird witch craft...But thanks to Mark Neumann + # for the tip. + nCv = model.ops.xp.arange(nC) + for doc in docs: + doc_ids = doc.to_utf8_array(nr_char=nC) + doc_vectors = model.ops.alloc3f(len(doc), nC, nM) + # Let's say I have a 2d array of indices, and a 3d table of data. What numpy + # incantation do I chant to get + # output[i, j, k] == data[j, ids[i, j], k]? 
+ doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]] + output.append(doc_vectors.reshape((len(doc), nO))) + ids.append(doc_ids) + + def backprop(d_output): + dE = model.ops.alloc(E.shape, dtype=E.dtype) + for doc_ids, d_doc_vectors in zip(ids, d_output): + d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM)) + dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv] + model.inc_grad("E", dE) + return [] + + return output, backprop diff --git a/spacy/ml/_layers.py b/spacy/ml/_layers.py new file mode 100644 index 000000000..e6aa798e7 --- /dev/null +++ b/spacy/ml/_layers.py @@ -0,0 +1,165 @@ +from thinc.model import Model +from thinc.api import normal_init + + +def PrecomputableAffine(nO, nI, nF, nP): + model = Model( + "precomputable_affine", + forward, + init=init, + dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP}, + params={"W": None, "b": None, "pad": None}, + ) + model.initialize() + return model + + +def forward(model, X, is_train): + nF = model.get_dim("nF") + nO = model.get_dim("nO") + nP = model.get_dim("nP") + nI = model.get_dim("nI") + W = model.get_param("W") + Yf = model.ops.gemm( + X, W.reshape((nF * nO * nP, nI)), trans2=True + ) + Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) + Yf = model.ops.xp.vstack((model.get_param("pad"), Yf)) + + def backward(dY_ids): + # This backprop is particularly tricky, because we get back a different + # thing from what we put out. We put out an array of shape: + # (nB, nF, nO, nP), and get back: + # (nB, nO, nP) and ids (nB, nF) + # The ids tell us the values of nF, so we would have: + # + # dYf = zeros((nB, nF, nO, nP)) + # for b in range(nB): + # for f in range(nF): + # dYf[b, ids[b, f]] += dY[b] + # + # However, we avoid building that array for efficiency -- and just pass + # in the indices. 
+ dY, ids = dY_ids + assert dY.ndim == 3 + assert dY.shape[1] == nO, dY.shape + assert dY.shape[2] == nP, dY.shape + nB = dY.shape[0] + model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids)) + Xf = X[ids] + Xf = Xf.reshape((Xf.shape[0], nF * nI)) + + model.inc_grad("b", dY.sum(axis=0)) + dY = dY.reshape((dY.shape[0], nO * nP)) + + Wopfi = W.transpose((1, 2, 0, 3)) + Wopfi = model.ops.xp.ascontiguousarray(Wopfi) + Wopfi = Wopfi.reshape((nO * nP, nF * nI)) + dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) + + # Reuse the buffer + dWopfi = Wopfi + dWopfi.fill(0.0) + model.ops.gemm(dY, Xf, out=dWopfi, trans1=True) + dWopfi = dWopfi.reshape((nO, nP, nF, nI)) + # (o, p, f, i) --> (f, o, p, i) + model.inc_grad("W", dWopfi.transpose((2, 0, 1, 3))) + return dXf.reshape((dXf.shape[0], nF, nI)) + + return Yf, backward + + +def _backprop_precomputable_affine_padding(model, dY, ids): + nB = dY.shape[0] + nF = model.get_dim("nF") + nP = model.get_dim("nP") + nO = model.get_dim("nO") + # Backprop the "padding", used as a filler for missing values. + # Values that are missing are set to -1, and each state vector could + # have multiple missing values. The padding has different values for + # different missing features. The gradient of the padding vector is: + # + # for b in range(nB): + # for f in range(nF): + # if ids[b, f] < 0: + # d_padding[0, f] += dY[b] + # + # Which can be rewritten as: + # + # for b in range(nB): + # d_pad[0, ids[b] < 0] += dY[b] + # + # I don't know how to avoid the loop without building a whole array :(. + # Cursed numpy. + d_pad = model.ops.alloc((1, nF, nO, nP)) + for b in range(nB): + d_pad[0, ids[b] < 0] += dY[b] + return d_pad + + +def init(model, X=None, Y=None): + """This is like the 'layer sequential unit variance', but instead + of taking the actual inputs, we randomly generate whitened data. + + Why's this all so complicated? 
We have a huge number of inputs, + and the maxout unit makes guessing the dynamics tricky. Instead + we set the maxout weights to values that empirically result in + whitened outputs given whitened inputs. + """ + if model.has_param("W") and model.get_param("W").any(): + return + + nF = model.get_dim("nF") + nO = model.get_dim("nO") + nP = model.get_dim("nP") + nI = model.get_dim("nI") + W = model.ops.alloc4f(nF, nO, nP, nI) + b = model.ops.alloc2f(nO, nP) + pad = model.ops.alloc4f(1, nF, nO, nP) + + ops = model.ops + W = normal_init(ops, W.shape, fan_in=nF*nI) + model.set_param("W", W) + model.set_param("b", b) + model.set_param("pad", pad) + + ids = ops.alloc((5000, nF), dtype="f") + ids += ops.xp.random.uniform(0, 1000, ids.shape) + ids = ops.asarray(ids, dtype="i") + tokvecs = ops.alloc((5000, nI), dtype="f") + tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape( + tokvecs.shape + ) + + def predict(ids, tokvecs): + # nS ids. nW tokvecs. Exclude the padding array. 
+ hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p) + vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f") + # need nS vectors + hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP)) + model.ops.scatter_add(vectors, ids.flatten(), hiddens) + vectors = vectors.reshape((vectors.shape[0], nO, nP)) + vectors += b + vectors = model.ops.asarray(vectors) + if nP >= 2: + return model.ops.maxout(vectors)[0] + else: + return vectors * (vectors >= 0) + + tol_var = 0.01 + tol_mean = 0.01 + t_max = 10 + W = model.get_param("W").copy() + b = model.get_param("b").copy() + for t_i in range(t_max): + acts1 = predict(ids, tokvecs) + var = model.ops.xp.var(acts1) + mean = model.ops.xp.mean(acts1) + if abs(var - 1.0) >= tol_var: + W /= model.ops.xp.sqrt(var) + model.set_param("W", W) + elif abs(mean) >= tol_mean: + b -= mean + model.set_param("b", b) + else: + break diff --git a/spacy/ml/_legacy_tok2vec.py b/spacy/ml/_legacy_tok2vec.py deleted file mode 100644 index e7baae380..000000000 --- a/spacy/ml/_legacy_tok2vec.py +++ /dev/null @@ -1,129 +0,0 @@ -from thinc.v2v import Model, Maxout -from thinc.i2v import HashEmbed, StaticVectors -from thinc.t2t import ExtractWindow -from thinc.misc import Residual -from thinc.misc import LayerNorm as LN -from thinc.misc import FeatureExtracter -from thinc.api import layerize, chain, clone, concatenate, with_flatten -from thinc.api import uniqued, wrap, noop - -from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE - - -def Tok2Vec(width, embed_size, **kwargs): - # Circular imports :( - from .._ml import CharacterEmbed - from .._ml import PyTorchBiLSTM - - pretrained_vectors = kwargs.get("pretrained_vectors", None) - cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3) - subword_features = kwargs.get("subword_features", True) - char_embed = kwargs.get("char_embed", False) - if char_embed: - subword_features = False - conv_depth = kwargs.get("conv_depth", 4) - bilstm_depth = kwargs.get("bilstm_depth", 0) - cols = [ID, NORM, 
PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm") - if subword_features: - prefix = HashEmbed( - width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix" - ) - suffix = HashEmbed( - width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix" - ) - shape = HashEmbed( - width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape" - ) - else: - prefix, suffix, shape = (None, None, None) - if pretrained_vectors is not None: - glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID)) - - if subword_features: - embed = uniqued( - (glove | norm | prefix | suffix | shape) - >> LN(Maxout(width, width * 5, pieces=3)), - column=cols.index(ORTH), - ) - else: - embed = uniqued( - (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)), - column=cols.index(ORTH), - ) - elif subword_features: - embed = uniqued( - (norm | prefix | suffix | shape) - >> LN(Maxout(width, width * 4, pieces=3)), - column=cols.index(ORTH), - ) - elif char_embed: - embed = concatenate_lists( - CharacterEmbed(nM=64, nC=8), - FeatureExtracter(cols) >> with_flatten(norm), - ) - reduce_dimensions = LN( - Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces) - ) - else: - embed = norm - - convolution = Residual( - ExtractWindow(nW=1) - >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) - ) - if char_embed: - tok2vec = embed >> with_flatten( - reduce_dimensions >> convolution ** conv_depth, pad=conv_depth - ) - else: - tok2vec = FeatureExtracter(cols) >> with_flatten( - embed >> convolution ** conv_depth, pad=conv_depth - ) - - if bilstm_depth >= 1: - tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth) - # Work around thinc API limitations :(. 
TODO: Revise in Thinc 7 - tok2vec.nO = width - tok2vec.embed = embed - return tok2vec - - -@layerize -def flatten(seqs, drop=0.0): - ops = Model.ops - lengths = ops.asarray([len(seq) for seq in seqs], dtype="i") - - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths, pad=0) - - X = ops.flatten(seqs, pad=0) - return X, finish_update - - -def concatenate_lists(*layers, **kwargs): # pragma: no cover - """Compose two or more models `f`, `g`, etc, such that their outputs are - concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - """ - if not layers: - return noop() - drop_factor = kwargs.get("drop_factor", 1.0) - ops = layers[0].ops - layers = [chain(layer, flatten) for layer in layers] - concat = concatenate(*layers) - - def concatenate_lists_fwd(Xs, drop=0.0): - if drop is not None: - drop *= drop_factor - lengths = ops.asarray([len(X) for X in Xs], dtype="i") - flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) - ys = ops.unflatten(flat_y, lengths) - - def concatenate_lists_bwd(d_ys, sgd=None): - return bp_flat_y(ops.flatten(d_ys), sgd=sgd) - - return ys, concatenate_lists_bwd - - model = wrap(concatenate_lists_fwd, concat) - return model diff --git a/spacy/ml/_wire.py b/spacy/ml/_wire.py deleted file mode 100644 index 2b1144fcb..000000000 --- a/spacy/ml/_wire.py +++ /dev/null @@ -1,41 +0,0 @@ -from thinc.api import layerize, wrap, noop, chain, concatenate -from thinc.v2v import Model - - -def concatenate_lists(*layers, **kwargs): # pragma: no cover - """Compose two or more models `f`, `g`, etc, such that their outputs are - concatenated, i.e. 
`concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - """ - if not layers: - return layerize(noop()) - drop_factor = kwargs.get("drop_factor", 1.0) - ops = layers[0].ops - layers = [chain(layer, flatten) for layer in layers] - concat = concatenate(*layers) - - def concatenate_lists_fwd(Xs, drop=0.0): - if drop is not None: - drop *= drop_factor - lengths = ops.asarray([len(X) for X in Xs], dtype="i") - flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) - ys = ops.unflatten(flat_y, lengths) - - def concatenate_lists_bwd(d_ys, sgd=None): - return bp_flat_y(ops.flatten(d_ys), sgd=sgd) - - return ys, concatenate_lists_bwd - - model = wrap(concatenate_lists_fwd, concat) - return model - - -@layerize -def flatten(seqs, drop=0.0): - ops = Model.ops - lengths = ops.asarray([len(seq) for seq in seqs], dtype="i") - - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths, pad=0) - - X = ops.flatten(seqs, pad=0) - return X, finish_update diff --git a/spacy/ml/common.py b/spacy/ml/common.py deleted file mode 100644 index 4ecb00e4e..000000000 --- a/spacy/ml/common.py +++ /dev/null @@ -1,21 +0,0 @@ -from thinc.api import chain -from thinc.v2v import Maxout -from thinc.misc import LayerNorm -from ..util import registry, make_layer - - -@registry.architectures.register("thinc.FeedForward.v1") -def FeedForward(config): - layers = [make_layer(layer_cfg) for layer_cfg in config["layers"]] - model = chain(*layers) - model.cfg = config - return model - - -@registry.architectures.register("spacy.LayerNormalizedMaxout.v1") -def LayerNormalizedMaxout(config): - width = config["width"] - pieces = config["pieces"] - layer = LayerNorm(Maxout(width, pieces=pieces)) - layer.nO = width - return layer diff --git a/spacy/ml/component_models.py b/spacy/ml/component_models.py new file mode 100644 index 000000000..a24c2bfce --- /dev/null +++ b/spacy/ml/component_models.py @@ -0,0 +1,222 @@ +from spacy import util +from spacy.ml.extract_ngrams import extract_ngrams + +from ..attrs 
import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE +from ..errors import Errors +from ._character_embed import CharacterEmbed + +from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged +from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors +from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain +from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued +from thinc.api import zero_init, glorot_uniform_init + + +def build_text_classifier(arch, config): + if arch == "cnn": + return build_simple_cnn_text_classifier(**config) + elif arch == "bow": + return build_bow_text_classifier(**config) + else: + raise ValueError("Unexpected textcat arch") + + +def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg): + """ + Build a simple CNN text classifier, given a token-to-vector model as inputs. + If exclusive_classes=True, a softmax non-linearity is applied, so that the + outputs sum to 1. If exclusive_classes=False, a logistic non-linearity + is applied instead, so that outputs are in the range [0, 1]. 
+ """ + with Model.define_operators({">>": chain}): + if exclusive_classes: + output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO")) + else: + # TODO: experiment with init_w=zero_init + output_layer = ( + Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) + >> Logistic() + ) + model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer + model.set_ref("tok2vec", tok2vec) + model.set_dim("nO", nr_class) + return model + + +def build_bow_text_classifier( + nr_class, exclusive_classes, ngram_size=1, no_output_layer=False, **cfg +): + with Model.define_operators({">>": chain}): + model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nr_class) + model.to_cpu() + if not no_output_layer: + output_layer = ( + Softmax(nO=nr_class) if exclusive_classes else Logistic(nO=nr_class) + ) + output_layer.to_cpu() + model = model >> output_layer + model.set_dim("nO", nr_class) + return model + + +def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): + if "entity_width" not in cfg: + raise ValueError(Errors.E144.format(param="entity_width")) + + conv_depth = cfg.get("conv_depth", 2) + cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) + pretrained_vectors = cfg.get("pretrained_vectors", None) + context_width = cfg.get("entity_width") + + with Model.define_operators({">>": chain, "**": clone}): + nel_tok2vec = Tok2Vec( + width=hidden_width, + embed_size=embed_width, + pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, + subword_features=True, + conv_depth=conv_depth, + bilstm_depth=0, + ) + + model = ( + nel_tok2vec + >> list2ragged() + >> reduce_mean() + >> residual(Maxout(nO=hidden_width, nI=hidden_width, nP=2, dropout=0.0)) + >> Linear(nO=context_width, nI=hidden_width) + ) + model.initialize() + + model.set_ref("tok2vec", nel_tok2vec) + model.set_dim("nO", context_width) + return model + + +def masked_language_model(*args, **kwargs): + raise NotImplementedError + + +def build_tagger_model(nr_class, tok2vec): + 
token_vector_width = tok2vec.get_dim("nO") + # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! + softmax = with_array(Softmax(nO=nr_class, nI=token_vector_width, init_W=zero_init)) + model = chain(tok2vec, softmax) + model.set_ref("tok2vec", tok2vec) + model.set_ref("softmax", softmax) + return model + + +def build_morphologizer_model(class_nums, **cfg): + embed_size = util.env_opt("embed_size", 7000) + if "token_vector_width" in cfg: + token_vector_width = cfg["token_vector_width"] + else: + token_vector_width = util.env_opt("token_vector_width", 128) + pretrained_vectors = cfg.get("pretrained_vectors") + char_embed = cfg.get("char_embed", True) + with Model.define_operators({">>": chain, "+": add, "**": clone}): + if "tok2vec" in cfg: + tok2vec = cfg["tok2vec"] + else: + tok2vec = Tok2Vec( + token_vector_width, + embed_size, + char_embed=char_embed, + pretrained_vectors=pretrained_vectors, + ) + softmax = with_array(MultiSoftmax(nOs=class_nums, nI=token_vector_width)) + model = tok2vec >> softmax + model.set_ref("tok2vec", tok2vec) + model.set_ref("softmax", softmax) + return model + + +def Tok2Vec( + width, + embed_size, + pretrained_vectors=None, + window_size=1, + cnn_maxout_pieces=3, + subword_features=True, + char_embed=False, + conv_depth=4, + bilstm_depth=0, +): + if char_embed: + subword_features = False + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): + norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0) + if subword_features: + prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0) + suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0) + shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0) + else: + prefix, suffix, shape = (None, None, None) + if pretrained_vectors is not None: + glove = 
StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0) + + if subword_features: + embed = uniqued( + (glove | norm | prefix | suffix | shape) + >> Maxout( + nO=width, nI=width * 5, nP=3, dropout=0.0, normalize=True + ), + column=cols.index(ORTH), + ) + else: + embed = uniqued( + (glove | norm) + >> Maxout( + nO=width, nI=width * 2, nP=3, dropout=0.0, normalize=True + ), + column=cols.index(ORTH), + ) + elif subword_features: + embed = uniqued( + concatenate(norm, prefix, suffix, shape) + >> Maxout(nO=width, nI=width * 4, nP=3, dropout=0.0, normalize=True), + column=cols.index(ORTH), + ) + elif char_embed: + embed = CharacterEmbed(nM=64, nC=8) | FeatureExtractor(cols) >> with_array( + norm + ) + reduce_dimensions = Maxout( + nO=width, + nI=64 * 8 + width, + nP=cnn_maxout_pieces, + dropout=0.0, + normalize=True, + ) + else: + embed = norm + + convolution = residual( + expand_window(window_size=window_size) + >> Maxout( + nO=width, + nI=width * 3, + nP=cnn_maxout_pieces, + dropout=0.0, + normalize=True, + ) + ) + if char_embed: + tok2vec = embed >> with_array( + reduce_dimensions >> convolution ** conv_depth, pad=conv_depth + ) + else: + tok2vec = FeatureExtractor(cols) >> with_array( + embed >> convolution ** conv_depth, pad=conv_depth + ) + + if bilstm_depth >= 1: + tok2vec = tok2vec >> PyTorchLSTM( + nO=width, nI=width, depth=bilstm_depth, bi=True + ) + # Work around thinc API limitations :(. 
TODO: Revise in Thinc 7 + tok2vec.set_dim("nO", width) + tok2vec.set_ref("embed", embed) + return tok2vec diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py new file mode 100644 index 000000000..1ec5b5fc1 --- /dev/null +++ b/spacy/ml/extract_ngrams.py @@ -0,0 +1,39 @@ +import numpy +from thinc.model import Model + +from ..attrs import LOWER + + +def extract_ngrams(ngram_size, attr=LOWER) -> Model: + model = Model("extract_ngrams", forward) + model.attrs["ngram_size"] = ngram_size + model.attrs["attr"] = attr + return model + + +def forward(self, docs, is_train: bool): + batch_keys = [] + batch_vals = [] + for doc in docs: + unigrams = doc.to_array([self.attrs["attr"]]) + ngrams = [unigrams] + for n in range(2, self.attrs["ngram_size"] + 1): + ngrams.append(self.ops.ngrams(n, unigrams)) + keys = self.ops.xp.concatenate(ngrams) + keys, vals = self.ops.xp.unique(keys, return_counts=True) + batch_keys.append(keys) + batch_vals.append(vals) + # The dtype here matches what thinc is expecting -- which differs per + # platform (by int definition). This should be fixed once the problem + # is fixed on Thinc's side. 
+ lengths = self.ops.asarray( + [arr.shape[0] for arr in batch_keys], dtype=numpy.int_ + ) + batch_keys = self.ops.xp.concatenate(batch_keys) + batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f") + + def backprop(dY): + return dY + + return (batch_keys, batch_vals, lengths), backprop + diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py index 9a0ed6bf5..102b88604 100644 --- a/spacy/ml/tok2vec.py +++ b/spacy/ml/tok2vec.py @@ -1,11 +1,12 @@ -from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued -from thinc.api import noop, with_square_sequences -from thinc.v2v import Maxout, Model -from thinc.i2v import HashEmbed, StaticVectors -from thinc.t2t import ExtractWindow -from thinc.misc import Residual, LayerNorm, FeatureExtracter +from thinc.layers import chain, clone, concatenate, with_array, uniqued +from thinc.model import Model +from thinc.layers import noop, with_padded +from thinc.layers import Maxout, expand_window +from thinc.layers import HashEmbed, StaticVectors +from thinc.layers import residual, LayerNorm, FeatureExtractor + +from spacy.ml import _character_embed from ..util import make_layer, registry -from ._wire import concatenate_lists @registry.architectures.register("spacy.Tok2Vec.v1") @@ -13,19 +14,21 @@ def Tok2Vec(config): doc2feats = make_layer(config["@doc2feats"]) embed = make_layer(config["@embed"]) encode = make_layer(config["@encode"]) - field_size = getattr(encode, "receptive_field", 0) - tok2vec = chain(doc2feats, with_flatten(chain(embed, encode), pad=field_size)) - tok2vec.cfg = config - tok2vec.nO = encode.nO - tok2vec.embed = embed - tok2vec.encode = encode + field_size = 0 + if encode.has_attr("receptive_field"): + field_size = encode.attrs["receptive_field"] + tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size)) + tok2vec.attrs["cfg"] = config + tok2vec.set_dim("nO", encode.get_dim("nO")) + tok2vec.set_ref("embed", embed) + tok2vec.set_ref("encode", encode) 
return tok2vec @registry.architectures.register("spacy.Doc2Feats.v1") def Doc2Feats(config): columns = config["columns"] - return FeatureExtracter(columns) + return FeatureExtractor(columns) @registry.architectures.register("spacy.MultiHashEmbed.v1") @@ -40,55 +43,47 @@ def MultiHashEmbed(config): width = config["width"] rows = config["rows"] - norm = HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm") + norm = HashEmbed(width, rows, column=cols.index("NORM"), dropout=0.0) if config["use_subwords"]: - prefix = HashEmbed( - width, rows // 2, column=cols.index("PREFIX"), name="embed_prefix" - ) - suffix = HashEmbed( - width, rows // 2, column=cols.index("SUFFIX"), name="embed_suffix" - ) - shape = HashEmbed( - width, rows // 2, column=cols.index("SHAPE"), name="embed_shape" - ) + prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"), dropout=0.0) + suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"), dropout=0.0) + shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"), dropout=0.0) if config.get("@pretrained_vectors"): glove = make_layer(config["@pretrained_vectors"]) mix = make_layer(config["@mix"]) with Model.define_operators({">>": chain, "|": concatenate}): if config["use_subwords"] and config["@pretrained_vectors"]: - mix._layers[0].nI = width * 5 + mix._layers[0].set_dim("nI", width * 5) layer = uniqued( (glove | norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH"), ) elif config["use_subwords"]: - mix._layers[0].nI = width * 4 + mix._layers[0].set_dim("nI", width * 4) layer = uniqued( (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH") ) elif config["@pretrained_vectors"]: - mix._layers[0].nI = width * 2 + mix._layers[0].set_dim("nI", width * 2) layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),) else: layer = norm - layer.cfg = config + layer.attrs["cfg"] = config return layer @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(config): - 
from .. import _ml - width = config["width"] chars = config["chars"] - chr_embed = _ml.CharacterEmbedModel(nM=width, nC=chars) + chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars) other_tables = make_layer(config["@embed_features"]) mix = make_layer(config["@mix"]) - model = chain(concatenate_lists(chr_embed, other_tables), mix) - model.cfg = config + model = chain(concatenate(chr_embed, other_tables), mix) + model.attrs["cfg"] = config return model @@ -99,48 +94,49 @@ def MaxoutWindowEncoder(config): nP = config["pieces"] depth = config["depth"] - cnn = chain( - ExtractWindow(nW=nW), LayerNorm(Maxout(nO, nO * ((nW * 2) + 1), pieces=nP)) - ) - model = clone(Residual(cnn), depth) - model.nO = nO - model.receptive_field = nW * depth + cnn = expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True) + model = clone(residual(cnn), depth) + model.set_dim("nO", nO) + model.attrs["receptive_field"] = nW * depth return model @registry.architectures.register("spacy.MishWindowEncoder.v1") def MishWindowEncoder(config): - from thinc.v2v import Mish + from thinc.layers import Mish nO = config["width"] nW = config["window_size"] depth = config["depth"] - cnn = chain(ExtractWindow(nW=nW), LayerNorm(Mish(nO, nO * ((nW * 2) + 1)))) - model = clone(Residual(cnn), depth) - model.nO = nO + cnn = chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO)) + model = clone(residual(cnn), depth) + model.set_dim("nO", nO) return model @registry.architectures.register("spacy.PretrainedVectors.v1") def PretrainedVectors(config): - return StaticVectors(config["vectors_name"], config["width"], config["column"]) + # TODO: actual vectors instead of name + return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0) @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") def TorchBiLSTMEncoder(config): import torch.nn - from thinc.extra.wrappers import 
PyTorchWrapperRNN + # TODO FIX + from thinc.layers import PyTorchRNNWrapper width = config["width"] depth = config["depth"] if depth == 0: - return layerize(noop()) - return with_square_sequences( - PyTorchWrapperRNN(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) + return noop() + return with_padded( + PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) ) +# TODO: update _EXAMPLE_CONFIG = { "@doc2feats": { "arch": "Doc2Feats", diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 2f9824eda..6a90de81c 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -3,6 +3,7 @@ from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer from .pipes import SentenceRecognizer from .morphologizer import Morphologizer from .entityruler import EntityRuler +from .tok2vec import Tok2Vec from .hooks import SentenceSegmenter, SimilarityHook from .functions import merge_entities, merge_noun_chunks, merge_subtokens @@ -13,6 +14,7 @@ __all__ = [ "EntityLinker", "TextCategorizer", "Tensorizer", + "Tok2Vec", "Pipe", "Morphologizer", "EntityRuler", diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index 68385c5a9..00c328e81 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -1,9 +1,8 @@ -from thinc.t2v import Pooling, max_pool, mean_pool -from thinc.neural._classes.difference import Siamese, CauchySimilarity +from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity from .pipes import Pipe from ..language import component -from .._ml import link_vectors_to_models +from ..util import link_vectors_to_models @component("sentencizer_hook", assigns=["doc.user_hooks"]) @@ -63,7 +62,10 @@ class SimilarityHook(Pipe): @classmethod def Model(cls, length): - return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) + return siamese( + concatenate(reduce_max(), reduce_mean()), + CauchySimilarity(length * 2) + ) def __call__(self, doc): 
"""Install similarity hook""" @@ -80,7 +82,7 @@ class SimilarityHook(Pipe): def update(self, doc1_doc2, golds, sgd=None, drop=0.0): self.require_model() - sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) + sims, bp_sims = self.model.begin_update(doc1_doc2) def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): """Allocate model, using width from tensorizer in pipeline. @@ -89,7 +91,7 @@ class SimilarityHook(Pipe): pipeline (list): The pipeline the model is part of. """ if self.model is True: - self.model = self.Model(pipeline[0].model.nO) + self.model = self.Model(pipeline[0].model.get_dim("nO")) link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 10038d410..7b9e4b04e 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -3,19 +3,20 @@ from collections import defaultdict import numpy cimport numpy as np -from thinc.api import chain -from thinc.neural.util import to_categorical, copy_array, get_array_module +from thinc.layers import chain, list2array +from thinc.util import to_categorical, copy_array, get_array_module + from .. 
import util from .pipes import Pipe from ..language import component -from .._ml import Tok2Vec, build_morphologizer_model -from .._ml import link_vectors_to_models, zero_init, flatten -from .._ml import create_default_optimizer +from ..util import link_vectors_to_models, create_default_optimizer from ..errors import Errors, TempErrors from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology +from ..ml.component_models import build_morphologizer_model + @component("morphologizer", assigns=["token.morph", "token.pos"]) class Morphologizer(Pipe): @@ -43,7 +44,7 @@ class Morphologizer(Pipe): if self.model in (None, True, False): return None else: - return chain(self.model.tok2vec, flatten) + return chain(self.model.get_ref("tok2vec"), list2array()) def __call__(self, doc): features, tokvecs = self.predict([doc]) @@ -60,9 +61,9 @@ class Morphologizer(Pipe): def predict(self, docs): if not any(len(doc) for doc in docs): # Handle case where there are no tokens in any docs. - n_labels = self.model.nO - guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs] - tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO)) + n_labels = self.model.get_dim("nO") + guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] + tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO"))) return guesses, tokvecs tokvecs = self.model.tok2vec(docs) scores = self.model.softmax(tokvecs) @@ -77,7 +78,7 @@ class Morphologizer(Pipe): for field in self._class_map.fields] for i, doc in enumerate(docs): doc_scores = batch_scores[i] - doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes) + doc_guesses = scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"]) # Convert the neuron indices into feature IDs. 
doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i') for j in range(len(doc)): @@ -110,7 +111,7 @@ class Morphologizer(Pipe): def get_loss(self, examples, scores): guesses = [] for doc_scores in scores: - guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes)) + guesses.append(scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"])) guesses = self.model.ops.xp.vstack(guesses) scores = self.model.ops.xp.vstack(scores) if not isinstance(scores, numpy.ndarray): @@ -120,7 +121,7 @@ class Morphologizer(Pipe): cdef int idx = 0 # Do this on CPU, as we can't vectorize easily. target = numpy.zeros(scores.shape, dtype='f') - field_sizes = self.model.softmax.out_sizes + field_sizes = self.model.get_ref("softmax").attrs["nOs"] for example in examples: doc = example.doc gold = example.gold diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 4f0f2469e..bca53bc03 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,11 +3,11 @@ import numpy import srsly import random -from thinc.api import chain -from thinc.v2v import Affine, Maxout, Softmax -from thinc.misc import LayerNorm -from thinc.neural.util import to_categorical -from thinc.neural.util import get_array_module +from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array +from thinc.initializers import zero_init +from thinc.loss import CosineDistance +from thinc.util import to_categorical, get_array_module +from thinc.model import set_dropout_rate from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser @@ -21,13 +21,14 @@ from ..language import Language, component from ..syntax import nonproj from ..gold import Example from ..attrs import POS, ID +from ..util import link_vectors_to_models, create_default_optimizer from ..parts_of_speech import X from ..kb import KnowledgeBase -from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss -from .._ml import build_text_classifier, 
build_simple_cnn_text_classifier -from .._ml import build_bow_text_classifier, build_nel_encoder -from .._ml import link_vectors_to_models, zero_init, flatten -from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss +from ..ml.component_models import Tok2Vec, build_tagger_model +from ..ml.component_models import build_text_classifier +from ..ml.component_models import build_simple_cnn_text_classifier +from ..ml.component_models import build_bow_text_classifier, build_nel_encoder +from ..ml.component_models import masked_language_model from ..errors import Errors, TempErrors, user_warning, Warnings from .. import util @@ -126,13 +127,15 @@ class Pipe(object): """Modify a batch of documents, using pre-computed scores.""" raise NotImplementedError - def update(self, examples, drop=0.0, sgd=None, losses=None): + def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): """Learn from a batch of documents and gold-standard information, updating the pipe's model. Delegates to predict() and get_loss(). """ - pass + if set_annotations: + docs = (self._get_doc(ex) for ex in examples) + docs = list(self.pipe(docs)) def rehearse(self, examples, sgd=None, losses=None, **config): pass @@ -152,7 +155,7 @@ class Pipe(object): raise NotImplementedError def create_optimizer(self): - return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {})) + return create_default_optimizer() def begin_training( self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs @@ -163,10 +166,30 @@ class Pipe(object): self.model = self.Model(**self.cfg) if hasattr(self, "vocab"): link_vectors_to_models(self.vocab) + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd + def get_gradients(self): + """Get non-zero gradients of the model's parameters, as a dictionary + keyed by the parameter ID. The values are (weights, gradients) tuples. 
+ """ + gradients = {} + if self.model in (None, True, False): + return gradients + queue = [self.model] + seen = set() + for node in queue: + if node.id in seen: + continue + seen.add(node.id) + if hasattr(node, "_mem") and node._mem.gradient.any(): + gradients[node.id] = [node._mem.weights, node._mem.gradient] + if hasattr(node, "_layers"): + queue.extend(node._layers) + return gradients + def use_params(self, params): """Modify the pipe's model, to use the given parameter values.""" with self.model.use_params(params): @@ -193,7 +216,7 @@ class Pipe(object): def load_model(b): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors.name + self.cfg["pretrained_vectors"] = self.vocab.vectors if self.model is True: self.model = self.Model(**self.cfg) try: @@ -226,7 +249,7 @@ class Pipe(object): def load_model(p): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors.name + self.cfg["pretrained_vectors"] = self.vocab.vectors if self.model is True: self.model = self.Model(**self.cfg) try: @@ -254,10 +277,10 @@ class Tensorizer(Pipe): width (int): Output size of the model. embed_size (int): Number of vectors in the embedding table. **cfg: Config parameters. - RETURNS (Model): A `thinc.neural.Model` or similar instance. + RETURNS (Model): A `thinc.model.Model` or similar instance. """ input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96)) - return zero_init(Affine(output_size, input_size, drop_factor=0.0)) + return Linear(output_size, input_size, init_W=zero_init) def __init__(self, vocab, model=True, **cfg): """Construct a new statistical model. 
Weights are not allocated on @@ -277,7 +300,6 @@ class Tensorizer(Pipe): self.model = model self.input_models = [] self.cfg = dict(cfg) - self.cfg.setdefault("cnn_maxout_pieces", 3) def __call__(self, example): """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM @@ -337,7 +359,7 @@ class Tensorizer(Pipe): raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) doc.tensor = tensor - def update(self, examples, state=None, drop=0.0, sgd=None, losses=None): + def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None): """Update the model. docs (iterable): A batch of `Doc` objects. @@ -350,17 +372,23 @@ class Tensorizer(Pipe): examples = Example.to_example_objects(examples) inputs = [] bp_inputs = [] + set_dropout_rate(self.model, drop) for tok2vec in self.input_models: - tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples], drop=drop) + set_dropout_rate(tok2vec, drop) + tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples]) inputs.append(tensor) bp_inputs.append(bp_tensor) inputs = self.model.ops.xp.hstack(inputs) - scores, bp_scores = self.model.begin_update(inputs, drop=drop) + scores, bp_scores = self.model.begin_update(inputs) loss, d_scores = self.get_loss(examples, scores) d_inputs = bp_scores(d_scores, sgd=sgd) d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1) for d_input, bp_input in zip(d_inputs, bp_inputs): - bp_input(d_input, sgd=sgd) + bp_input(d_input) + if sgd is not None: + for tok2vec in self.input_models: + tok2vec.finish_update(sgd) + self.model.finish_update(sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += loss @@ -387,6 +415,7 @@ class Tensorizer(Pipe): self.input_models.append(model.tok2vec) if self.model is True: self.model = self.Model(**self.cfg) + self.model.initialize() link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() @@ -405,7 +434,6 @@ class 
Tagger(Pipe): self.model = model self._rehearsal_model = None self.cfg = dict(sorted(cfg.items())) - self.cfg.setdefault("cnn_maxout_pieces", 2) @property def labels(self): @@ -416,12 +444,12 @@ class Tagger(Pipe): if self.model in (None, True, False): return None else: - return chain(self.model.tok2vec, flatten) + return chain(self.model.get_ref("tok2vec"), list2array()) def __call__(self, example): doc = self._get_doc(example) - tags, tokvecs = self.predict([doc]) - self.set_annotations([doc], tags, tensors=tokvecs) + tags = self.predict([doc]) + self.set_annotations([doc], tags) if isinstance(example, Example): example.doc = doc return example @@ -430,8 +458,10 @@ class Tagger(Pipe): def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): for examples in util.minibatch(stream, size=batch_size): docs = [self._get_doc(ex) for ex in examples] - tag_ids, tokvecs = self.predict(docs) - self.set_annotations(docs, tag_ids, tensors=tokvecs) + tag_ids = self.predict(docs) + assert len(docs) == len(examples) + assert len(tag_ids) == len(examples) + self.set_annotations(docs, tag_ids) if as_example: annotated_examples = [] @@ -447,20 +477,25 @@ class Tagger(Pipe): if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
n_labels = len(self.labels) - guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs] - tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO)) - return guesses, tokvecs - tokvecs = self.model.tok2vec(docs) - scores = self.model.softmax(tokvecs) + guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] + assert len(guesses) == len(docs) + return guesses + scores = self.model.predict(docs) + assert len(scores) == len(docs), (len(scores), len(docs)) + guesses = self._scores2guesses(scores) + assert len(guesses) == len(docs) + return guesses + + def _scores2guesses(self, scores): guesses = [] for doc_scores in scores: doc_guesses = doc_scores.argmax(axis=1) if not isinstance(doc_guesses, numpy.ndarray): doc_guesses = doc_guesses.get() guesses.append(doc_guesses) - return guesses, tokvecs + return guesses - def set_annotations(self, docs, batch_tag_ids, tensors=None): + def set_annotations(self, docs, batch_tag_ids): if isinstance(docs, Doc): docs = [docs] cdef Doc doc @@ -483,15 +518,9 @@ class Tagger(Pipe): else: doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] idx += 1 - if tensors is not None and len(tensors): - if isinstance(doc.tensor, numpy.ndarray) \ - and not isinstance(tensors[i], numpy.ndarray): - doc.extend_tensor(tensors[i].get()) - else: - doc.extend_tensor(tensors[i]) doc.is_tagged = True - def update(self, examples, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): self.require_model() examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: @@ -500,13 +529,18 @@ class Tagger(Pipe): if not any(len(ex.doc) if ex.doc else 0 for ex in examples): # Handle cases where there are no tokens in any docs. 
return - - tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop) + set_dropout_rate(self.model, drop) + tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) loss, d_tag_scores = self.get_loss(examples, tag_scores) - bp_tag_scores(d_tag_scores, sgd=sgd) + bp_tag_scores(d_tag_scores) + if sgd not in (None, False): + self.model.finish_update(sgd) if losses is not None: losses[self.name] += loss + if set_annotations: + docs = [ex.doc for ex in examples] + self.set_annotations(docs, self._scores2guesses(tag_scores)) def rehearse(self, examples, drop=0., sgd=None, losses=None): """Perform a 'rehearsal' update, where we try to match the output of @@ -519,10 +553,12 @@ class Tagger(Pipe): if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return - guesses, backprop = self.model.begin_update(docs, drop=drop) + set_dropout_rate(self.model, drop) + guesses, backprop = self.model.begin_update(docs) target = self._rehearsal_model(examples) gradient = guesses - target - backprop(gradient, sgd=sgd) + backprop(gradient) + self.model.finish_update(sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += (gradient**2).sum() @@ -546,7 +582,7 @@ class Tagger(Pipe): known_labels[idx] = 0. 
idx += 1 correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() docs = [ex.doc for ex in examples] @@ -566,6 +602,7 @@ class Tagger(Pipe): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} + cdef Vocab vocab = self.vocab if new_tag_map: vocab.morphology = Morphology(vocab.strings, new_tag_map, @@ -577,16 +614,39 @@ class Tagger(Pipe): if hp in kwargs: self.cfg[hp] = kwargs[hp] self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + # Get batch of example docs, example outputs to call begin_training(). + # This lets the model infer shapes. + n_tags = self.vocab.morphology.n_tags + for node in self.model.walk(): + # TODO: softmax hack ? + if node.name == "softmax" and node.has_dim("nO") is None: + node.set_dim("nO", n_tags) link_vectors_to_models(self.vocab) + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd @classmethod - def Model(cls, n_tags, **cfg): + def Model(cls, n_tags=None, **cfg): if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"): raise ValueError(TempErrors.T008) - return build_tagger_model(n_tags, **cfg) + if "tok2vec" in cfg: + tok2vec = cfg["tok2vec"] + else: + config = { + "width": cfg.get("token_vector_width", 96), + "embed_size": cfg.get("embed_size", 2000), + "pretrained_vectors": cfg.get("pretrained_vectors", None), + "window_size": cfg.get("window_size", 1), + "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3), + "subword_features": cfg.get("subword_features", True), + "char_embed": cfg.get("char_embed", False), + "conv_depth": cfg.get("conv_depth", 4), + "bilstm_depth": cfg.get("bilstm_depth", 0), + } + tok2vec = Tok2Vec(**config) + return build_tagger_model(n_tags, tok2vec) def add_label(self, label, values=None): if not isinstance(label, str): 
@@ -633,12 +693,12 @@ class Tagger(Pipe): def load_model(b): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors.name + self.cfg["pretrained_vectors"] = self.vocab.vectors if self.model is True: token_vector_width = util.env_opt( "token_vector_width", self.cfg.get("token_vector_width", 96)) - self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + self.model = self.Model(**self.cfg) try: self.model.from_bytes(b) except AttributeError: @@ -676,9 +736,9 @@ class Tagger(Pipe): def load_model(p): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors.name + self.cfg["pretrained_vectors"] = self.vocab.vectors if self.model is True: - self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + self.model = self.Model(**self.cfg) with p.open("rb") as file_: try: self.model.from_bytes(file_.read()) @@ -753,10 +813,12 @@ class SentenceRecognizer(Tagger): if not any(len(ex.doc) if ex.doc else 0 for ex in examples): # Handle cases where there are no tokens in any docs. return - - tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop) + set_dropout_rate(self.model, drop) + tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) loss, d_tag_scores = self.get_loss(examples, tag_scores) - bp_tag_scores(d_tag_scores, sgd=sgd) + bp_tag_scores(d_tag_scores) + if sgd is not None: + self.model.finish_update(sgd) if losses is not None: losses[self.name] += loss @@ -780,7 +842,7 @@ class SentenceRecognizer(Tagger): known_labels[idx] = 0. 
idx += 1 correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() docs = [ex.doc for ex in examples] @@ -797,6 +859,7 @@ class SentenceRecognizer(Tagger): self.model = self.Model(len(self.labels), **self.cfg) if sgd is None: sgd = self.create_optimizer() + self.model.initialize() return sgd @classmethod @@ -918,6 +981,7 @@ class MultitaskObjective(Tagger): token_vector_width = util.env_opt("token_vector_width") self.model = self.Model(len(self.labels), tok2vec=tok2vec) link_vectors_to_models(self.vocab) + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd @@ -925,14 +989,12 @@ class MultitaskObjective(Tagger): @classmethod def Model(cls, n_tags, tok2vec=None, **cfg): token_vector_width = util.env_opt("token_vector_width", 96) - softmax = Softmax(n_tags, token_vector_width*2) model = chain( tok2vec, - LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)), - softmax + Maxout(nO=token_vector_width*2, nI=token_vector_width, nP=3, dropout=0.0), + LayerNorm(token_vector_width*2), + Softmax(nO=n_tags, nI=token_vector_width*2) ) - model.tok2vec = tok2vec - model.softmax = softmax return model def predict(self, docs): @@ -958,7 +1020,7 @@ class MultitaskObjective(Tagger): correct[idx] = self.labels[label] idx += 1 correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) loss = (d_scores**2).sum() return float(loss), d_scores @@ -1047,19 +1109,18 @@ class ClozeMultitask(Pipe): def Model(cls, vocab, tok2vec, **cfg): output_size = vocab.vectors.data.shape[1] output_layer = chain( - LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)), - zero_init(Affine(output_size, 
output_size, drop_factor=0.0)) + Maxout(nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0), + Linear(nO=output_size, nI=output_size, init_W=zero_init) ) model = chain(tok2vec, output_layer) model = masked_language_model(vocab, model) - model.tok2vec = tok2vec - model.output_layer = output_layer return model def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model self.cfg = cfg + self.distance = CosineDistance(ignore_zeros=True, normalize=False) def set_annotations(self, docs, dep_ids, tensors=None): pass @@ -1069,7 +1130,8 @@ class ClozeMultitask(Pipe): link_vectors_to_models(self.vocab) if self.model is True: self.model = self.Model(self.vocab, tok2vec) - X = self.model.ops.allocate((5, self.model.tok2vec.nO)) + X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) + self.model.initialize() self.model.output_layer.begin_training(X) if sgd is None: sgd = self.create_optimizer() @@ -1088,10 +1150,11 @@ class ClozeMultitask(Pipe): # and look them up all at once. This prevents data copying. ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) target = vectors[ids] - loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True) - return float(loss), gradient + gradient = self.distance.get_grad(prediction, target) + loss = self.distance.get_loss(prediction, target) + return loss, gradient - def update(self, examples, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): pass def rehearse(self, examples, drop=0., sgd=None, losses=None): @@ -1099,9 +1162,12 @@ class ClozeMultitask(Pipe): examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. 
- predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples], drop=drop) + set_dropout_rate(self.model, drop) + predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples]) loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) - bp_predictions(d_predictions, sgd=sgd) + bp_predictions(d_predictions) + if sgd is not None: + self.model.finish_update(sgd) if losses is not None: losses[self.name] += loss @@ -1115,19 +1181,45 @@ class TextCategorizer(Pipe): """ @classmethod - def Model(cls, nr_class=1, **cfg): - embed_size = util.env_opt("embed_size", 2000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] + def Model(cls, nr_class=1, exclusive_classes=None, **cfg): + if nr_class == 1: + exclusive_classes = False + if exclusive_classes is None: + raise ValueError( + "TextCategorizer Model must specify 'exclusive_classes'. " + "This setting determines whether the model will output " + "scores that sum to 1 for each example. If only one class " + "is true for each example, you should set exclusive_classes=True. " + "For 'multi_label' classification, set exclusive_classes=False." 
+ ) + if "embed_size" not in cfg: + cfg["embed_size"] = util.env_opt("embed_size", 2000) + if "token_vector_width" not in cfg: + cfg["token_vector_width"] = util.env_opt("token_vector_width", 96) + if cfg.get("architecture") == "bow": + return build_bow_text_classifier(nr_class, exclusive_classes, **cfg) else: - token_vector_width = util.env_opt("token_vector_width", 96) - if cfg.get("architecture") == "simple_cnn": - tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg) - return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg) - elif cfg.get("architecture") == "bow": - return build_bow_text_classifier(nr_class, **cfg) - else: - return build_text_classifier(nr_class, **cfg) + if "tok2vec" in cfg: + tok2vec = cfg["tok2vec"] + else: + config = { + "width": cfg.get("token_vector_width", 96), + "embed_size": cfg.get("embed_size", 2000), + "pretrained_vectors": cfg.get("pretrained_vectors", None), + "window_size": cfg.get("window_size", 1), + "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3), + "subword_features": cfg.get("subword_features", True), + "char_embed": cfg.get("char_embed", False), + "conv_depth": cfg.get("conv_depth", 4), + "bilstm_depth": cfg.get("bilstm_depth", 0), + } + tok2vec = Tok2Vec(**config) + return build_simple_cnn_text_classifier( + tok2vec, + nr_class, + exclusive_classes, + **cfg + ) @property def tok2vec(self): @@ -1141,6 +1233,8 @@ class TextCategorizer(Pipe): self.model = model self._rehearsal_model = None self.cfg = dict(cfg) + if "exclusive_classes" not in cfg: + self.cfg["exclusive_classes"] = True @property def labels(self): @@ -1180,7 +1274,7 @@ class TextCategorizer(Pipe): scores = xp.zeros((len(docs), len(self.labels))) return scores, tensors - scores = self.model(docs) + scores = self.model.predict(docs) scores = self.model.ops.asarray(scores) return scores, tensors @@ -1189,18 +1283,24 @@ class TextCategorizer(Pipe): for j, label in enumerate(self.labels): doc.cats[label] = float(scores[i, j]) - def update(self, 
examples, state=None, drop=0., sgd=None, losses=None): + def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): self.require_model() examples = Example.to_example_objects(examples) if not any(len(ex.doc) if ex.doc else 0 for ex in examples): # Handle cases where there are no tokens in any docs. return - scores, bp_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop) + set_dropout_rate(self.model, drop) + scores, bp_scores = self.model.begin_update([ex.doc for ex in examples]) loss, d_scores = self.get_loss(examples, scores) - bp_scores(d_scores, sgd=sgd) + bp_scores(d_scores) + if sgd is not None: + self.model.finish_update(sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += loss + if set_annotations: + docs = [ex.doc for ex in examples] + self.set_annotations(docs, scores=scores) def rehearse(self, examples, drop=0., sgd=None, losses=None): if self._rehearsal_model is None: @@ -1210,10 +1310,13 @@ class TextCategorizer(Pipe): if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return - scores, bp_scores = self.model.begin_update(docs, drop=drop) + set_dropout_rate(self.model, drop) + scores, bp_scores = self.model.begin_update(docs) target = self._rehearsal_model(examples) gradient = scores - target - bp_scores(gradient, sgd=sgd) + bp_scores(gradient) + if sgd is not None: + self.model.finish_update(sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += (gradient**2).sum() @@ -1247,7 +1350,7 @@ class TextCategorizer(Pipe): # - a huge problem. 
raise ValueError(Errors.E116) # smaller = self.model._layers[-1] - # larger = Affine(len(self.labels)+1, smaller.nI) + # larger = Linear(len(self.labels)+1, smaller.nI) # copy_array(larger.W[:smaller.nO], smaller.W) # copy_array(larger.b[:smaller.nO], smaller.b) # self.model._layers[-1] = larger @@ -1259,12 +1362,15 @@ class TextCategorizer(Pipe): for cat in example.doc_annotation.cats: self.add_label(cat) if self.model is True: - self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") + self.cfg.update(kwargs) self.require_labels() self.model = self.Model(len(self.labels), **self.cfg) link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() + # TODO: use get_examples instead + docs = [Doc(Vocab(), words=["hello"])] + self.model.initialize(X=docs) return sgd @@ -1382,6 +1488,7 @@ class EntityLinker(Pipe): self.model = True self.kb = None self.cfg = dict(cfg) + self.distance = CosineDistance(normalize=False) def set_kb(self, kb): self.kb = kb @@ -1399,16 +1506,14 @@ class EntityLinker(Pipe): def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): self.require_kb() self.cfg["entity_width"] = self.kb.entity_vector_length - if self.model is True: self.model = self.Model(**self.cfg) - + self.model.initialize() if sgd is None: sgd = self.create_optimizer() - return sgd - def update(self, examples, state=None, drop=0.0, sgd=None, losses=None): + def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None): self.require_model() self.require_kb() if losses is not None: @@ -1416,9 +1521,12 @@ class EntityLinker(Pipe): if not examples: return 0 examples = Example.to_example_objects(examples) - sentence_docs = [] docs = [ex.doc for ex in examples] + if set_annotations: + # This seems simpler than other ways to get that exact output -- but + # it does run the model twice :( + predictions = self.model.predict(docs) golds = [ex.gold for ex in examples] for doc, gold in zip(docs, 
golds): @@ -1443,13 +1551,17 @@ class EntityLinker(Pipe): except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly warning raise RuntimeError(Errors.E030) - - sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) + set_dropout_rate(self.model, drop) + sentence_encodings, bp_context = self.model.begin_update(sentence_docs) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds) - bp_context(d_scores, sgd=sgd) + bp_context(d_scores) + if sgd is not None: + self.model.finish_update(sgd) if losses is not None: losses[self.name] += loss + if set_annotations: + self.set_annotations(docs, predictions) return loss def get_similarity_loss(self, golds, scores): @@ -1467,7 +1579,8 @@ class EntityLinker(Pipe): if scores.shape != entity_encodings.shape: raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up")) - loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings) + gradients = self.distance.get_grad(scores, entity_encodings) + loss = self.distance.get_loss(scores, entity_encodings) loss = loss / len(entity_encodings) return loss, gradients @@ -1533,7 +1646,7 @@ class EntityLinker(Pipe): for sent in doc.sents: sent_doc = sent.as_doc() # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] + sentence_encoding = self.model.predict([sent_doc])[0] xp = get_array_module(sentence_encoding) sentence_encoding_t = sentence_encoding.T sentence_norm = xp.linalg.norm(sentence_encoding_t) @@ -1720,7 +1833,6 @@ class Sentencizer(Pipe): self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) - if as_example: annotated_examples = [] for ex, doc in zip(examples, docs): @@ -1729,7 +1841,7 @@ class Sentencizer(Pipe): yield from annotated_examples else: yield from docs - + def predict(self, docs): """Apply the pipeline's model 
to a batch of docs, without modifying them. diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py new file mode 100644 index 000000000..9857c87eb --- /dev/null +++ b/spacy/pipeline/tok2vec.py @@ -0,0 +1,188 @@ +from .pipes import Pipe +from ..gold import Example +from ..tokens import Doc +from ..vocab import Vocab +from ..language import component +from ..util import link_vectors_to_models, minibatch, registry, eg2doc + +from thinc.model import Model, set_dropout_rate + + +@component("tok2vec", assigns=["doc.tensor"]) +class Tok2Vec(Pipe): + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(nlp.vocab, **cfg) + + @classmethod + def Model(cls, architecture, **cfg): + """Create a new statistical model for the class. + + architecture (str): The registered model architecture to use. + **cfg: Config parameters. + RETURNS (Model): A `thinc.model.Model` or similar instance. + """ + model = registry.architectures.get(architecture) + return model(**cfg) + + def __init__(self, vocab, model=True, **cfg): + """Construct a new statistical model. Weights are not allocated on + initialisation. + vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` + instance with the `Doc` objects it will process. + model (Model): A `Model` instance or `True` to allocate one later. + **cfg: Config parameters. + """ + self.vocab = vocab + self.model = model + self.cfg = dict(cfg) + self.listeners = [] + + def create_listener(self): + listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO")) + self.listeners.append(listener) + + def add_listener(self, listener): + self.listeners.append(listener) + + def find_listeners(self, model): + for node in model.walk(): + if isinstance(node, Tok2VecListener) and node.upstream_name == self.name: + self.add_listener(node) + + def __call__(self, doc): + """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM + model. Vectors are set to the `Doc.tensor` attribute. 
+ docs (Doc or iterable): One or more documents to add vectors to. + RETURNS (dict or None): Intermediate computations. + """ + tokvecses = self.predict([doc]) + self.set_annotations([doc], tokvecses) + return doc + + def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + """Process `Doc` objects as a stream. + stream (iterator): A sequence of `Doc` objects to process. + batch_size (int): Number of `Doc` objects to group. + n_threads (int): Number of threads. + YIELDS (iterator): A sequence of `Doc` objects, in order of input. + """ + for batch in minibatch(stream, batch_size): + batch = list(batch) + if as_example: + docs = [eg2doc(doc) for doc in batch] + else: + docs = batch + tokvecses = self.predict(docs) + self.set_annotations(docs, tokvecses) + yield from batch + + def predict(self, docs): + """Return a single tensor for a batch of documents. + docs (iterable): A sequence of `Doc` objects. + RETURNS (object): Vector representations for each token in the documents. + """ + tokvecs = self.model.predict(docs) + batch_id = Tok2VecListener.get_batch_id(docs) + for listener in self.listeners: + listener.receive(batch_id, tokvecs, None) + return tokvecs + + def set_annotations(self, docs, tokvecses): + """Set the tensor attribute for a batch of documents. + docs (iterable): A sequence of `Doc` objects. + tokvecs (object): Vector representation for each token in the documents. + """ + for doc, tokvecs in zip(docs, tokvecses): + assert tokvecs.shape[0] == len(doc) + doc.tensor = tokvecs + + def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False): + """Update the model. + examples (iterable): A batch of examples + drop (float): The droput rate. + sgd (callable): An optimizer. + RETURNS (dict): Results from the update. 
+ """ + if losses is None: + losses = {} + examples = Example.to_example_objects(examples) + docs = [eg.doc for eg in examples] + if isinstance(docs, Doc): + docs = [docs] + set_dropout_rate(self.model, drop) + tokvecs, bp_tokvecs = self.model.begin_update(docs) + + def capture_losses(d_tokvecs): + """Accumulate tok2vec loss before doing backprop.""" + l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs) + if self.name in losses: + losses[self.name] += l2_loss / len(d_tokvecs) + else: + losses[self.name] = l2_loss / len(d_tokvecs) + return bp_tokvecs(d_tokvecs) + + batch_id = Tok2VecListener.get_batch_id(docs) + for listener in self.listeners: + listener.receive(batch_id, tokvecs, capture_losses) + if sgd is not None: + self.model.finish_update(sgd) + if set_annotations: + self.set_annotations(docs, tokvecs) + + def get_loss(self, docs, golds, scores): + pass + + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): + """Allocate models and pre-process training data + + get_examples (function): Function returning example training data. + pipeline (list): The pipeline the model is part of. + """ + if self.model is True: + self.model = self.Model(**self.cfg) + # TODO: use examples instead ? + docs = [Doc(Vocab(), words=["hello"])] + self.model.initialize(X=docs) + link_vectors_to_models(self.vocab) + + +class Tok2VecListener(Model): + """A layer that gets fed its answers from an upstream connection, + for instance from a component earlier in the pipeline. 
+ """ + name = "tok2vec-listener" + + def __init__(self, upstream_name, width): + Model.__init__(self, name=self.name, forward=forward, dims={"nO": width}) + self.upstream_name = upstream_name + self._batch_id = None + self._outputs = None + self._backprop = None + + @classmethod + def get_batch_id(cls, inputs): + return sum(sum(token.orth for token in doc) for doc in inputs) + + def receive(self, batch_id, outputs, backprop): + self._batch_id = batch_id + self._outputs = outputs + self._backprop = backprop + + def verify_inputs(self, inputs): + if self._batch_id is None and self._outputs is None: + raise ValueError + else: + batch_id = self.get_batch_id(inputs) + if batch_id != self._batch_id: + raise ValueError(f"Mismatched IDs! {batch_id} vs {self._batch_id}") + else: + return True + + +def forward(model: Tok2VecListener, inputs, is_train): + if is_train: + model.verify_inputs(inputs) + return model._outputs, model._backprop + else: + return [doc.tensor for doc in inputs], lambda dX: [] diff --git a/spacy/syntax/_beam_utils.pxd b/spacy/syntax/_beam_utils.pxd index 36b0c05da..cf99ac3d1 100644 --- a/spacy/syntax/_beam_utils.pxd +++ b/spacy/syntax/_beam_utils.pxd @@ -1,4 +1,4 @@ -from thinc.typedefs cimport class_t, hash_t +from ..typedefs cimport hash_t, class_t # These are passed as callbacks to thinc.search.Beam cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1 diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index b1085c762..32cf9193a 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -5,9 +5,9 @@ import numpy from cpython.ref cimport PyObject, Py_XDECREF from thinc.extra.search cimport Beam from thinc.extra.search import MaxViolation -from thinc.typedefs cimport hash_t, class_t from thinc.extra.search cimport MaxViolation +from ..typedefs cimport hash_t, class_t from .transition_system cimport TransitionSystem, Transition from ..gold cimport GoldParse from ..errors 
import Errors diff --git a/spacy/syntax/_parser_model.pxd b/spacy/syntax/_parser_model.pxd index 9c72f3415..15befb372 100644 --- a/spacy/syntax/_parser_model.pxd +++ b/spacy/syntax/_parser_model.pxd @@ -1,6 +1,6 @@ from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc -from thinc.typedefs cimport weight_t, class_t, hash_t +from ..typedefs cimport weight_t, class_t, hash_t from ._state cimport StateC diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 19d05e77f..cb8e1d127 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -10,18 +10,14 @@ from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t, class_t, hash_t from thinc.extra.search cimport Beam -from thinc.api import chain, clone -from thinc.v2v import Model, Maxout, Affine -from thinc.misc import LayerNorm -from thinc.neural.ops import CupyOps, NumpyOps -from thinc.neural.util import get_array_module -from thinc.linalg cimport Vec, VecVec +from thinc.layers import Linear +from thinc.model import Model +from thinc.backends import CupyOps, NumpyOps, use_ops +from thinc.backends.linalg cimport Vec, VecVec cimport blis.cy -from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten -from .._ml import link_vectors_to_models, create_default_optimizer +from ..typedefs cimport weight_t, class_t, hash_t from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse @@ -31,6 +27,7 @@ from .stateclass cimport StateClass from .transition_system cimport Transition from . import _beam_utils from . 
import nonproj +from ..util import link_vectors_to_models, create_default_optimizer cdef WeightsC get_c_weights(model) except *: @@ -44,8 +41,8 @@ cdef WeightsC get_c_weights(model) except *: output.hidden_weights = NULL output.hidden_bias = NULL else: - vec2scores_W = model.vec2scores.W - vec2scores_b = model.vec2scores.b + vec2scores_W = model.vec2scores.get_param("W") + vec2scores_b = model.vec2scores.get_param("b") output.hidden_weights = vec2scores_W.data output.hidden_bias = vec2scores_b.data cdef np.ndarray class_mask = model._class_mask @@ -57,12 +54,12 @@ cdef SizesC get_c_sizes(model, int batch_size) except *: cdef SizesC output output.states = batch_size if model.vec2scores is None: - output.classes = model.state2vec.nO + output.classes = model.state2vec.get_dim("nO") else: - output.classes = model.vec2scores.nO - output.hiddens = model.state2vec.nO - output.pieces = model.state2vec.nP - output.feats = model.state2vec.nF + output.classes = model.vec2scores.get_dim("nO") + output.hiddens = model.state2vec.get_dim("nO") + output.pieces = model.state2vec.get_dim("nP") + output.feats = model.state2vec.get_dim("nF") output.embed_width = model.tokvecs.shape[1] return output @@ -226,7 +223,7 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no class ParserModel(Model): def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None): - Model.__init__(self) + Model.__init__(self, name="parser_model", forward=forward) self._layers = [tok2vec, lower_model] if upper_model is not None: self._layers.append(upper_model) @@ -235,41 +232,47 @@ class ParserModel(Model): for class_ in unseen_classes: self.unseen_classes.add(class_) - def begin_update(self, docs, drop=0.): - step_model = ParserStepModel(docs, self._layers, drop=drop, - unseen_classes=self.unseen_classes) - def finish_parser_update(golds, sgd=None): - step_model.make_updates(sgd) - return None - return step_model, finish_parser_update + def predict(self, docs): + 
step_model = ParserStepModel(docs, self._layers, + unseen_classes=self.unseen_classes, train=False) + return step_model - def resize_output(self, new_output): + def resize_output(self, new_nO): if len(self._layers) == 2: return - if new_output == self.upper.nO: + if new_nO == self.upper.get_dim("nO"): return smaller = self.upper - - with Model.use_device('cpu'): - larger = Affine(new_output, smaller.nI) - larger.W.fill(0.0) - larger.b.fill(0.0) - # It seems very unhappy if I pass these as smaller.W? - # Seems to segfault. Maybe it's a descriptor protocol thing? - smaller_W = smaller.W - larger_W = larger.W - smaller_b = smaller.b - larger_b = larger.b + nI = smaller.get_dim("nI") + with use_ops('numpy'): + larger = Linear(new_nO, nI) + larger_W = larger.ops.alloc2f(new_nO, nI) + larger_b = larger.ops.alloc1f(new_nO) + smaller_W = smaller.get_param("W") + smaller_b = smaller.get_param("b") # Weights are stored in (nr_out, nr_in) format, so we're basically # just adding rows here. - larger_W[:smaller.nO] = smaller_W - larger_b[:smaller.nO] = smaller_b + larger_W[:smaller.get_dim("nO")] = smaller_W + larger_b[:smaller.get_dim("nO")] = smaller_b + larger.set_param("W", larger_W) + larger.set_param("b", larger_b) self._layers[-1] = larger - for i in range(smaller.nO, new_output): + for i in range(smaller.get_dim("nO"), new_nO): self.unseen_classes.add(i) - def begin_training(self, X, y=None): - self.lower.begin_training(X, y=y) + def initialize(self, X=None, Y=None): + self.tok2vec.initialize() + self.lower.initialize(X=X, Y=Y) + if self.upper is not None: + # In case we need to trigger the callbacks + statevecs = self.ops.alloc((2, self.lower.get_dim("nO"))) + self.upper.initialize(X=statevecs) + + def finish_update(self, optimizer): + self.tok2vec.finish_update(optimizer) + self.lower.finish_update(optimizer) + if self.upper is not None: + self.upper.finish_update(optimizer) @property def tok2vec(self): @@ -284,17 +287,25 @@ class ParserModel(Model): return 
self._layers[2] +def forward(model:ParserModel, X, is_train): + step_model = ParserStepModel(X, model._layers, unseen_classes=model.unseen_classes, + train=is_train) + + return step_model, step_model.finish_steps + + class ParserStepModel(Model): - def __init__(self, docs, layers, unseen_classes=None, drop=0.): - self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop) - if layers[1].nP >= 2: + def __init__(self, docs, layers, unseen_classes=None, train=True): + Model.__init__(self, name="parser_step_model", forward=step_forward) + self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train) + if layers[1].get_dim("nP") >= 2: activation = "maxout" elif len(layers) == 2: activation = None else: activation = "relu" self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1], - activation=activation, drop=drop) + activation=activation, train=train) if len(layers) == 3: self.vec2scores = layers[-1] else: @@ -304,7 +315,7 @@ class ParserStepModel(Model): if self.vec2scores is None: self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f') else: - self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f') + self._class_mask = numpy.zeros((self.vec2scores.get_dim("nO"),), dtype='f') self._class_mask.fill(1) if unseen_classes is not None: for class_ in unseen_classes: @@ -323,40 +334,6 @@ class ParserStepModel(Model): def mark_class_seen(self, class_): self._class_mask[class_] = 1 - def begin_update(self, states, drop=0.): - token_ids = self.get_token_ids(states) - vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0) - if self.vec2scores is not None: - mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop) - if mask is not None: - vector *= mask - scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop) - else: - scores = NumpyOps().asarray(vector) - get_d_vector = lambda d_scores, sgd=None: d_scores - mask = None - # If the class is unseen, make sure its score is minimum - scores[:, 
self._class_mask == 0] = numpy.nanmin(scores) - - def backprop_parser_step(d_scores, sgd=None): - # Zero vectors for unseen classes - d_scores *= self._class_mask - d_vector = get_d_vector(d_scores, sgd=sgd) - if mask is not None: - d_vector *= mask - if isinstance(self.state2vec.ops, CupyOps) \ - and not isinstance(token_ids, self.state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to GPU, asynchronously - self.backprops.append(( - util.get_async(self.cuda_stream, token_ids), - util.get_async(self.cuda_stream, d_vector), - get_d_tokvecs - )) - else: - self.backprops.append((token_ids, d_vector, get_d_tokvecs)) - return None - return scores, backprop_parser_step - def get_token_ids(self, batch): states = _beam_utils.collect_states(batch) cdef StateClass state @@ -370,25 +347,56 @@ class ParserStepModel(Model): c_ids += ids.shape[1] return ids - def make_updates(self, sgd): + def finish_steps(self, golds): # Add a padding vector to the d_tokvecs gradient, so that missing # values don't affect the real gradient. - d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1])) + d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1])) # Tells CUDA to block, so our async copies complete. 
if self.cuda_stream is not None: self.cuda_stream.synchronize() for ids, d_vector, bp_vector in self.backprops: - d_state_features = bp_vector((d_vector, ids), sgd=sgd) + d_state_features = bp_vector((d_vector, ids)) ids = ids.flatten() d_state_features = d_state_features.reshape( (ids.size, d_state_features.shape[2])) self.ops.scatter_add(d_tokvecs, ids, d_state_features) # Padded -- see update() - self.bp_tokvecs(d_tokvecs[:-1], sgd=sgd) + if isinstance(self.ops, CupyOps): + d_tokvecs = self.ops.to_numpy(d_tokvecs) + self.bp_tokvecs(d_tokvecs[:-1]) return d_tokvecs +def step_forward(model: ParserStepModel, states, is_train): + token_ids = model.get_token_ids(states) + vector, get_d_tokvecs = model.state2vec(token_ids, is_train) + if model.vec2scores is not None: + scores, get_d_vector = model.vec2scores(vector, is_train) + else: + scores = NumpyOps().asarray(vector) + get_d_vector = lambda d_scores: d_scores + # If the class is unseen, make sure its score is minimum + scores[:, model._class_mask == 0] = numpy.nanmin(scores) + + def backprop_parser_step(d_scores): + # Zero vectors for unseen classes + d_scores *= model._class_mask + d_vector = get_d_vector(d_scores) + if isinstance(model.state2vec.ops, CupyOps) \ + and not isinstance(token_ids, model.state2vec.ops.xp.ndarray): + # Move token_ids and d_vector to GPU, asynchronously + model.backprops.append(( + util.get_async(model.cuda_stream, token_ids), + util.get_async(model.cuda_stream, d_vector), + get_d_tokvecs + )) + else: + model.backprops.append((token_ids, d_vector, get_d_tokvecs)) + return None + return scores, backprop_parser_step + + cdef class precompute_hiddens: """Allow a model to be "primed" by pre-computing input features in bulk. @@ -406,7 +414,7 @@ cdef class precompute_hiddens: we can do all our hard maths up front, packed into large multiplications, and do the hard-to-program parsing on the CPU. 
""" - cdef readonly int nF, nO, nP + cdef readonly int nF, nO, nP # TODO: make these more like the dimensions in thinc cdef bint _is_synchronized cdef public object ops cdef np.ndarray _features @@ -417,8 +425,8 @@ cdef class precompute_hiddens: cdef object activation def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, - activation="maxout", drop=0.): - gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop) + activation="maxout", train=False): + gpu_cached, bp_features = lower_model(tokvecs, train) cdef np.ndarray cached if not isinstance(gpu_cached, numpy.ndarray): # Note the passing of cuda_stream here: it lets @@ -427,12 +435,16 @@ cdef class precompute_hiddens: cached = gpu_cached.get(stream=cuda_stream) else: cached = gpu_cached - if not isinstance(lower_model.b, numpy.ndarray): - self.bias = lower_model.b.get() + if not isinstance(lower_model.get_param("b"), numpy.ndarray): + # self.bias = lower_model.get_param("b").get(stream=cuda_stream) ??? + self.bias = lower_model.get_param("b") else: - self.bias = lower_model.b + self.bias = lower_model.get_param("b") self.nF = cached.shape[1] - self.nP = getattr(lower_model, 'nP', 1) + if lower_model.has_dim("nP"): + self.nP = lower_model.get_dim("nP") + else: + self.nP = 1 self.nO = cached.shape[2] self.ops = lower_model.ops assert activation in (None, "relu", "maxout") @@ -448,10 +460,26 @@ cdef class precompute_hiddens: self._is_synchronized = True return self._cached.data - def __call__(self, X): - return self.begin_update(X, drop=None)[0] + def get_dim(self, name): + if name == "nF": + return self.nF + elif name == "nP": + return self.nP + elif name == "nO": + return self.nO + else: + raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") - def begin_update(self, token_ids, drop=0.): + def __call__(self, X, bint is_train): + if is_train: + return self.begin_update(X) + else: + return self.predict(X), lambda X: X + + def predict(self, X): + return self.begin_update(X)[0] 
+ + def begin_update(self, token_ids): cdef np.ndarray state_vector = numpy.zeros( (token_ids.shape[0], self.nO, self.nP), dtype='f') # This is tricky, but (assuming GPU available); @@ -466,13 +494,13 @@ cdef class precompute_hiddens: sum_state_features(state_vector.data, feat_weights, &ids[0,0], token_ids.shape[0], self.nF, self.nO*self.nP) - state_vector += self.bias + state_vector = state_vector + self.bias state_vector, bp_nonlinearity = self._nonlinearity(state_vector) - def backward(d_state_vector_ids, sgd=None): + def backward(d_state_vector_ids): d_state_vector, token_ids = d_state_vector_ids - d_state_vector = bp_nonlinearity(d_state_vector, sgd) - d_tokens = bp_hiddens((d_state_vector, token_ids), sgd) + d_state_vector = bp_nonlinearity(d_state_vector) + d_tokens = bp_hiddens((d_state_vector, token_ids)) return d_tokens return state_vector, backward @@ -492,7 +520,7 @@ cdef class precompute_hiddens: else: mask = None - def backprop_nonlinearity(d_best, sgd=None): + def backprop_nonlinearity(d_best): if isinstance(d_best, numpy.ndarray): ops = NumpyOps() else: diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 972ad682a..9e9593eee 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -1,6 +1,6 @@ from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t +from ..typedefs cimport weight_t from .stateclass cimport StateClass from ..typedefs cimport attr_t diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 5dfa20b7d..50b916fe2 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -1,7 +1,7 @@ -from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam from collections import Counter +from ..typedefs cimport weight_t from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 707c9654c..d77a04420 100644 --- a/spacy/syntax/nn_parser.pxd +++ 
b/spacy/syntax/nn_parser.pxd @@ -1,5 +1,3 @@ -from thinc.typedefs cimport atom_t - from .stateclass cimport StateClass from .arc_eager cimport TransitionSystem from ..vocab cimport Vocab diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 14d9e54d4..c73bc9a0a 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -13,24 +13,23 @@ from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t, class_t, hash_t from thinc.extra.search cimport Beam -from thinc.api import chain, clone -from thinc.v2v import Model, Maxout, Affine -from thinc.misc import LayerNorm -from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.util import get_array_module -from thinc.linalg cimport Vec, VecVec +from thinc.layers import chain, clone, Linear, list2array +from thinc.backends import NumpyOps, CupyOps, use_ops +from thinc.util import get_array_module +from thinc.backends.linalg cimport Vec, VecVec +from thinc.initializers import zero_init +from thinc.model import set_dropout_rate import srsly from spacy.gold import Example +from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ._parser_model cimport get_c_weights, get_c_sizes from ._parser_model import ParserModel -from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten -from .._ml import link_vectors_to_models, create_default_optimizer +from ..util import link_vectors_to_models, create_default_optimizer from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse @@ -44,6 +43,10 @@ from . import _beam_utils from . 
import nonproj +from ..ml._layers import PrecomputableAffine +from ..ml.component_models import Tok2Vec + + cdef class Parser: """ Base class of the DependencyParser and EntityRecognizer. @@ -54,7 +57,7 @@ cdef class Parser: subword_features = util.env_opt('subword_features', cfg.get('subword_features', True)) conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4)) - conv_window = util.env_opt('conv_window', cfg.get('conv_depth', 1)) + window_size = util.env_opt('window_size', cfg.get('window_size', 1)) t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3)) bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0)) self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0)) @@ -71,23 +74,23 @@ cdef class Parser: parser_maxout_pieces = 1 embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000)) pretrained_vectors = cfg.get('pretrained_vectors', None) - tok2vec = Tok2Vec(token_vector_width, embed_size, + tok2vec = Tok2Vec(width=token_vector_width, + embed_size=embed_size, conv_depth=conv_depth, - conv_window=conv_window, + window_size=window_size, cnn_maxout_pieces=t2v_pieces, subword_features=subword_features, pretrained_vectors=pretrained_vectors, bilstm_depth=bilstm_depth) - tok2vec = chain(tok2vec, flatten) - tok2vec.nO = token_vector_width + tok2vec = chain(tok2vec, list2array()) + tok2vec.set_dim("nO", token_vector_width) lower = PrecomputableAffine(hidden_width, nF=nr_feature_tokens, nI=token_vector_width, nP=parser_maxout_pieces) - lower.nP = parser_maxout_pieces + lower.set_dim("nP", parser_maxout_pieces) if depth == 1: - with Model.use_device('cpu'): - upper = Affine(nr_class, hidden_width, drop_factor=0.0) - upper.W *= 0 + with use_ops('numpy'): + upper = Linear(nr_class, hidden_width, init_W=zero_init) else: upper = None @@ -102,11 +105,13 @@ cdef class Parser: 'bilstm_depth': bilstm_depth, 'self_attn_depth': self_attn_depth, 'conv_depth': conv_depth, - 'conv_window': 
conv_window, + 'window_size': window_size, 'embed_size': embed_size, 'cnn_maxout_pieces': t2v_pieces } - return ParserModel(tok2vec, lower, upper), cfg + model = ParserModel(tok2vec, lower, upper) + model.initialize() + return model, cfg name = 'base_parser' @@ -283,12 +288,13 @@ cdef class Parser: def greedy_parse(self, docs, drop=0.): cdef vector[StateC*] states cdef StateClass state + set_dropout_rate(self.model, drop) batch = self.moves.init_batch(docs) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. self._resize() - model = self.model(docs) + model = self.model.predict(docs) weights = get_c_weights(model) for state in batch: if not state.is_final(): @@ -303,18 +309,19 @@ cdef class Parser: cdef Beam beam cdef Doc doc cdef np.ndarray token_ids + set_dropout_rate(self.model, drop) beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. 
self._resize() - model = self.model(docs) + model = self.model.predict(docs) token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), dtype='i', order='C') cdef int* c_ids cdef int nr_feature = self.cfg["nr_feature_tokens"] cdef int n_states - model = self.model(docs) + model = self.model.predict(docs) todo = [beam for beam in beams if not beam.is_done] while todo: token_ids.fill(-1) @@ -331,8 +338,8 @@ cdef class Parser: n_states += 1 if n_states == 0: break - vectors = model.state2vec(token_ids[:n_states]) - scores = model.vec2scores(vectors) + vectors = model.state2vec.predict(token_ids[:n_states]) + scores = model.vec2scores.predict(vectors) todo = self.transition_beams(todo, scores) return beams @@ -424,7 +431,7 @@ cdef class Parser: beam.check_done(_beam_utils.check_final_state, NULL) return [b for b in beams if not b.is_done] - def update(self, examples, drop=0., sgd=None, losses=None): + def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): self.require_model() examples = Example.to_example_objects(examples) @@ -438,8 +445,10 @@ cdef class Parser: beam_update_prob = self.cfg.get('beam_update_prob', 0.5) if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob: return self.update_beam(examples, self.cfg.get('beam_width', 1), - drop=drop, sgd=sgd, losses=losses, + drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations, beam_density=self.cfg.get('beam_density', 0.001)) + + set_dropout_rate(self.model, drop) # Chop sequences into lengths of this many transitions, to make the # batch uniform length. 
cut_gold = numpy.random.choice(range(20, 100)) @@ -448,19 +457,24 @@ cdef class Parser: if not s.is_final() and g is not None] # Prepare the stepwise model, and get the callback for finishing the batch - model, finish_update = self.model.begin_update([ex.doc for ex in examples], drop=drop) + model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples]) + all_states = list(states) for _ in range(max_steps): if not states_golds: break states, golds = zip(*states_golds) - scores, backprop = model.begin_update(states, drop=drop) + scores, backprop = model.begin_update(states) d_scores = self.get_batch_loss(states, golds, scores, losses) - backprop(d_scores, sgd=sgd) + backprop(d_scores) # Follow the predicted action self.transition_states(states, scores) states_golds = [eg for eg in states_golds if not eg[0].is_final()] - # Do the backprop - finish_update(golds, sgd=sgd) + backprop_tok2vec(golds) + if sgd is not None: + self.model.finish_update(sgd) + if set_annotations: + docs = [ex.doc for ex in examples] + self.set_annotations(docs, all_states) return losses def rehearse(self, examples, sgd=None, losses=None, **cfg): @@ -482,13 +496,15 @@ cdef class Parser: # expand our model output. self._resize() # Prepare the stepwise model, and get the callback for finishing the batch - tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0) - model, finish_update = self.model.begin_update(docs, drop=0.0) + set_dropout_rate(self._rehearsal_model, 0.0) + set_dropout_rate(self.model, 0.0) + tutor, _ = self._rehearsal_model.begin_update(docs) + model, finish_update = self.model.begin_update(docs) n_scores = 0. loss = 0. while states: - targets, _ = tutor.begin_update(states, drop=0.) - guesses, backprop = model.begin_update(states, drop=0.) 
+ targets, _ = tutor.begin_update(states) + guesses, backprop = model.begin_update(states) d_scores = (guesses - targets) / targets.shape[0] # If all weights for an output are 0 in the original model, don't # supervise that output. This allows us to add classes. @@ -499,12 +515,14 @@ cdef class Parser: states = [state for state in states if not state.is_final()] n_scores += d_scores.size # Do the backprop - finish_update(docs, sgd=sgd) + finish_update(docs) + if sgd is not None: + self.model.finish_update(sgd) losses[self.name] += loss / n_scores return losses def update_beam(self, examples, width, drop=0., sgd=None, losses=None, - beam_density=0.0): + set_annotations=False, beam_density=0.0): examples = Example.to_example_objects(examples) docs = [ex.doc for ex in examples] golds = [ex.gold for ex in examples] @@ -514,15 +532,16 @@ cdef class Parser: for gold in golds: self.moves.preprocess_gold(gold) new_golds.append(gold) - model, finish_update = self.model.begin_update(docs, drop=drop) + set_dropout_rate(self.model, drop) + model, backprop_tok2vec = self.model.begin_update(docs) states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec, - model.vec2scores, width, drop=drop, losses=losses, + self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, + model.state2vec, model.vec2scores, width, losses=losses, beam_density=beam_density) for i, d_scores in enumerate(states_d_scores): losses[self.name] += (d_scores**2).mean() ids, bp_vectors, bp_scores = backprops[i] - d_vector = bp_scores(d_scores, sgd=sgd) + d_vector = bp_scores(d_scores) if isinstance(model.ops, CupyOps) \ and not isinstance(ids, model.state2vec.ops.xp.ndarray): model.backprops.append(( @@ -531,11 +550,34 @@ cdef class Parser: bp_vectors)) else: model.backprops.append((ids, d_vector, bp_vectors)) - model.make_updates(sgd) + backprop_tok2vec(golds) + if sgd is not None: + self.model.finish_update(sgd) + if 
set_annotations: + self.set_annotations(docs, beams) cdef Beam beam for beam in beams: _beam_utils.cleanup_beam(beam) + def get_gradients(self): + """Get non-zero gradients of the model's parameters, as a dictionary + keyed by the parameter ID. The values are (weights, gradients) tuples. + """ + gradients = {} + if self.model in (None, True, False): + return gradients + queue = [self.model] + seen = set() + for node in queue: + if node.id in seen: + continue + seen.add(node.id) + if hasattr(node, "_mem") and node._mem.gradient.any(): + gradients[node.id] = [node._mem.weights, node._mem.gradient] + if hasattr(node, "_layers"): + queue.extend(node._layers) + return gradients + def _init_gold_batch(self, whole_examples, min_length=5, max_length=500): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. Let's say we have a doc of length 2*N, @@ -605,8 +647,7 @@ cdef class Parser: return d_scores def create_optimizer(self): - return create_default_optimizer(self.model.ops, - **self.cfg.get('optimizer', {})) + return create_default_optimizer() def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg): if 'model' in cfg: @@ -636,14 +677,16 @@ cdef class Parser: for doc, gold in parses: doc_sample.append(doc) gold_sample.append(gold) - self.model.begin_training(doc_sample, gold_sample) + self.model.initialize(doc_sample, gold_sample) if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) else: if sgd is None: sgd = self.create_optimizer() - self.model.begin_training([]) + if self.model.upper.has_dim("nO") is None: + self.model.upper.set_dim("nO", self.moves.n_moves) + self.model.initialize() self.cfg.update(cfg) return sgd @@ -709,7 +752,7 @@ cdef class Parser: if 'model' not in exclude: # TODO: Remove this once we don't have to handle previous models if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: - 
self.cfg['pretrained_vectors'] = self.vocab.vectors.name + self.cfg['pretrained_vectors'] = self.vocab.vectors if self.model is True: self.model, cfg = self.Model(**self.cfg) else: diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index a5fe55918..bd706a997 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -1,7 +1,6 @@ from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t -from ..typedefs cimport attr_t +from ..typedefs cimport attr_t, weight_t from ..structs cimport TokenC from ..gold cimport GoldParse from ..gold cimport GoldParseC diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 62e369091..6ab83436e 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool -from thinc.typedefs cimport weight_t +from ..typedefs cimport weight_t from thinc.extra.search cimport Beam from collections import Counter import srsly diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index a24fd143d..25892ac71 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,6 +1,6 @@ import pytest -from thinc.neural.optimizers import Adam -from thinc.neural.ops import NumpyOps +from thinc.optimizers import Adam +from thinc.backends import NumpyOps from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab @@ -28,7 +28,7 @@ def _train_parser(parser): fix_random_seed(1) parser.add_label("left") parser.begin_training([], **parser.cfg) - sgd = Adam(NumpyOps(), 0.001) + sgd = Adam(0.001, ops=NumpyOps()) for i in range(5): losses = {} @@ -41,8 +41,8 @@ def _train_parser(parser): def test_add_label(parser): parser = _train_parser(parser) parser.add_label("right") - sgd = Adam(NumpyOps(), 0.001) - for i in range(10): + sgd = 
Adam(0.001, ops=NumpyOps()) + for i in range(100): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) gold = GoldParse( diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 8d5043487..8dda1f406 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -7,6 +7,11 @@ from spacy.syntax.ner import BiluoPushDown from spacy.gold import GoldParse from spacy.tokens import Doc +TRAIN_DATA = [ + ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), + ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), + ] + @pytest.fixture def vocab(): @@ -263,7 +268,7 @@ def test_change_number_features(): nlp.add_pipe(ner) ner.add_label("PERSON") nlp.begin_training() - assert ner.model.lower.nF == ner.nr_feature + assert ner.model.lower.get_dim("nF") == ner.nr_feature # Test we can change it nlp = English() ner = nlp.create_pipe("ner") @@ -272,11 +277,36 @@ def test_change_number_features(): nlp.begin_training( component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}} ) - assert ner.model.lower.nF == 3 + assert ner.model.lower.get_dim("nF") == 3 # Test the model runs nlp("hello world") +def test_overfitting(): + # Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly + nlp = English() + ner = nlp.create_pipe("ner") + for _, annotations in TRAIN_DATA: + for ent in annotations.get("entities"): + ner.add_label(ent[2]) + nlp.add_pipe(ner) + optimizer = nlp.begin_training() + + for i in range(50): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + assert losses["ner"] < 0.00001 + + # test the trained model + test_text = "I like London." 
+ doc = nlp(test_text) + ents = doc.ents + + assert len(ents) == 1 + assert ents[0].text == "London" + assert ents[0].label_ == "LOC" + + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 0906fbb94..2470982d3 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,5 +1,5 @@ import pytest -from spacy._ml import Tok2Vec +from spacy.ml.component_models import Tok2Vec from spacy.vocab import Vocab from spacy.syntax.arc_eager import ArcEager from spacy.syntax.nn_parser import Parser @@ -20,7 +20,9 @@ def arc_eager(vocab): @pytest.fixture def tok2vec(): - return Tok2Vec(8, 100) + tok2vec = Tok2Vec(8, 100) + tok2vec.initialize() + return tok2vec @pytest.fixture @@ -30,7 +32,7 @@ def parser(vocab, arc_eager): @pytest.fixture def model(arc_eager, tok2vec): - return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0] + return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.get_dim("nO"))[0] @pytest.fixture @@ -53,7 +55,7 @@ def test_build_model(parser): def test_predict_doc(parser, tok2vec, model, doc): - doc.tensor = tok2vec([doc])[0] + doc.tensor = tok2vec.predict([doc])[0] parser.model = model parser(doc) @@ -61,8 +63,9 @@ def test_predict_doc(parser, tok2vec, model, doc): def test_update_doc(parser, model, doc, gold): parser.model = model - def optimize(weights, gradient, key=None): + def optimize(key, weights, gradient): weights -= 0.001 * gradient + return weights, gradient parser.update((doc, gold), sgd=optimize) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 75091ec07..1d3f522c9 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,7 +1,25 @@ import pytest +from spacy.lang.en import English from ..util import get_doc, apply_transition_sequence +TRAIN_DATA = [ + ( + "They trade mortgage-backed securities.", + { + 
"heads": [1, 1, 4, 4, 5, 1, 1], + "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], + }, + ), + ( + "I like London and Berlin.", + { + "heads": [1, 1, 1, 2, 2, 1], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + }, + ), +] + def test_parser_root(en_tokenizer): text = "i don't have other assistance" @@ -162,3 +180,27 @@ def test_parser_set_sent_starts(en_vocab): for sent in doc.sents: for token in sent: assert token.head in sent + + +def test_overfitting(): + # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly + nlp = English() + parser = nlp.create_pipe("parser") + for _, annotations in TRAIN_DATA: + for dep in annotations.get("deps", []): + parser.add_label(dep) + nlp.add_pipe(parser) + optimizer = nlp.begin_training() + + for i in range(50): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + assert losses["parser"] < 0.00001 + + # test the trained model + test_text = "I like securities." 
+ doc = nlp(test_text) + + assert doc[0].dep_ is "nsubj" + assert doc[2].dep_ is "dobj" + assert doc[3].dep_ is "punct" diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index ed6aef096..5e56442b5 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -1,6 +1,6 @@ import pytest -from thinc.neural.optimizers import Adam -from thinc.neural.ops import NumpyOps +from thinc.optimizers import Adam +from thinc.backends import NumpyOps from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab @@ -21,7 +21,7 @@ def parser(vocab): # parser.add_label('right') parser.add_label("left") parser.begin_training([], **parser.cfg) - sgd = Adam(NumpyOps(), 0.001) + sgd = Adam(0.001) for i in range(10): losses = {} diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index ca9dab009..6a6ec8665 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,4 +1,5 @@ import pytest +import srsly from spacy.language import Language @@ -8,3 +9,35 @@ def test_label_types(): nlp.get_pipe("tagger").add_label("A") with pytest.raises(ValueError): nlp.get_pipe("tagger").add_label(9) + + +TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}} + +TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), +] + + +def test_overfitting(): + # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly + nlp = Language() + tagger = nlp.create_pipe("tagger") + for tag, values in TAG_MAP.items(): + tagger.add_label(tag, values) + nlp.add_pipe(tagger) + optimizer = nlp.begin_training() + + for i in range(50): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + assert losses["tagger"] < 0.00001 + + # test the trained model + test_text = "I like blue eggs" + doc = nlp(test_text) + + assert 
doc[0].tag_ is "N" + assert doc[1].tag_ is "V" + assert doc[2].tag_ is "J" + assert doc[3].tag_ is "N" diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 9e37e92e1..558d09e40 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -6,6 +6,11 @@ from spacy.pipeline import TextCategorizer from spacy.tokens import Doc from spacy.gold import GoldParse +TRAIN_DATA = [ + ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), + ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), +] + @pytest.mark.skip(reason="Test is flakey when run with others") def test_simple_train(): @@ -67,3 +72,26 @@ def test_label_types(): nlp.get_pipe("textcat").add_label("answer") with pytest.raises(ValueError): nlp.get_pipe("textcat").add_label(9) + + +def test_overfitting(): + # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly + nlp = Language() + textcat = nlp.create_pipe("textcat") + for _, annotations in TRAIN_DATA: + for label, value in annotations.get("cats").items(): + textcat.add_label(label) + nlp.add_pipe(textcat) + optimizer = nlp.begin_training() + + for i in range(50): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + assert losses["textcat"] < 0.00001 + + # test the trained model + test_text = "I am happy." 
+ doc = nlp(test_text) + cats = doc.cats + assert cats["POSITIVE"] > 0.9 + assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001) diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 49e7de179..c4f5e8599 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -8,7 +8,7 @@ from spacy.matcher import Matcher from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.compat import pickle -from spacy._ml import link_vectors_to_models +from spacy.util import link_vectors_to_models import numpy import random diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py index 367961ab1..fca884356 100644 --- a/spacy/tests/regression/test_issue3611.py +++ b/spacy/tests/regression/test_issue3611.py @@ -32,7 +32,7 @@ def test_issue3611(): # training the network with nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]): - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(X=x_train, Y=y_train) for i in range(3): losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py index ad56e4c54..786e2cedf 100644 --- a/spacy/tests/test_architectures.py +++ b/spacy/tests/test_architectures.py @@ -1,12 +1,12 @@ import pytest from spacy import registry -from thinc.v2v import Affine +from thinc.layers import Linear from catalogue import RegistryError @registry.architectures.register("my_test_function") def create_model(nr_in, nr_out): - return Affine(nr_in, nr_out) + return Linear(nr_in, nr_out) def test_get_architecture(): diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 09e0fb561..a6bcdb50c 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -5,7 +5,8 @@ from pathlib import Path from spacy import util from spacy import prefer_gpu, 
require_gpu from spacy.compat import symlink_to, symlink_remove, is_windows -from spacy._ml import PrecomputableAffine +from spacy.ml._layers import PrecomputableAffine +from spacy.ml._layers import _backprop_precomputable_affine_padding from subprocess import CalledProcessError @@ -67,28 +68,30 @@ def test_util_get_package_path(package): def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP) - assert model.W.shape == (nF, nO, nP, nI) - tensor = model.ops.allocate((10, nI)) + assert model.get_param("W").shape == (nF, nO, nP, nI) + tensor = model.ops.alloc((10, nI)) Y, get_dX = model.begin_update(tensor) assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP) - assert model.d_pad.shape == (1, nF, nO, nP) - dY = model.ops.allocate((15, nO, nP)) - ids = model.ops.allocate((15, nF)) + dY = model.ops.alloc((15, nO, nP)) + ids = model.ops.alloc((15, nF)) ids[1, 2] = -1 dY[1] = 1 - assert model.d_pad[0, 2, 0, 0] == 0.0 - model._backprop_padding(dY, ids) - assert model.d_pad[0, 2, 0, 0] == 1.0 - model.d_pad.fill(0.0) + assert not model.has_grad("pad") + d_pad = _backprop_precomputable_affine_padding(model, dY, ids) + assert d_pad[0, 2, 0, 0] == 1.0 ids.fill(0.0) dY.fill(0.0) - ids[1, 2] = -1 + dY[0] = 0 + ids[1, 2] = 0 ids[1, 1] = -1 ids[1, 0] = -1 dY[1] = 1 - assert model.d_pad[0, 2, 0, 0] == 0.0 - model._backprop_padding(dY, ids) - assert model.d_pad[0, 2, 0, 0] == 3.0 + ids[2, 0] = -1 + dY[2] = 5 + d_pad = _backprop_precomputable_affine_padding(model, dY, ids) + assert d_pad[0, 0, 0, 0] == 6 + assert d_pad[0, 1, 0, 0] == 1 + assert d_pad[0, 2, 0, 0] == 0 def test_prefer_gpu(): diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 473d5017d..2d10d79d4 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -1,6 +1,6 @@ import pytest -from spacy._ml import Tok2Vec +from spacy.ml.component_models import Tok2Vec from spacy.vocab import Vocab from spacy.tokens import Doc @@ -10,7 
+10,7 @@ def get_batch(batch_size): docs = [] start = 0 for size in range(1, batch_size + 1): - # Make the words numbers, so that they're distnct + # Make the words numbers, so that they're distinct # across the batch, and easy to track. numbers = [str(i) for i in range(start, start + size)] docs.append(Doc(vocab, words=numbers)) @@ -37,6 +37,7 @@ def test_empty_doc(): def test_tok2vec_batch_sizes(batch_size, width, embed_size): batch = get_batch(batch_size) tok2vec = Tok2Vec(width, embed_size) + tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) assert len(vectors) == len(batch) for doc_vec, doc in zip(vectors, batch): @@ -56,6 +57,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): def test_tok2vec_configs(tok2vec_config): docs = get_batch(3) tok2vec = Tok2Vec(**tok2vec_config) + tok2vec.initialize() vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"]) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 8684ad018..011cd16b1 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,14 +1,13 @@ import pytest import numpy from numpy.testing import assert_allclose -from spacy._ml import cosine from spacy.vocab import Vocab from spacy.vectors import Vectors from spacy.tokenizer import Tokenizer from spacy.strings import hash_string from spacy.tokens import Doc -from ..util import add_vecs_to_vocab +from ..util import add_vecs_to_vocab, get_cosine @pytest.fixture @@ -311,4 +310,4 @@ def test_vocab_prune_vectors(): assert list(remap.keys()) == ["kitten"] neighbour, similarity = list(remap.values())[0] assert neighbour == "cat", remap - assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) + assert_allclose(similarity, get_cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) diff --git a/spacy/tokens/_retokenize.pyx 
b/spacy/tokens/_retokenize.pyx index 12690ba50..cd7e5a426 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -4,7 +4,7 @@ from libc.string cimport memcpy, memset from libc.stdlib cimport malloc, free from cymem.cymem cimport Pool -from thinc.neural.util import get_array_module +from thinc.util import get_array_module import numpy diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index d7348659d..4a18acd77 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,7 +1,7 @@ import numpy import zlib import srsly -from thinc.neural.ops import NumpyOps +from thinc.backends import NumpyOps from ..compat import copy_reg from ..tokens import Doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 7e6473d56..15f77d621 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -11,7 +11,7 @@ import numpy import numpy.linalg import struct import srsly -from thinc.neural.util import get_array_module, copy_array +from thinc.util import get_array_module, copy_array from .span cimport Span from .token cimport Token diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 9e9322d65..7ab1c1d18 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -3,7 +3,7 @@ from libc.math cimport sqrt import numpy import numpy.linalg -from thinc.neural.util import get_array_module +from thinc.util import get_array_module from collections import defaultdict from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index b159fffc1..c241cd5ad 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -7,7 +7,7 @@ cimport numpy as np np.import_array() import numpy -from thinc.neural.util import get_array_module +from thinc.util import get_array_module from ..typedefs cimport hash_t from ..lexeme cimport Lexeme diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index bd5b38958..b43814268 100644 --- 
a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -2,7 +2,9 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t from libc.stdint cimport uint8_t +ctypedef float weight_t ctypedef uint64_t hash_t +ctypedef uint64_t class_t ctypedef char* utf8_t ctypedef uint64_t attr_t ctypedef uint64_t flags_t diff --git a/spacy/util.py b/spacy/util.py index 55e197eb2..53fa81402 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -4,8 +4,14 @@ import importlib.util import re from pathlib import Path import random -from thinc.neural._classes.model import Model -from thinc.neural.ops import NumpyOps +from typing import List + +import thinc +import thinc.config +from thinc.backends import NumpyOps, get_current_ops +from thinc.optimizers import Adam +from thinc.util import require_gpu + import functools import itertools import numpy.random @@ -13,6 +19,7 @@ import srsly import catalogue import sys + try: import cupy.random except ImportError: @@ -20,14 +27,13 @@ except ImportError: from .symbols import ORTH from .compat import cupy, CudaStream -from .errors import Errors, Warnings, deprecation_warning - +from .errors import Errors, Warnings, deprecation_warning, user_warning _data_path = Path(__file__).parent / "data" _PRINT_ENV = False -class registry(object): +class registry(thinc.registry): languages = catalogue.create("spacy", "languages", entry_points=True) architectures = catalogue.create("spacy", "architectures", entry_points=True) lookups = catalogue.create("spacy", "lookups", entry_points=True) @@ -219,6 +225,23 @@ def load_model_from_init_py(init_file, **overrides): return load_model_from_path(data_path, meta, **overrides) +def load_from_config(path, create_objects=False): + """Load a Thinc-formatted config file, optionally filling in objects where + the config references registry entries. See "Thinc config files" for details. 
+ + path (unicode or Path): Path to the config file + create_objects (bool): Whether to automatically create objects when the config + references registry entries. Defaults to False. + + RETURNS (dict): The objects from the config file. + """ + config = thinc.config.Config().from_disk(path) + if create_objects: + return registry.make_from_config(config, validate=True) + else: + return config + + def get_model_meta(path): """Get model meta.json from a directory path and validate its contents. @@ -293,9 +316,10 @@ def get_component_name(component): def get_cuda_stream(require=False, non_blocking=True): + ops = get_current_ops() if CudaStream is None: return None - elif isinstance(Model.ops, NumpyOps): + elif isinstance(ops, NumpyOps): return None else: return CudaStream(non_blocking=non_blocking) @@ -310,6 +334,14 @@ def get_async(stream, numpy_array): return array +def eg2doc(example): + """Get a Doc object from an Example (or if it's a Doc, use it directly)""" + # Put the import here to avoid circular import problems + from .tokens.doc import Doc + + return example if isinstance(example, Doc) else example.doc + + def env_opt(name, default=None): if type(default) is float: type_convert = float @@ -532,6 +564,8 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len): """Create minibatches of a given number of words.""" if isinstance(size, int): size_ = itertools.repeat(size) + if isinstance(size, List): + size_ = iter(size) else: size_ = size examples = iter(examples) @@ -680,17 +714,7 @@ def escape_html(text): def use_gpu(gpu_id): - try: - import cupy.cuda.device - except ImportError: - return None - from thinc.neural.ops import CupyOps - - device = cupy.cuda.device.Device(gpu_id) - device.use() - Model.ops = CupyOps() - Model.Ops = CupyOps - return device + return require_gpu(gpu_id) def fix_random_seed(seed=0): @@ -747,3 +771,33 @@ class DummyTokenizer(object): def from_disk(self, _path, **kwargs): return self + + +def 
link_vectors_to_models(vocab): + vectors = vocab.vectors + if vectors.name is None: + vectors.name = VECTORS_KEY + if vectors.data.size != 0: + user_warning(Warnings.W020.format(shape=vectors.data.shape)) + for word in vocab: + if word.orth in vectors.key2row: + word.rank = vectors.key2row[word.orth] + else: + word.rank = 0 + + +VECTORS_KEY = "spacy_pretrained_vectors" + + +def create_default_optimizer(): + ops = get_current_ops() + learn_rate = env_opt("learn_rate", 0.001) + beta1 = env_opt("optimizer_B1", 0.9) + beta2 = env_opt("optimizer_B2", 0.999) + eps = env_opt("optimizer_eps", 1e-8) + L2 = env_opt("L2_penalty", 1e-6) + max_grad_norm = env_opt("grad_norm_clip", 1.0) + optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops) + optimizer.max_grad_norm = max_grad_norm + optimizer.device = ops.device_type + return optimizer diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index b12c8d833..2b1067247 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -5,8 +5,8 @@ from libcpp.set cimport set as cppset import functools import numpy import srsly -from thinc.neural.util import get_array_module -from thinc.neural._classes.model import Model +from thinc.util import get_array_module +from thinc.backends import get_current_ops from .strings cimport StringStore @@ -426,9 +426,9 @@ cdef class Vectors: self.add(key, row=i) def load_vectors(path): - xp = Model.ops.xp + ops = get_current_ops() if path.exists(): - self.data = xp.load(str(path)) + self.data = ops.xp.load(str(path)) serializers = { "key2row": load_key2row, diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index c7e74f36c..3da9978c4 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -2,7 +2,7 @@ from libc.string cimport memcpy import srsly -from thinc.neural.util import get_array_module +from thinc.util import get_array_module from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme @@ -16,7 +16,7 @@ from .errors import Errors from .lemmatizer import Lemmatizer from .attrs 
import intify_attrs, NORM from .vectors import Vectors -from ._ml import link_vectors_to_models +from .util import link_vectors_to_models from .lookups import Lookups from . import util From a365359b36e77b6c02cd58c8bf62d91a25ea8052 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Jan 2020 17:44:25 +0100 Subject: [PATCH 032/187] Add convert CLI option to merge CoNLL-U subtokens (#4722) * Add convert CLI option to merge CoNLL-U subtokens Add `-T` option to convert CLI that merges CoNLL-U subtokens into one token in the converted data. Each CoNLL-U sentence is read into a `Doc` and the `Retokenizer` is used to merge subtokens with features as follows: * `orth` is the merged token orth (should correspond to raw text and `# text`) * `tag` is all subtoken tags concatenated with `_`, e.g. `ADP_DET` * `pos` is the POS of the syntactic root of the span (as determined by the Retokenizer) * `morph` is all morphological features merged * `lemma` is all subtoken lemmas concatenated with ` `, e.g. `de o` * with `-m` all morphological features are combined with the tag using the separator `__`, e.g. `ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art` * `dep` is the dependency relation for the syntactic root of the span (as determined by the Retokenizer) Concatenated tags will be mapped to the UD POS of the syntactic root (e.g., `ADP`) and the morphological features will be the combined features. In many cases, the original UD subtokens can be reconstructed from the available features given a language-specific lookup table, e.g., Portuguese `do / ADP_DET / Definite=Def|Gender=Masc|Number=Sing|PronType=Art` is `de / ADP`, `o / DET / Definite=Def|Gender=Masc|Number=Sing|PronType=Art` or lookup rules for forms containing open class words like Spanish `hablarlo / VERB_PRON / Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs|VerbForm=Inf`. 
* Clean up imports --- spacy/cli/convert.py | 4 +- spacy/cli/converters/conllu2json.py | 330 ++++++++++++++++++++-------- spacy/tests/test_cli.py | 37 ++++ 3 files changed, 273 insertions(+), 98 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 31931db68..2ffbeb458 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -34,6 +34,7 @@ def convert( seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, + merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False, converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, lang: ("Language (if tokenizer required)", "option", "l", str) = None, @@ -85,7 +86,8 @@ def convert( input_data, n_sents=n_sents, seg_sents=seg_sents, - use_morphology=morphology, + append_morphology=morphology, + merge_subtokens=merge_subtokens, lang=lang, model=model, no_print=no_print, diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 12b1103d4..13f2042f9 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -1,36 +1,36 @@ import re -from spacy.gold import Example -from ...gold import iob_to_biluo +from ...gold import Example +from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets +from ...language import Language +from ...tokens import Doc, Token +from .conll_ner2json import n_sents_info +from wasabi import Printer def conllu2json( - input_data, n_sents=10, use_morphology=False, lang=None, ner_map=None, **_ + input_data, n_sents=10, append_morphology=False, lang=None, ner_map=None, + merge_subtokens=False, no_print=False, **_ ): """ Convert conllu files into JSON format for use with train cli. 
- use_morphology parameter enables appending morphology to tags, which is + append_morphology parameter enables appending morphology to tags, which is useful for languages such as Spanish, where UD tags are not so rich. Extract NER tags if available and convert them so that they follow BILUO and the Wikipedia scheme """ - # by @dvsrepo, via #11 explosion/spacy-dev-resources - # by @katarkor - # name=NER is to handle NorNE MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?" + msg = Printer(no_print=no_print) + n_sents_info(msg, n_sents) docs = [] raw = "" sentences = [] - conll_data = read_conllx(input_data, use_morphology=use_morphology) - checked_for_ner = False - has_ner_tags = False + conll_data = read_conllx(input_data, append_morphology=append_morphology, + ner_tag_pattern=MISC_NER_PATTERN, ner_map=ner_map, + merge_subtokens=merge_subtokens) + has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN) for i, example in enumerate(conll_data): - if not checked_for_ner: - has_ner_tags = is_ner( - example.token_annotation.entities[0], MISC_NER_PATTERN - ) - checked_for_ner = True raw += example.text sentences.append( generate_sentence( @@ -43,137 +43,273 @@ def conllu2json( # Real-sized documents could be extracted using the comments on the # conllu document if len(sentences) % n_sents == 0: - doc = create_doc(raw, sentences, i) + doc = create_json_doc(raw, sentences, i) docs.append(doc) raw = "" sentences = [] if sentences: - doc = create_doc(raw, sentences, i) + doc = create_json_doc(raw, sentences, i) docs.append(doc) return docs -def is_ner(tag, tag_pattern): +def has_ner(input_data, ner_tag_pattern): """ Check the 10th column of the first token to determine if the file contains NER tags """ - tag_match = re.search(tag_pattern, tag) - if tag_match: - return True - elif tag == "O": - return True - else: - return False + for sent in input_data.strip().split("\n\n"): + lines = sent.strip().split("\n") + if lines: + while 
lines[0].startswith("#"): + lines.pop(0) + if lines: + parts = lines[0].split("\t") + id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts + if re.search(ner_tag_pattern, misc): + return True + else: + return False -def read_conllx(input_data, use_morphology=False, n=0): - """ Yield example data points, one for each sentence """ +def read_conllx(input_data, append_morphology=False, merge_subtokens=False, + ner_tag_pattern="", ner_map=None): + """ Yield examples, one for each sentence """ + vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc i = 0 for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: while lines[0].startswith("#"): lines.pop(0) - ids, words, tags, heads, deps, ents = [], [], [], [], [], [] - spaces = [] - for line in lines: - parts = line.split("\t") - id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts - if "-" in id_ or "." in id_: - continue - try: - id_ = int(id_) - 1 - head = (int(head) - 1) if head != "0" else id_ - dep = "ROOT" if dep == "root" else dep - tag = pos if tag == "_" else tag - tag = tag + "__" + morph if use_morphology else tag - ent = misc if misc else "O" - - ids.append(id_) - words.append(word) - tags.append(tag) - heads.append(head) - deps.append(dep) - ents.append(ent) - if "SpaceAfter=No" in misc: - spaces.append(False) - else: - spaces.append(True) - except: # noqa: E722 - print(line) - raise - raw = "" - for word, space in zip(words, spaces): - raw += word - if space: - raw += " " - example = Example(doc=raw) - example.set_token_annotation( - ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=ents - ) + example = example_from_conllu_sentence(vocab, lines, + ner_tag_pattern, merge_subtokens=merge_subtokens, + append_morphology=append_morphology, + ner_map=ner_map) yield example - i += 1 - if 1 <= n <= i: - break -def extract_tags(iob, tag_pattern, ner_map=None): +def get_entities(lines, tag_pattern, ner_map=None): + """Find 
entities in the MISC column according to the pattern and map to + final entity type with `ner_map` if mapping present. Entity tag is 'O' if + the pattern is not matched. + + lines (unicode): CONLL-U lines for one sentence + tag_pattern (unicode): Regex pattern for entity tag + ner_map (dict): Map old NER tag names to new ones, '' maps to O. + RETURNS (list): List of BILUO entity tags """ - Extract tag from MISC column according to `tag_pattern` and map to final - entity type with `ner_map` if mapping present. + miscs = [] + for line in lines: + parts = line.split("\t") + id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts + if "-" in id_ or "." in id_: + continue + miscs.append(misc) - For NorNE: - Simplify tags obtained from the dataset in order to follow Wikipedia - scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while - 'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to - 'MISC'. - """ - new_iob = [] - for tag in iob: - tag_match = re.search(tag_pattern, tag) - new_tag = "O" + iob = [] + for misc in miscs: + tag_match = re.search(tag_pattern, misc) + iob_tag = "O" if tag_match: prefix = tag_match.group(2) suffix = tag_match.group(3) if prefix and suffix: - new_tag = prefix + "-" + suffix + iob_tag = prefix + "-" + suffix if ner_map: suffix = ner_map.get(suffix, suffix) if suffix == "": - new_tag = "O" + iob_tag = "O" else: - new_tag = prefix + "-" + suffix - new_iob.append(new_tag) - return new_iob + iob_tag = prefix + "-" + suffix + iob.append(iob_tag) + return iob_to_biluo(iob) def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None): sentence = {} tokens = [] - if has_ner_tags: - iob = extract_tags(token_annotation.entities, tag_pattern, ner_map=ner_map) - biluo = iob_to_biluo(iob) - for i, id in enumerate(token_annotation.ids): + for i, id_ in enumerate(token_annotation.ids): token = {} - token["id"] = id - token["orth"] = token_annotation.words[i] - token["tag"] = 
token_annotation.tags[i] - token["head"] = token_annotation.heads[i] - id - token["dep"] = token_annotation.deps[i] + token["id"] = id_ + token["orth"] = token_annotation.get_word(i) + token["tag"] = token_annotation.get_tag(i) + token["pos"] = token_annotation.get_pos(i) + token["lemma"] = token_annotation.get_lemma(i) + token["morph"] = token_annotation.get_morph(i) + token["head"] = token_annotation.get_head(i) - id_ + token["dep"] = token_annotation.get_dep(i) if has_ner_tags: - token["ner"] = biluo[i] + token["ner"] = token_annotation.get_entity(i) tokens.append(token) sentence["tokens"] = tokens return sentence -def create_doc(raw, sentences, id): +def create_json_doc(raw, sentences, id_): doc = {} paragraph = {} - doc["id"] = id + doc["id"] = id_ doc["paragraphs"] = [] paragraph["raw"] = raw.strip() paragraph["sentences"] = sentences doc["paragraphs"].append(paragraph) return doc + + +def example_from_conllu_sentence(vocab, lines, ner_tag_pattern, + merge_subtokens=False, append_morphology=False, ner_map=None): + """Create an Example from the lines for one CoNLL-U sentence, merging + subtokens and appending morphology to tags if required. 
+ + lines (unicode): The non-comment lines for a CoNLL-U sentence + ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col + RETURNS (Example): An example containing the annotation + """ + # create a Doc with each subtoken as its own token + # if merging subtokens, each subtoken orth is the merged subtoken form + if not Token.has_extension("merged_orth"): + Token.set_extension("merged_orth", default="") + if not Token.has_extension("merged_lemma"): + Token.set_extension("merged_lemma", default="") + if not Token.has_extension("merged_morph"): + Token.set_extension("merged_morph", default="") + if not Token.has_extension("merged_spaceafter"): + Token.set_extension("merged_spaceafter", default="") + words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], [] + heads, deps = [], [] + subtok_word = "" + in_subtok = False + for i in range(len(lines)): + line = lines[i] + subtok_lines = [] + parts = line.split("\t") + id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts + if "." 
in id_: + continue + if "-" in id_: + in_subtok = True + if "-" in id_: + in_subtok = True + subtok_word = word + subtok_start, subtok_end = id_.split("-") + subtok_spaceafter = "SpaceAfter=No" not in misc + continue + if merge_subtokens and in_subtok: + words.append(subtok_word) + else: + words.append(word) + if in_subtok: + if id_ == subtok_end: + spaces.append(subtok_spaceafter) + else: + spaces.append(False) + elif "SpaceAfter=No" in misc: + spaces.append(False) + else: + spaces.append(True) + if in_subtok and id_ == subtok_end: + subtok_word = "" + in_subtok = False + id_ = int(id_) - 1 + head = (int(head) - 1) if head != "0" else id_ + tag = pos if tag == "_" else tag + morph = morph if morph != "_" else "" + dep = "ROOT" if dep == "root" else dep + lemmas.append(lemma) + poses.append(pos) + tags.append(tag) + morphs.append(morph) + heads.append(head) + deps.append(dep) + + doc = Doc(vocab, words=words, spaces=spaces) + for i in range(len(doc)): + doc[i].tag_ = tags[i] + doc[i].pos_ = poses[i] + doc[i].dep_ = deps[i] + doc[i].lemma_ = lemmas[i] + doc[i].head = doc[heads[i]] + doc[i]._.merged_orth = words[i] + doc[i]._.merged_morph = morphs[i] + doc[i]._.merged_lemma = lemmas[i] + doc[i]._.merged_spaceafter = spaces[i] + ents = get_entities(lines, ner_tag_pattern, ner_map) + doc.ents = spans_from_biluo_tags(doc, ents) + doc.is_parsed = True + doc.is_tagged = True + + if merge_subtokens: + doc = merge_conllu_subtokens(lines, doc) + + # create Example from custom Doc annotation + ids, words, tags, heads, deps = [], [], [], [], [] + pos, lemmas, morphs, spaces = [], [], [], [] + for i, t in enumerate(doc): + ids.append(i) + words.append(t._.merged_orth) + if append_morphology and t._.merged_morph: + tags.append(t.tag_ + "__" + t._.merged_morph) + else: + tags.append(t.tag_) + pos.append(t.pos_) + morphs.append(t._.merged_morph) + lemmas.append(t._.merged_lemma) + heads.append(t.head.i) + deps.append(t.dep_) + spaces.append(t._.merged_spaceafter) + ent_offsets = 
[(e.start_char, e.end_char, e.label_) for e in doc.ents] + ents = biluo_tags_from_offsets(doc, ent_offsets) + raw = "" + for word, space in zip(words, spaces): + raw += word + if space: + raw += " " + example = Example(doc=raw) + example.set_token_annotation(ids=ids, words=words, tags=tags, pos=pos, + morphs=morphs, lemmas=lemmas, heads=heads, + deps=deps, entities=ents) + return example + + +def merge_conllu_subtokens(lines, doc): + # identify and process all subtoken spans to prepare attrs for merging + subtok_spans = [] + for line in lines: + parts = line.split("\t") + id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts + if "-" in id_: + subtok_start, subtok_end = id_.split("-") + subtok_span = doc[int(subtok_start) - 1:int(subtok_end)] + subtok_spans.append(subtok_span) + # create merged tag, morph, and lemma values + tags = [] + morphs = {} + lemmas = [] + for token in subtok_span: + tags.append(token.tag_) + lemmas.append(token.lemma_) + if token._.merged_morph: + for feature in token._.merged_morph.split("|"): + field, values = feature.split("=", 1) + if not field in morphs: + morphs[field] = set() + for value in values.split(","): + morphs[field].add(value) + # create merged features for each morph field + for field, values in morphs.items(): + morphs[field] = field + "=" + ",".join(sorted(values)) + # set the same attrs on all subtok tokens so that whatever head the + # retokenizer chooses, the final attrs are available on that token + for token in subtok_span: + token._.merged_orth = token.orth_ + token._.merged_lemma = " ".join(lemmas) + token.tag_ = "_".join(tags) + token._.merged_morph = "|".join(sorted(morphs.values())) + token._.merged_spaceafter = True if subtok_span[-1].whitespace_ else False + + with doc.retokenize() as retokenizer: + for span in subtok_spans: + retokenizer.merge(span) + + return doc diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index b4aebe521..049858960 100644 --- a/spacy/tests/test_cli.py +++ 
b/spacy/tests/test_cli.py @@ -54,6 +54,43 @@ def test_cli_converters_conllu2json_name_ner_map(): assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] +def test_cli_converters_conllu2json_subtokens(): + # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu + lines = [ + "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", + "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_", + "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER", + "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER", + "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", + ] + input_data = "\n".join(lines) + converted = conllu2json(input_data, n_sents=1, merge_subtokens=True, + append_morphology=True) + assert len(converted) == 1 + assert converted[0]["id"] == 0 + assert len(converted[0]["paragraphs"]) == 1 + assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår." 
+ assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 + sent = converted[0]["paragraphs"][0]["sentences"][0] + assert len(sent["tokens"]) == 4 + tokens = sent["tokens"] + print(tokens) + assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."] + assert [t["tag"] for t in tokens] == [ + "NOUN__Definite=Ind|Gender=Masc|Number=Sing", + "PROPN_X__Gender=Fem,Masc|Tense=past", + "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin", + "PUNCT" + ] + assert [t["pos"] for t in tokens] == ['NOUN', 'PROPN', 'VERB', 'PUNCT'] + assert [t["morph"] for t in tokens] == ['Definite=Ind|Gender=Masc|Number=Sing', 'Gender=Fem,Masc|Tense=past', 'Mood=Ind|Tense=Pres|VerbForm=Fin', ''] + assert [t["lemma"] for t in tokens] == ['dommer', 'Finn Eilertsen', 'avstå', '$.'] + assert [t["head"] for t in tokens] == [1, 1, 0, -1] + assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] + assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] + + def test_cli_converters_iob2json(): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", From 5ee9d8c9b80c9e80491f320e58c0d86d2ec917b7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Jan 2020 17:45:46 +0100 Subject: [PATCH 033/187] Add MORPH attr, add support in retokenizer (#4947) * Add MORPH attr / symbol for token attrs * Update retokenizer for MORPH --- spacy/attrs.pxd | 1 + spacy/attrs.pyx | 1 + spacy/symbols.pxd | 1 + spacy/symbols.pyx | 1 + spacy/tests/doc/test_retokenize_merge.py | 4 +++- spacy/tests/doc/test_retokenize_split.py | 3 +++ spacy/tokens/_retokenize.pyx | 7 ++++++- spacy/tokens/token.pxd | 4 ++++ 8 files changed, 20 insertions(+), 2 deletions(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index d9aca078c..7fc0b9111 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -91,3 +91,4 @@ cdef enum attr_id_t: LANG ENT_KB_ID = symbols.ENT_KB_ID + MORPH diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index a601a7a66..97ca627fb 100644 --- a/spacy/attrs.pyx +++ 
b/spacy/attrs.pyx @@ -87,6 +87,7 @@ IDS = { "SPACY": SPACY, "PROB": PROB, "LANG": LANG, + "MORPH": MORPH, } diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index b95b4b805..5c1970628 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -462,3 +462,4 @@ cdef enum symbol_t: acl ENT_KB_ID + MORPH diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 36b9ffa67..128946ec7 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -462,6 +462,7 @@ IDS = { "acl": acl, "LAW": LAW, + "MORPH": MORPH, } diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index c82c04eeb..17bcd2c64 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -8,7 +8,7 @@ from ..util import get_doc def test_doc_retokenize_merge(en_tokenizer): text = "WKRO played songs by the beach boys all night" - attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} + attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"} doc = en_tokenizer(text) assert len(doc) == 9 with doc.retokenize() as retokenizer: @@ -18,9 +18,11 @@ def test_doc_retokenize_merge(en_tokenizer): assert doc[4].text == "the beach boys" assert doc[4].text_with_ws == "the beach boys " assert doc[4].tag_ == "NAMED" + assert doc[4].morph_ == "Number=Plur" assert doc[5].text == "all night" assert doc[5].text_with_ws == "all night" assert doc[5].tag_ == "NAMED" + assert doc[5].morph_ == "Number=Plur" def test_doc_retokenize_merge_children(en_tokenizer): diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 33b6fbe81..5f40da425 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -22,15 +22,18 @@ def test_doc_retokenize_split(en_vocab): "tag": ["NNP"] * 2, "lemma": ["Los", "Angeles"], "ent_type": ["GPE"] * 2, + "morph": ["Number=Sing"] * 2, }, ) assert len(doc) == 4 assert doc[0].text == "Los" assert 
doc[0].head.text == "Angeles" assert doc[0].idx == 0 + assert doc[0].morph_ == "Number=Sing" assert doc[1].idx == 3 assert doc[1].text == "Angeles" assert doc[1].head.text == "start" + assert doc[1].morph_ == "Number=Sing" assert doc[2].text == "start" assert doc[2].head.text == "." assert doc[3].text == "." diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index cd7e5a426..ec7e8a9e8 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -13,7 +13,7 @@ from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..structs cimport LexemeC, TokenC -from ..attrs cimport TAG +from ..attrs cimport TAG, MORPH from .underscore import is_writable_attr from ..attrs import intify_attrs @@ -65,6 +65,8 @@ cdef class Retokenizer: attrs["_"] = extensions else: attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) + if MORPH in attrs: + self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(attrs[MORPH])) self.merges.append((span, attrs)) def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()): @@ -96,6 +98,9 @@ cdef class Retokenizer: # NB: Since we support {"KEY": [value, value]} syntax here, this # will only "intify" the keys, not the values attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) + if MORPH in attrs: + for morph in attrs[MORPH]: + self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph)) head_offsets = [] for head in heads: if isinstance(head, Token): diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index ec5df3fac..82d9c7c2a 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -43,6 +43,8 @@ cdef class Token: return token.pos elif feat_name == TAG: return token.tag + elif feat_name == MORPH: + return token.morph elif feat_name == DEP: return token.dep elif feat_name == HEAD: @@ -71,6 +73,8 @@ cdef class Token: token.pos = value elif feat_name == TAG: token.tag = value + elif feat_name == MORPH: + 
token.morph = value elif feat_name == DEP: token.dep = value elif feat_name == HEAD: From ccef9f2f446a9c6b0a212db5a7e9f7bfb93b16b4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jan 2020 17:52:22 +0100 Subject: [PATCH 034/187] Update version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index a1880fb54..356e12269 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.3" +__version__ = "3.0.0.dev0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 0c5c8c37eef77c2576a0243d36986f7b61069f4e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Jan 2020 10:26:03 +0100 Subject: [PATCH 035/187] Depend on tqdm --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 9ea85e896..a0f88de47 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,6 +52,7 @@ install_requires = plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 + tqdm>=4.38.0,<5.0.0 [options.extras_require] lookups = From ba6d78132d9538346620c6b6cb384daa28fc0388 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Jan 2020 10:35:09 +0100 Subject: [PATCH 036/187] Fix dev version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 356e12269..6e01a855a 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev0" +__version__ = "3.0.0.dev2" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 9df0b1360df3d58d26d98ddd54ef910911faaec4 Mon Sep 17 00:00:00 2001 From: Matthew 
Honnibal Date: Thu, 30 Jan 2020 10:35:18 +0100 Subject: [PATCH 037/187] Fix ml_datasets --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index a0f88de47..a3aede089 100644 --- a/setup.cfg +++ b/setup.cfg @@ -46,6 +46,7 @@ install_requires = wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 + ml_datasets # Third-party dependencies setuptools numpy>=1.15.0 From 71b93f33bb450198457844fabdc4445a76e5ecfe Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Jan 2020 15:41:45 +0100 Subject: [PATCH 038/187] Set dev version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 6e01a855a..6a3c680ab 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev2" +__version__ = "3.0.0.dev3" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From cabd60fa1e87e95398b60a2a2246a45711a7ffee Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 3 Feb 2020 13:02:12 +0100 Subject: [PATCH 039/187] Small fixes to as_example (#4957) * label in span not writable anymore * Revert "label in span not writable anymore" This reverts commit ab442338c8c4ddd7dfbc15348f999b74f4928090. 
* fixing yield - remove redundant list --- spacy/language.py | 13 ++++++------- spacy/pipeline/pipes.pyx | 24 ++++++------------------ 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index cde9c0164..a2baa5922 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -534,7 +534,9 @@ class Language(object): if not hasattr(proc, "rehearse"): continue grads = {} - proc.rehearse(examples, sgd=get_grads, losses=losses, **config.get(name, {})) + proc.rehearse( + examples, sgd=get_grads, losses=losses, **config.get(name, {}) + ) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) return losses @@ -590,10 +592,7 @@ class Language(object): kwargs = component_cfg.get(name, {}) kwargs.update(cfg) proc.begin_training( - get_examples, - pipeline=self.pipeline, - sgd=self._optimizer, - **kwargs + get_examples, pipeline=self.pipeline, sgd=self._optimizer, **kwargs ) self._link_components() return self._optimizer @@ -701,7 +700,7 @@ class Language(object): cleanup=False, component_cfg=None, n_process=1, - as_example=False + as_example=False, ): """Process texts as a stream, and yield `Doc` objects in order. @@ -737,7 +736,7 @@ class Language(object): disable=disable, n_process=n_process, component_cfg=component_cfg, - as_example=False # TODO: shouldn't this be as_example=as_example ? 
+ as_example=as_example, ) for doc, context in zip(docs, contexts): yield (doc, context) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index bca53bc03..ca39de959 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -108,11 +108,9 @@ class Pipe(object): self.set_annotations(docs, predictions) if as_example: - annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - annotated_examples.append(ex) - yield from annotated_examples + yield ex else: yield from docs @@ -329,11 +327,9 @@ class Tensorizer(Pipe): self.set_annotations(docs, tensors) if as_example: - annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - annotated_examples.append(ex) - yield from annotated_examples + yield ex else: yield from docs @@ -464,11 +460,9 @@ class Tagger(Pipe): self.set_annotations(docs, tag_ids) if as_example: - annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - annotated_examples.append(ex) - yield from annotated_examples + yield ex else: yield from docs @@ -1256,11 +1250,9 @@ class TextCategorizer(Pipe): self.set_annotations(docs, scores, tensors=tensors) if as_example: - annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - annotated_examples.append(ex) - yield from annotated_examples + yield ex else: yield from docs @@ -1616,11 +1608,9 @@ class EntityLinker(Pipe): self.set_annotations(docs, kb_ids, tensors=tensors) if as_example: - annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - annotated_examples.append(ex) - yield from annotated_examples + yield ex else: yield from docs @@ -1834,11 +1824,9 @@ class Sentencizer(Pipe): else: self.set_annotations(docs, predictions) if as_example: - annotated_examples = [] for ex, doc in zip(examples, docs): ex.doc = doc - annotated_examples.append(ex) - yield from annotated_examples + yield ex else: yield from docs From 781e95cf536cd5720d07ec80b2cb89eaa4b41290 Mon Sep 17 00:00:00 2001 From: Sofie Van 
Landeghem Date: Tue, 11 Feb 2020 02:31:49 +0100 Subject: [PATCH 040/187] Ensure doc.similarity returns a float (on develop) (#4969) --- spacy/tokens/doc.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 15f77d621..aec06d620 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -391,7 +391,9 @@ cdef class Doc: return 0.0 vector = self.vector xp = get_array_module(vector) - return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() @property def has_vector(self): From 9b84f987bdca50891b293a65762a00145307a3af Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 11 Feb 2020 02:33:16 +0100 Subject: [PATCH 041/187] fix grad_clip naming (#4967) --- spacy/util.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 53fa81402..0cc11cef7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -796,8 +796,6 @@ def create_default_optimizer(): beta2 = env_opt("optimizer_B2", 0.999) eps = env_opt("optimizer_eps", 1e-8) L2 = env_opt("L2_penalty", 1e-6) - max_grad_norm = env_opt("grad_norm_clip", 1.0) - optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops) - optimizer.max_grad_norm = max_grad_norm - optimizer.device = ops.device_type + grad_clip = env_opt("grad_norm_clip", 1.0) + optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops, grad_clip=grad_clip) return optimizer From 2ed49404e30f206894e8c25fb28f8135d0a69077 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 11 Feb 2020 17:46:18 -0500 Subject: [PATCH 042/187] Improve setup.py and call into Cython directly (#4952) * Improve setup.py and call into Cython directly * Add numpy to setup_requires * Improve clean helper * Update setup.cfg 
* Try if it builds without pyproject.toml * Update MANIFEST.in --- MANIFEST.in | 2 +- bin/cythonize.py | 169 ------------------------------------------ pyproject.toml | 3 - setup.cfg | 1 + setup.py | 165 ++++++++++++++++------------------------- spacy/tokenizer.pyx | 2 + spacy/tokens/span.pyx | 1 + 7 files changed, 67 insertions(+), 276 deletions(-) delete mode 100755 bin/cythonize.py delete mode 100644 pyproject.toml diff --git a/MANIFEST.in b/MANIFEST.in index 78655a5f4..266af1b0a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ recursive-include include *.h -recursive-include spacy *.txt +recursive-include spacy *.pyx *.pxd *.txt include LICENSE include README.md include bin/spacy diff --git a/bin/cythonize.py b/bin/cythonize.py deleted file mode 100755 index 554252294..000000000 --- a/bin/cythonize.py +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env python -""" cythonize.py - -Cythonize pyx files into C++ files as needed. - -Usage: cythonize.py [root] - -Checks pyx files to see if they have been changed relative to their -corresponding C++ files. If they have, then runs cython on these files to -recreate the C++ files. - -Additionally, checks pxd files and setup.py if they have been changed. If -they have, rebuilds everything. - -Change detection based on file hashes stored in JSON format. - -For now, this script should be run by developers when changing Cython files -and the resulting C++ files checked in, so that end-users (and Python-only -developers) do not get the Cython dependencies. - -Based upon: - -https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py -https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py - -Note: this script does not check any of the dependent C++ libraries. 
-""" -from __future__ import print_function - -import os -import sys -import json -import hashlib -import subprocess -import argparse - - -HASH_FILE = "cythonize.json" - - -def process_pyx(fromfile, tofile, language_level="-3"): - print("Processing %s" % fromfile) - try: - from Cython.Compiler.Version import version as cython_version - from distutils.version import LooseVersion - - if LooseVersion(cython_version) < LooseVersion("0.25"): - raise Exception("Require Cython >= 0.25") - - except ImportError: - pass - - flags = ["--fast-fail", language_level] - if tofile.endswith(".cpp"): - flags += ["--cplus"] - - try: - try: - r = subprocess.call( - ["cython"] + flags + ["-o", tofile, fromfile], env=os.environ - ) # See Issue #791 - if r != 0: - raise Exception("Cython failed") - except OSError: - # There are ways of installing Cython that don't result in a cython - # executable on the path, see gh-2397. - r = subprocess.call( - [ - sys.executable, - "-c", - "import sys; from Cython.Compiler.Main import " - "setuptools_main as main; sys.exit(main())", - ] - + flags - + ["-o", tofile, fromfile] - ) - if r != 0: - raise Exception("Cython failed") - except OSError: - raise OSError("Cython needs to be installed") - - -def preserve_cwd(path, func, *args): - orig_cwd = os.getcwd() - try: - os.chdir(path) - func(*args) - finally: - os.chdir(orig_cwd) - - -def load_hashes(filename): - try: - return json.load(open(filename)) - except (ValueError, IOError): - return {} - - -def save_hashes(hash_db, filename): - with open(filename, "w") as f: - f.write(json.dumps(hash_db)) - - -def get_hash(path): - return hashlib.md5(open(path, "rb").read()).hexdigest() - - -def hash_changed(base, path, db): - full_path = os.path.normpath(os.path.join(base, path)) - return not get_hash(full_path) == db.get(full_path) - - -def hash_add(base, path, db): - full_path = os.path.normpath(os.path.join(base, path)) - db[full_path] = get_hash(full_path) - - -def process(base, filename, db): - root, ext = 
os.path.splitext(filename) - if ext in [".pyx", ".cpp"]: - if hash_changed(base, filename, db) or not os.path.isfile( - os.path.join(base, root + ".cpp") - ): - preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp") - hash_add(base, root + ".cpp", db) - hash_add(base, root + ".pyx", db) - - -def check_changes(root, db): - res = False - new_db = {} - - setup_filename = "setup.py" - hash_add(".", setup_filename, new_db) - if hash_changed(".", setup_filename, db): - res = True - - for base, _, files in os.walk(root): - for filename in files: - if filename.endswith(".pxd"): - hash_add(base, filename, new_db) - if hash_changed(base, filename, db): - res = True - - if res: - db.clear() - db.update(new_db) - return res - - -def run(root): - db = load_hashes(HASH_FILE) - - try: - check_changes(root, db) - for base, _, files in os.walk(root): - for filename in files: - process(base, filename, db) - finally: - save_hashes(db, HASH_FILE) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Cythonize pyx files into C++ files as needed" - ) - parser.add_argument("root", help="root directory") - args = parser.parse_args() - run(args.root) diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index fed528d4a..000000000 --- a/pyproject.toml +++ /dev/null @@ -1,3 +0,0 @@ -[build-system] -requires = ["setuptools"] -build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg index a3aede089..f360cac37 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,6 +31,7 @@ python_requires = >=3.6 setup_requires = wheel cython>=0.25 + numpy>=1.15.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/setup.py b/setup.py index 1afdc7ae4..31f22ba3f 100755 --- a/setup.py +++ b/setup.py @@ -1,34 +1,22 @@ #!/usr/bin/env python -import io -import os -import subprocess import sys -import contextlib from distutils.command.build_ext import build_ext from distutils.sysconfig import 
get_python_inc import distutils.util from distutils import ccompiler, msvccompiler from setuptools import Extension, setup, find_packages +import numpy +from pathlib import Path +from Cython.Build import cythonize +from Cython.Compiler import Options -def is_new_osx(): - """Check whether we're on OSX >= 10.10""" - name = distutils.util.get_platform() - if sys.platform != "darwin": - return False - elif name.startswith("macosx-10"): - minor_version = int(name.split("-")[1].split(".")[1]) - if minor_version >= 7: - return True - else: - return False - else: - return False +# Preserve `__doc__` on functions and classes +# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options +Options.docstrings = True PACKAGES = find_packages() - - MOD_NAMES = [ "spacy.parts_of_speech", "spacy.strings", @@ -61,16 +49,32 @@ MOD_NAMES = [ "spacy.symbols", "spacy.vectors", ] - - COMPILE_OPTIONS = { "msvc": ["/Ox", "/EHsc"], "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"], "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"], } - - LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []} +COMPILER_DIRECTIVES = { + "language_level": -3, + "embedsignature": True, + "annotation_typing": False, +} + + +def is_new_osx(): + """Check whether we're on OSX >= 10.10""" + name = distutils.util.get_platform() + if sys.platform != "darwin": + return False + elif name.startswith("macosx-10"): + minor_version = int(name.split("-")[1].split(".")[1]) + if minor_version >= 7: + return True + else: + return False + else: + return False if is_new_osx(): @@ -103,95 +107,50 @@ class build_ext_subclass(build_ext, build_ext_options): build_ext.build_extensions(self) -def generate_cython(root, source): - print("Cythonizing sources") - p = subprocess.call( - [sys.executable, os.path.join(root, "bin", "cythonize.py"), source], - env=os.environ, - ) - if p != 0: - raise RuntimeError("Running cythonize failed") - - -def 
is_source_release(path): - return os.path.exists(os.path.join(path, "PKG-INFO")) - - def clean(path): - for name in MOD_NAMES: - name = name.replace(".", "/") - for ext in [".so", ".html", ".cpp", ".c"]: - file_path = os.path.join(path, name + ext) - if os.path.exists(file_path): - os.unlink(file_path) - - -@contextlib.contextmanager -def chdir(new_dir): - old_dir = os.getcwd() - try: - os.chdir(new_dir) - sys.path.insert(0, new_dir) - yield - finally: - del sys.path[0] - os.chdir(old_dir) + for path in path.glob("**/*"): + if path.is_file() and path.suffix in (".so", ".cpp"): + print(f"Deleting {path.name}") + path.unlink() def setup_package(): - root = os.path.abspath(os.path.dirname(__file__)) + root = Path(__file__).parent if len(sys.argv) > 1 and sys.argv[1] == "clean": - return clean(root) + return clean(root / "spacy") - with chdir(root): - with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f: - about = {} - exec(f.read(), about) + with (root / "spacy" / "about.py").open("r") as f: + about = {} + exec(f.read(), about) - include_dirs = [ - get_python_inc(plat_specific=True), - os.path.join(root, "include"), - ] + include_dirs = [ + get_python_inc(plat_specific=True), + numpy.get_include(), + str(root / "include"), + ] + if ( + ccompiler.new_compiler().compiler_type == "msvc" + and msvccompiler.get_build_version() == 9 + ): + include_dirs.append(str(root / "include" / "msvc9")) + ext_modules = [] + for name in MOD_NAMES: + mod_path = name.replace(".", "/") + ".pyx" + ext = Extension(name, [mod_path], language="c++") + ext_modules.append(ext) + print("Cythonizing sources") + ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES) - if ( - ccompiler.new_compiler().compiler_type == "msvc" - and msvccompiler.get_build_version() == 9 - ): - include_dirs.append(os.path.join(root, "include", "msvc9")) - - ext_modules = [] - for mod_name in MOD_NAMES: - mod_path = mod_name.replace(".", "/") + ".cpp" - extra_link_args = [] - 
# ??? - # Imported from patch from @mikepb - # See Issue #267. Running blind here... - if sys.platform == "darwin": - dylib_path = [".." for _ in range(mod_name.count("."))] - dylib_path = "/".join(dylib_path) - dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path - extra_link_args.append("-Wl,-rpath,%s" % dylib_path) - ext_modules.append( - Extension( - mod_name, - [mod_path], - language="c++", - include_dirs=include_dirs, - extra_link_args=extra_link_args, - ) - ) - - if not is_source_release(root): - generate_cython(root, "spacy") - - setup( - name="spacy", - packages=PACKAGES, - version=about["__version__"], - ext_modules=ext_modules, - cmdclass={"build_ext": build_ext_subclass}, - ) + setup( + name="spacy", + packages=PACKAGES, + version=about["__version__"], + ext_modules=ext_modules, + cmdclass={"build_ext": build_ext_subclass}, + include_dirs=include_dirs, + package_data={"": ["*.pyx", "*.pxd", "*.pxi", "*.cpp"]}, + ) if __name__ == "__main__": diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 7491a11fc..25d9f239d 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,5 +1,7 @@ # cython: embedsignature=True # cython: profile=True +from __future__ import unicode_literals + from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc from libc.string cimport memcpy, memset diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 7ab1c1d18..d24a38029 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,3 +1,4 @@ +from __future__ import unicode_literals cimport numpy as np from libc.math cimport sqrt From 207994871106bde872bd76224fbb4cf195f01e66 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 22:49:21 +0100 Subject: [PATCH 043/187] add build dependencies back to pyproject.toml --- pyproject.toml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fed528d4a..8a6ababf3 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,11 @@ [build-system] -requires = ["setuptools"] +requires = [ + "setuptools", + "wheel", + "cython>=0.25", + "cymem>=2.0.2,<2.1.0", + "preshed>=3.0.2,<3.1.0", + "murmurhash>=0.28.0,<1.1.0", + "thinc==7.4.0.dev0", +] build-backend = "setuptools.build_meta" From 34986c7bfd1d4634861a5c4b54cf90ef18090ff4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 22:49:50 +0100 Subject: [PATCH 044/187] test versions of required libs across different places --- spacy/tests/test_requirements.py | 61 ++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 spacy/tests/test_requirements.py diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py new file mode 100644 index 000000000..8c2b49b85 --- /dev/null +++ b/spacy/tests/test_requirements.py @@ -0,0 +1,61 @@ +import re +from pathlib import Path + + +def test_build_dependencies(en_vocab): + libs_ignore_requirements = ["pytest", "pytest-timeout", "mock", "flake8", "jsonschema"] + libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] + + # check requirements.txt + root_dir = Path(__file__).parent.parent.parent + req_file = root_dir / "requirements.txt" + req_dict = {} + with req_file.open() as f: + lines = f.readlines() + for line in lines: + line = line.strip() + if not line.startswith("#"): + lib, v = _parse_req(line) + if lib and lib not in libs_ignore_requirements: + req_dict[lib] = v + + # check setup.cfg and compare to requirements.txt + # also fails when there are missing or additional libs + setup_file = root_dir / "setup.cfg" + with setup_file.open() as f: + lines = f.readlines() + setup_keys = set() + for line in lines: + line = line.strip() + if not line.startswith("#"): + lib, v = _parse_req(line) + if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup: + req_v = req_dict.get(lib, None) + assert req_v is not None # if fail: setup.cfg contains a lib not in requirements.txt + assert (lib+v) 
== (lib+req_v) # if fail: setup.cfg & requirements.txt have conflicting versions + setup_keys.add(lib) + assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg + + # check pyproject.toml and compare the versions of the libs to requirements.txt + # does not fail when there are missing or additional libs + toml_file = root_dir / "pyproject.toml" + with toml_file.open() as f: + lines = f.readlines() + toml_keys = set() + for line in lines: + line = line.strip() + line = line.strip(",") + line = line.strip("\"") + if not line.startswith("#"): + lib, v = _parse_req(line) + if lib: + req_v = req_dict.get(lib, None) + assert (lib+v) == (lib+req_v) # if fail: pyproject.toml & requirements.txt have conflicting versions + toml_keys.add(lib) + +def _parse_req(line): + lib = re.match(r"^[a-z0-9\-]*", line).group(0) + v = line.replace(lib, "").strip() + if not re.match(r"^[<>=][<>=].*", v): + return None, None + return lib, v \ No newline at end of file From 6bbd81656967fd93dfeb9af40c9194536b31a135 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 22:50:27 +0100 Subject: [PATCH 045/187] formatting --- spacy/tests/test_requirements.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py index 8c2b49b85..cb2f51725 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -34,7 +34,7 @@ def test_build_dependencies(en_vocab): assert req_v is not None # if fail: setup.cfg contains a lib not in requirements.txt assert (lib+v) == (lib+req_v) # if fail: setup.cfg & requirements.txt have conflicting versions setup_keys.add(lib) - assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg + assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg # check pyproject.toml and compare the versions of 
the libs to requirements.txt # does not fail when there are missing or additional libs @@ -53,9 +53,10 @@ def test_build_dependencies(en_vocab): assert (lib+v) == (lib+req_v) # if fail: pyproject.toml & requirements.txt have conflicting versions toml_keys.add(lib) + def _parse_req(line): lib = re.match(r"^[a-z0-9\-]*", line).group(0) v = line.replace(lib, "").strip() if not re.match(r"^[<>=][<>=].*", v): return None, None - return lib, v \ No newline at end of file + return lib, v From 2729d9164d02a6795ccf93f0b9414856644e6dbc Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 22:59:37 +0100 Subject: [PATCH 046/187] cleanup --- spacy/tests/test_requirements.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py index cb2f51725..320fc5763 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -3,6 +3,7 @@ from pathlib import Path def test_build_dependencies(en_vocab): + # Check that library requirements are pinned exactly the same across different setup files. 
libs_ignore_requirements = ["pytest", "pytest-timeout", "mock", "flake8", "jsonschema"] libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] @@ -41,17 +42,13 @@ def test_build_dependencies(en_vocab): toml_file = root_dir / "pyproject.toml" with toml_file.open() as f: lines = f.readlines() - toml_keys = set() for line in lines: - line = line.strip() - line = line.strip(",") - line = line.strip("\"") + line = line.strip().strip(",").strip("\"") if not line.startswith("#"): lib, v = _parse_req(line) if lib: req_v = req_dict.get(lib, None) assert (lib+v) == (lib+req_v) # if fail: pyproject.toml & requirements.txt have conflicting versions - toml_keys.add(lib) def _parse_req(line): From 80e95d02b148fd49e008058413012b757e6c7abb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 18 Feb 2020 14:32:53 +0100 Subject: [PATCH 047/187] Allow spacy attr in token pattern --- spacy/schemas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/schemas.py b/spacy/schemas.py index 4a5054125..2268bf100 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -103,6 +103,7 @@ class TokenPattern(BaseModel): ent_type: Optional[StringValue] = None norm: Optional[StringValue] = None length: Optional[NumberValue] = None + spacy: Optional[StrictBool] = None is_alpha: Optional[StrictBool] = None is_ascii: Optional[StrictBool] = None is_digit: Optional[StrictBool] = None From 1278161f4715fa3076e4d844d2ef1b6377a855b1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 18 Feb 2020 15:17:03 +0100 Subject: [PATCH 048/187] Tidy up and fix issues --- spacy/cli/converters/conllu2json.py | 2 +- spacy/errors.py | 4 +- spacy/gold.pyx | 5 - spacy/lang/sk/examples.py | 4 - spacy/lang/sk/lex_attrs.py | 3 - spacy/lang/sk/tag_map.py | 2921 +++++++++++----------- spacy/syntax/nn_parser.pyx | 4 +- spacy/tests/regression/test_issue4849.py | 16 +- spacy/tests/regression/test_issue4924.py | 17 +- 9 files changed, 1476 insertions(+), 1500 deletions(-) diff --git a/spacy/cli/converters/conllu2json.py 
b/spacy/cli/converters/conllu2json.py index f65e6a187..ecdc2ae66 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -235,7 +235,7 @@ def example_from_conllu_sentence( subtok_word = "" in_subtok = False id_ = int(id_) - 1 - head = (int(head) - 1) if head != "0" else id_ + head = (int(head) - 1) if head not in ("0", "_") else id_ tag = pos if tag == "_" else tag morph = morph if morph != "_" else "" dep = "ROOT" if dep == "root" else dep diff --git a/spacy/errors.py b/spacy/errors.py index 7ef3abc00..e6c0b069e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -541,8 +541,8 @@ class Errors(object): E997 = ("Tokenizer special cases are not allowed to modify the text. " "This would map '{chunk}' to '{orth}' given token attributes " "'{token_attrs}'.") - E998 = ("Can only create GoldParse's from Example's without a Doc, " - "if get_gold_parses() is called with a Vocab object.") + E998 = ("Can only create GoldParse objects from Example objects without a " + "Doc if get_gold_parses() is called with a Vocab object.") E999 = ("Encountered an unexpected format for the dictionary holding " "gold annotations: {gold_dict}") diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 5e46f274e..eca801176 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -991,11 +991,6 @@ cdef class GoldParse: self.cats = {} if cats is None else dict(cats) self.links = {} if links is None else dict(links) - # orig_annot is used as an iterator in `nlp.evalate` even if self.length == 0, - # so set a empty list to avoid error. - # if self.lenght > 0, this is modified latter. 
- self.orig_annot = [] - # avoid allocating memory if the doc does not contain any tokens if self.length > 0: if not words: diff --git a/spacy/lang/sk/examples.py b/spacy/lang/sk/examples.py index 486ea375e..736109a7c 100644 --- a/spacy/lang/sk/examples.py +++ b/spacy/lang/sk/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/sk/lex_attrs.py b/spacy/lang/sk/lex_attrs.py index 3dea4d8f0..0caf62e8e 100644 --- a/spacy/lang/sk/lex_attrs.py +++ b/spacy/lang/sk/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/sk/tag_map.py b/spacy/lang/sk/tag_map.py index 015c8cba3..d159a6a51 100644 --- a/spacy/lang/sk/tag_map.py +++ b/spacy/lang/sk/tag_map.py @@ -1,1467 +1,1464 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, AUX, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON +from ...symbols import POS, AUX, ADJ, CCONJ, NUM, ADV, ADP, X, VERB +from ...symbols import NOUN, PART, INTJ, PRON # Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html # fmt: off TAG_MAP = { - "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3y": {POS: ADJ, "morph": 
"Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - 
"AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3y": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis2z": {POS: 
ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAis7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - 
"AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "AAms1x": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7y": {POS: 
ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7y": {POS: ADJ, "morph": 
"Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "AAns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7y": {POS: ADJ, "morph": 
"Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip6x": {POS: ADJ, 
"morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - 
"AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "AUfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"}, 
- "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, - "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, - "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip4x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp5x": {POS: 
ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, - "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, - "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - 
"AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, - "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "AUns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, - "Dx": {POS: ADV, "morph": "Degree=Pos"}, - "Dy": {POS: ADV, "morph": "Degree=Cmp"}, - "Dz": {POS: ADV, "morph": "Degree=Sup"}, - "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"}, - "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"}, - "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"}, - "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"}, - "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"}, - "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"}, - "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"}, - "Ev3": {POS: ADP, "morph": "AdpType=Voc|Case=Dat"}, - "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"}, - "Ev6": {POS: ADP, "morph": "AdpType=Voc|Case=Loc"}, - "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"}, - "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2x": {POS: VERB, 
"morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs1y": {POS: VERB, "morph": 
"Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkfs7z": {POS: VERB, "morph": 
"Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6y": {POS: VERB, "morph": 
"Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5x": {POS: VERB, 
"morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp3z": {POS: 
VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2y": 
{POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, - 
"Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - 
"Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gknp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, - "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - 
"Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, - "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - 
"Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - 
"Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - 
"Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtmp1x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - 
"Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5x": {POS: VERB, "morph": 
"Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - 
"Gtnp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, - "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3x": {POS: VERB, "morph": 
"Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, - "J": {POS: INTJ, "morph": "_"}, - "NAfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - 
"NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "NAfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "NAip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAis7": {POS: NUM, "morph": 
"Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "NAns1": {POS: NUM, "morph": 
"Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "ND": {POS: NUM, "morph": "MorphPos=Adv"}, - "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip4": {POS: 
NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, - "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms4": {POS: NUM, "morph": 
"Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, - "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"}, - "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"}, - "NNfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNfp7": {POS: NUM, "morph": 
"Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"}, - "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, - "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"}, - "NSfp1": {POS: NUM, "morph": 
"Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis3": {POS: NUM, "morph": 
"Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NSis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, - "NUip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis4": {POS: NUM, "morph": 
"Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, - "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns6": {POS: NUM, 
"morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, - "O": {POS: CCONJ, "morph": "_"}, - "OY": {POS: CCONJ, "morph": "Mood=Cnd"}, - "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip5": {POS: PRON, "morph": 
"Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms2": {POS: PRON, "morph": 
"Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, - "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, - "PD": {POS: PRON, "morph": 
"MorphPos=Adv|PronType=Prs"}, - "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - 
"PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms2": {POS: 
PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, - "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns4": {POS: PRON, "morph": 
"Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, - "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"}, - "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"}, - "PSfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp4": {POS: PRON, "morph": 
"Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, - "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, - "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp4": {POS: PRON, "morph": 
"Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis3": {POS: PRON, 
"morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUms7": {POS: PRON, "morph": 
"Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, - "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "PUns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, - "Q": {POS: X, "morph": "Hyph=Yes"}, - "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"}, - "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfp7": 
{POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, - "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, - "SAip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAmp1": {POS: NOUN, "morph": 
"Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, - "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, - "SAnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, - "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns2": {POS: NOUN, "morph": 
"Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, - "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, - "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, - "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp5": {POS: NOUN, "morph": 
"Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, - "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, - "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis6": {POS: NOUN, "morph": 
"Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, - "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, - "SSnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp6": {POS: NOUN, "morph": 
"Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"}, - "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"}, - "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, - "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"}, - "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip2": {POS: NOUN, "morph": 
"Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, - "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms2": {POS: NOUN, "morph": 
"Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, - "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, - "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, - "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, - "T": {POS: PART, "morph": "_"}, - "TY": {POS: PART, "morph": "Mood=Cnd"}, - "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - 
"VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsa+": {POS: VERB, 
"morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, - "VBjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, - "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"}, - "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"}, - "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"}, - "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"}, - "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"}, - "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"}, - "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"}, - "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"}, - "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"}, - "VIe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Inf"}, - "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"}, - "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"}, - "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdpc-": {POS: VERB, "morph": 
"Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesb-": {POS: VERB, "morph": 
"Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, - "VKjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, - "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpah+": {POS: 
VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdpcn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdpcn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsam+": {POS: VERB, 
"morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsbn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsbn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscm-": {POS: VERB, "morph": 
"Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepbh-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesaf-": {POS: VERB, "morph": 
"Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesan+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescf-": 
{POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLesci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLescn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcm-": {POS: VERB, 
"morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjpcn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbi+": {POS: VERB, "morph": 
"Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, - "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, - "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - 
"VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, - "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, - "VMjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, - "VMjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, - "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, - "W": {POS: X, "morph": "Abbr=Yes"}, - "Y": {POS: AUX, "morph": "Mood=Cnd"}, + "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2x": {POS: ADJ, "morph": 
"Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + 
"AAfs2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2y": 
{POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis1y": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAis7z": {POS: 
ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + 
"AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "AAms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6x": {POS: ADJ, "morph": 
"Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, 
+ "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "AAns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6x": {POS: ADJ, "morph": 
"Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip2x": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFmp7x": {POS: 
ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns6x": {POS: ADJ, "morph": 
"Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "AUfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"}, + "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"}, + "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip1y": {POS: ADJ, "morph": 
"Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp1z": {POS: 
ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"}, + "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"}, + "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp2x": {POS: ADJ, "morph": 
"Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"}, + "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "AUns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"}, + "Dx": {POS: ADV, "morph": "Degree=Pos"}, + "Dy": {POS: ADV, "morph": "Degree=Cmp"}, + "Dz": {POS: ADV, "morph": "Degree=Sup"}, + "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"}, + "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"}, + "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"}, + "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"}, + "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"}, + "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"}, + "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"}, + "Ev3": {POS: ADP, "morph": "AdpType=Voc|Case=Dat"}, + "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"}, + "Ev6": {POS: 
ADP, "morph": "AdpType=Voc|Case=Loc"}, + "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"}, + "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7x": {POS: VERB, "morph": 
"Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6y": {POS: VERB, "morph": 
"Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5x": {POS: VERB, "morph": 
"Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis3z": {POS: VERB, 
"morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2y": {POS: 
VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkms1x": 
{POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + 
"Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp5z": {POS: VERB, "morph": 
"Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gknp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"}, + "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5x": {POS: VERB, "morph": 
"Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"}, + "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4y": {POS: VERB, "morph": 
"Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs3z": {POS: VERB, "morph": 
"Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip2z": {POS: VERB, "morph": 
"Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + 
"Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis6z": {POS: VERB, "morph": 
"Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + 
"Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms3z": {POS: VERB, "morph": 
"Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2y": {POS: VERB, "morph": 
"Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"}, + "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns1z": {POS: 
VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"}, + 
"J": {POS: INTJ, "morph": "_"}, + "NAfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "NAfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "NAip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + 
"NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp3": {POS: NUM, "morph": 
"Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "NAns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "ND": {POS: NUM, "morph": "MorphPos=Adv"}, + "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs6": {POS: NUM, "morph": 
"Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp6": {POS: NUM, "morph": 
"Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"}, + "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"}, + "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"}, + "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"}, + "NNfp1": {POS: NUM, "morph": 
"Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"}, + "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"}, + "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp2": {POS: NUM, "morph": 
"Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"}, + "NSfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip5": {POS: NUM, "morph": 
"Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NSis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, + "NUip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip6": {POS: NUM, "morph": 
"Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUnp7": {POS: NUM, "morph": 
"Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, + "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, + "O": {POS: CCONJ, "morph": "_"}, + "OY": {POS: CCONJ, "morph": "Mood=Cnd"}, + "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAfs7": {POS: PRON, "morph": 
"Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp4": {POS: PRON, "morph": 
"Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"}, + "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns2": {POS: PRON, "morph": 
"Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"}, + "PD": {POS: PRON, "morph": "MorphPos=Adv|PronType=Prs"}, + "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFip1": {POS: PRON, "morph": 
"Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp3": {POS: PRON, "morph": 
"Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + 
"PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"}, + "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"}, + "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"}, + "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs4": {POS: PRON, "morph": 
"Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"}, + "PSfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"}, + "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns4": {POS: PRON, "morph": 
"Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"}, + "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + 
"PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUms1": {POS: PRON, "morph": 
"Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"}, + "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "PUns7": {POS: PRON, "morph": 
"Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"}, + "Q": {POS: X, "morph": "Hyph=Yes"}, + "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"}, + "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"}, + "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"}, + "SAip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAis1": {POS: NOUN, "morph": 
"Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"}, + "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"}, + "SAnp1": {POS: NOUN, "morph": 
"Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"}, + "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"}, + "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"}, + "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs4": {POS: NOUN, "morph": 
"Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"}, + "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"}, + "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"}, + "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip6": {POS: NOUN, "morph": 
"Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"}, + "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms6": 
{POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"}, + "SSnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"}, + "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"}, + "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"}, + "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"}, + 
"SUfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"}, + "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp2": {POS: NOUN, "morph": 
"Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"}, + "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"}, + "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"}, + "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns3": {POS: NOUN, "morph": 
"Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"}, + "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"}, + "T": {POS: PART, "morph": "_"}, + "TY": {POS: PART, "morph": "Mood=Cnd"}, + "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpa+": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"}, + "VBjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"}, + "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"}, + "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"}, + "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"}, + "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"}, + "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"}, + "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"}, + "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"}, + "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"}, + "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"}, + "VIe+": {POS: VERB, "morph": 
"Aspect=Imp|Polarity=Pos|VerbForm=Inf"}, + "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"}, + "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"}, + "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdpc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKepb-": {POS: VERB, "morph": 
"Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsa+": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"}, + "VKjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"}, + "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpah+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdpcn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdpcn+": {POS: VERB, "morph": 
"Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsbn-": {POS: VERB, "morph": 
"Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsbn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepbh-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepci-": {POS: VERB, "morph": 
"Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesaf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesan+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbi-": 
{POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLesci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLescn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + 
"VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjpcn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsam+": {POS: VERB, "morph": 
"Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscm-": {POS: VERB, "morph": 
"Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"}, + "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"}, + "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"}, + "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"}, + "VMjpb-": {POS: VERB, "morph": 
"Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"}, + "VMjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"}, + "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"}, + "W": {POS: X, "morph": "Abbr=Yes"}, + "Y": {POS: AUX, "morph": "Mood=Cnd"}, } diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 26504a3c0..8e55d3873 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -77,7 +77,7 @@ cdef class Parser: tok2vec = Tok2Vec(width=token_vector_width, embed_size=embed_size, conv_depth=conv_depth, - window_size=window_size, + window_size=conv_window, cnn_maxout_pieces=t2v_pieces, subword_features=subword_features, pretrained_vectors=pretrained_vectors, @@ -105,7 +105,7 @@ cdef class Parser: 'bilstm_depth': bilstm_depth, 'self_attn_depth': self_attn_depth, 'conv_depth': conv_depth, - 'window_size': window_size, + 'window_size': conv_window, 'embed_size': embed_size, 'cnn_maxout_pieces': t2v_pieces } diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py index 834219773..ddbf6f7a0 100644 --- a/spacy/tests/regression/test_issue4849.py +++ b/spacy/tests/regression/test_issue4849.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.pipeline import EntityRuler @@ -9,11 +6,12 @@ def test_issue4849(): nlp = English() ruler = EntityRuler( - nlp, patterns=[ - {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'}, - {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'}, + nlp, + patterns=[ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, ], - phrase_matcher_attr="LOWER" + 
phrase_matcher_attr="LOWER", ) nlp.add_pipe(ruler) @@ -27,10 +25,10 @@ def test_issue4849(): count_ents = 0 for doc in nlp.pipe([text], n_process=1): count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert(count_ents == 2) + assert count_ents == 2 # USING 2 PROCESSES count_ents = 0 for doc in nlp.pipe([text], n_process=2): count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert (count_ents == 2) + assert count_ents == 2 diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py index 8aea2c3d5..5665d6d0f 100644 --- a/spacy/tests/regression/test_issue4924.py +++ b/spacy/tests/regression/test_issue4924.py @@ -1,16 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest - -import spacy +from spacy.language import Language -@pytest.fixture -def nlp(): - return spacy.blank("en") - - -def test_evaluate(nlp): +def test_evaluate(): + nlp = Language() docs_golds = [("", {})] - nlp.evaluate(docs_golds) + with pytest.raises(ValueError): + nlp.evaluate(docs_golds) From e3f40a6a0f590088d16dbdbc252d9304cf482cfc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 18 Feb 2020 15:38:18 +0100 Subject: [PATCH 049/187] Tidy up and auto-format --- spacy/__init__.py | 2 +- spacy/cli/__init__.py | 2 +- spacy/cli/debug_data.py | 6 +-- spacy/cli/pretrain.py | 10 ++--- spacy/cli/train.py | 26 ++++++------- spacy/cli/train_from_config.py | 36 ++++++++---------- spacy/compat.py | 2 +- spacy/displacy/templates.py | 1 - spacy/glossary.py | 1 - spacy/gold.pxd | 4 +- spacy/kb.pxd | 5 +-- spacy/lang/af/stop_words.py | 1 - spacy/lang/bg/stop_words.py | 1 - spacy/lang/bn/examples.py | 1 - spacy/lang/bn/stop_words.py | 1 - spacy/lang/ca/examples.py | 1 - spacy/lang/cs/stop_words.py | 1 - spacy/lang/da/examples.py | 1 - spacy/lang/de/examples.py | 1 - spacy/lang/de/stop_words.py | 3 +- spacy/lang/el/get_pos_from_wiktionary.py | 1 - spacy/lang/el/norm_exceptions.py | 1 - spacy/lang/el/stop_words.py | 1 - 
spacy/lang/en/examples.py | 1 - spacy/lang/en/norm_exceptions.py | 1 - spacy/lang/en/stop_words.py | 1 - spacy/lang/es/examples.py | 1 - spacy/lang/es/stop_words.py | 1 - spacy/lang/et/stop_words.py | 1 - spacy/lang/fa/examples.py | 1 - spacy/lang/fa/generate_verbs_exc.py | 1 - spacy/lang/fa/stop_words.py | 1 - spacy/lang/fi/stop_words.py | 1 - spacy/lang/fr/examples.py | 1 - spacy/lang/fr/stop_words.py | 1 - spacy/lang/ga/irish_morphology_helpers.py | 1 - spacy/lang/he/examples.py | 1 - spacy/lang/hi/examples.py | 1 - spacy/lang/hi/stop_words.py | 1 - spacy/lang/hu/examples.py | 1 - spacy/lang/hu/stop_words.py | 1 - spacy/lang/id/examples.py | 1 - spacy/lang/is/stop_words.py | 1 - spacy/lang/it/examples.py | 1 - spacy/lang/it/stop_words.py | 1 - spacy/lang/ja/examples.py | 1 - spacy/lang/kn/stop_words.py | 1 - spacy/lang/lt/examples.py | 1 - spacy/lang/lv/stop_words.py | 1 - spacy/lang/mr/stop_words.py | 1 - spacy/lang/nb/examples.py | 1 - spacy/lang/nl/examples.py | 1 - spacy/lang/norm_exceptions.py | 1 - spacy/lang/pl/examples.py | 1 - spacy/lang/pt/examples.py | 1 - spacy/lang/pt/stop_words.py | 1 - spacy/lang/ro/examples.py | 1 - spacy/lang/ru/examples.py | 1 - spacy/lang/ru/norm_exceptions.py | 1 - spacy/lang/si/examples.py | 1 - spacy/lang/si/stop_words.py | 1 - spacy/lang/sk/stop_words.py | 1 - spacy/lang/sl/stop_words.py | 1 - spacy/lang/sq/examples.py | 1 - spacy/lang/sq/stop_words.py | 1 - spacy/lang/sr/examples.py | 1 - spacy/lang/sr/norm_exceptions.py | 1 - spacy/lang/sr/stop_words.py | 1 - spacy/lang/sv/examples.py | 1 - spacy/lang/sv/stop_words.py | 1 - spacy/lang/ta/examples.py | 1 - spacy/lang/ta/stop_words.py | 1 - spacy/lang/te/examples.py | 1 - spacy/lang/th/norm_exceptions.py | 1 - spacy/lang/tokenizer_exceptions.py | 2 +- spacy/lang/tr/examples.py | 1 - spacy/lang/uk/examples.py | 1 - spacy/lang/ur/examples.py | 1 - spacy/lang/xx/__init__.py | 1 - spacy/lang/xx/examples.py | 1 - spacy/lang/yo/examples.py | 1 - spacy/lang/zh/examples.py | 1 - 
spacy/language.py | 2 +- spacy/lexeme.pyx | 2 +- spacy/ml/_character_embed.py | 8 ++-- spacy/ml/_layers.py | 17 ++++----- spacy/ml/component_models.py | 23 ++++++----- spacy/ml/extract_ngrams.py | 7 +--- spacy/ml/tok2vec.py | 38 +++++++++++-------- spacy/pipeline/hooks.py | 5 +-- spacy/pipeline/morphologizer.pyx | 4 +- spacy/pipeline/pipes.pyx | 8 ++-- spacy/pipeline/tok2vec.py | 17 ++++++--- spacy/syntax/_parser_model.pyx | 4 +- spacy/syntax/nn_parser.pyx | 18 ++++----- spacy/syntax/nonproj.pyx | 2 +- spacy/tests/doc/test_doc_api.py | 15 +++++++- spacy/tests/doc/test_morphanalysis.py | 4 +- spacy/tests/doc/test_retokenize_merge.py | 7 +++- spacy/tests/lang/ar/test_text.py | 1 - spacy/tests/lang/en/test_indices.py | 1 - spacy/tests/lang/fi/test_tokenizer.py | 12 ++---- spacy/tests/lang/hu/test_tokenizer.py | 16 ++++---- spacy/tests/lang/sv/test_text.py | 1 - spacy/tests/lang/zh/test_text.py | 1 - .../tests/morphology/test_morph_converters.py | 1 - spacy/tests/morphology/test_morph_features.py | 11 +++++- spacy/tests/parser/test_add_label.py | 3 +- spacy/tests/parser/test_ner.py | 2 +- spacy/tests/parser/test_preset_sbd.py | 3 +- spacy/tests/pipeline/test_entity_ruler.py | 7 +--- spacy/tests/pipeline/test_tagger.py | 1 - spacy/tests/regression/test_issue1501-2000.py | 4 +- spacy/tests/regression/test_issue3611.py | 5 +-- spacy/tests/regression/test_issue4030.py | 5 +-- spacy/tests/test_architectures.py | 2 +- spacy/tests/test_cli.py | 18 ++++++--- spacy/tests/tokenizer/test_exceptions.py | 4 +- spacy/tests/tokenizer/test_tokenizer.py | 14 ++++++- spacy/tokens/_retokenize.pyx | 2 +- spacy/tokens/_serialize.py | 2 +- spacy/tokens/doc.pyx | 3 +- spacy/tokens/span.pyx | 2 +- spacy/tokens/token.pyx | 2 +- spacy/util.py | 16 +++++--- spacy/vectors.pyx | 3 +- spacy/vocab.pyx | 2 +- 127 files changed, 219 insertions(+), 275 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 4a311ec86..2c063ce24 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ 
-5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed") warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # These are imported as part of the API -from thinc.util import prefer_gpu, require_gpu +from thinc.api import prefer_gpu, require_gpu from . import pipeline from .cli.info import info as cli_info diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 0f7677fd2..585eaea51 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,7 +4,7 @@ from .link import link # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 from .train import train # noqa: F401 -from .train_from_config import train_from_config_cli # noqa: F401 +from .train_from_config import train_from_config_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 4bcafce24..1705bf446 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -192,11 +192,7 @@ def debug_data( has_ws_ents_error = True if gold_train_data["punct_ents"]: - msg.warn( - "{} entity span(s) with punctuation".format( - gold_train_data["punct_ents"] - ) - ) + msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation") has_punct_ents_warning = True for label in new_labels: diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 2cef378c0..690e3107d 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -4,14 +4,12 @@ import time import re from collections import Counter from pathlib import Path -from thinc.layers import Linear, Maxout -from thinc.util import prefer_gpu +from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu +from thinc.api import CosineDistance, L2Distance from wasabi import msg import srsly -from thinc.layers import chain, list2array -from thinc.loss import CosineDistance, 
L2Distance -from spacy.gold import Example +from ..gold import Example from ..errors import Errors from ..tokens import Doc from ..attrs import ID, HEAD @@ -85,7 +83,7 @@ def pretrain( ) if not output_dir.exists(): output_dir.mkdir() - msg.good("Created output directory: {}".format(output_dir)) + msg.good(f"Created output directory: {output_dir}") srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index adae91ff9..d8514095b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,7 +1,7 @@ import os import tqdm from pathlib import Path -from thinc.backends import use_ops +from thinc.api import use_ops from timeit import default_timer as timer import shutil import srsly @@ -89,7 +89,7 @@ def train( ) if not output_path.exists(): output_path.mkdir() - msg.good("Created output directory: {}".format(output_path)) + msg.good(f"Created output directory: {output_path}") tag_map = {} if tag_map_path is not None: @@ -125,17 +125,17 @@ def train( msg.text(f"Training pipeline: {pipeline}") disabled_pipes = None pipes_added = False - msg.text("Training pipeline: {}".format(pipeline)) + msg.text(f"Training pipeline: {pipeline}") if use_gpu >= 0: activated_gpu = None try: activated_gpu = set_gpu(use_gpu) except Exception as e: - msg.warn("Exception: {}".format(e)) + msg.warn(f"Exception: {e}") if activated_gpu is not None: - msg.text("Using GPU: {}".format(use_gpu)) + msg.text(f"Using GPU: {use_gpu}") else: - msg.warn("Unable to activate GPU: {}".format(use_gpu)) + msg.warn(f"Unable to activate GPU: {use_gpu}") msg.text("Using CPU only") use_gpu = -1 if base_model: @@ -158,11 +158,11 @@ def train( "positive_label": textcat_positive_label, } if pipe not in nlp.pipe_names: - msg.text("Adding component to base model '{}'".format(pipe)) + msg.text(f"Adding component to base model '{pipe}'") nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) pipes_added = True elif 
replace_components: - msg.text("Replacing component from base model '{}'".format(pipe)) + msg.text(f"Replacing component from base model '{pipe}'") nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg)) pipes_added = True else: @@ -180,7 +180,7 @@ def train( f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}", exits=1, ) - msg.text("Extending component from base model '{}'".format(pipe)) + msg.text(f"Extending component from base model '{pipe}'") disabled_pipes = nlp.disable_pipes( [p for p in nlp.pipe_names if p not in pipeline] ) @@ -377,7 +377,7 @@ def train( msg.warn( "Did you provide the same parameters during 'train' as during 'pretrain'?" ) - msg.fail("Original error message: {}".format(e), exits=1) + msg.fail(f"Original error message: {e}", exits=1) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. @@ -504,11 +504,7 @@ def train( ) break except Exception as e: - msg.warn( - "Aborting and saving the final best model. Encountered exception: {}".format( - e - ) - ) + msg.warn(f"Aborting and saving final best model. Encountered exception: {e}") finally: best_pipes = nlp.pipe_names if disabled_pipes: diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 0488dd04c..9150da356 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,19 +1,20 @@ +from typing import Optional, Dict, List, Union, Sequence import plac -from thinc.util import require_gpu from wasabi import msg from pathlib import Path import thinc import thinc.schedules -from thinc.model import Model -from spacy.gold import GoldCorpus -import spacy -from spacy.pipeline.tok2vec import Tok2VecListener -from typing import Optional, Dict, List, Union, Sequence +from thinc.api import Model from pydantic import BaseModel, FilePath, StrictInt import tqdm -from ..ml import component_models -from .. import util +# TODO: relative imports? 
+import spacy +from spacy.gold import GoldCorpus +from spacy.pipeline.tok2vec import Tok2VecListener +from spacy.ml import component_models +from spacy import util + registry = util.registry @@ -153,10 +154,9 @@ def create_tb_parser_model( hidden_width: StrictInt = 64, maxout_pieces: StrictInt = 3, ): - from thinc.layers import Linear, chain, list2array + from thinc.api import Linear, chain, list2array, use_ops, zero_init from spacy.ml._layers import PrecomputableAffine from spacy.syntax._parser_model import ParserModel - from thinc.api import use_ops, zero_init token_vector_width = tok2vec.get_dim("nO") tok2vec = chain(tok2vec, list2array()) @@ -221,13 +221,9 @@ def train_from_config_cli( def train_from_config( - config_path, - data_paths, - raw_text=None, - meta_path=None, - output_path=None, + config_path, data_paths, raw_text=None, meta_path=None, output_path=None, ): - msg.info("Loading config from: {}".format(config_path)) + msg.info(f"Loading config from: {config_path}") config = util.load_from_config(config_path, create_objects=True) use_gpu = config["training"]["use_gpu"] if use_gpu >= 0: @@ -241,9 +237,7 @@ def train_from_config( msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) msg.info("Initializing the nlp pipeline") - nlp.begin_training( - lambda: corpus.train_examples, device=use_gpu - ) + nlp.begin_training(lambda: corpus.train_examples, device=use_gpu) train_batches = create_train_batches(nlp, corpus, config["training"]) evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"]) @@ -260,7 +254,7 @@ def train_from_config( config["training"]["eval_frequency"], ) - msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate)) + msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") print_row = setup_printer(config) try: @@ -414,7 +408,7 @@ def subdivide_batch(batch): def setup_printer(config): score_cols = config["training"]["scores"] score_widths = [max(len(col), 6) for col in score_cols] - loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]] + loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]] loss_widths = [max(len(col), 8) for col in loss_cols] table_header = ["#"] + loss_cols + score_cols + ["Score"] table_header = [col.upper() for col in table_header] diff --git a/spacy/compat.py b/spacy/compat.py index 6fa49353e..8c5c2930b 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -30,7 +30,7 @@ try: except ImportError: cupy = None -from thinc.optimizers import Optimizer # noqa: F401 +from thinc.api import Optimizer # noqa: F401 pickle = pickle copy_reg = copy_reg diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index d6970aa2f..a721ce480 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -1,4 +1,3 @@ - # Setting explicit height and max-width: none on the SVG is required for # Jupyter to render it properly in a cell diff --git a/spacy/glossary.py b/spacy/glossary.py index 5e7e531a9..938a575cd 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -1,4 +1,3 @@ - def explain(term): """Get a description for a given POS tag, dependency label or entity type. 
diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 49dba16df..aea691130 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -1,6 +1,6 @@ from cymem.cymem cimport Pool -from spacy.tokens import Doc +from .tokens import Doc from .typedefs cimport attr_t from .syntax.transition_system cimport Transition @@ -65,5 +65,3 @@ cdef class Example: cdef public TokenAnnotation token_annotation cdef public DocAnnotation doc_annotation cdef public object goldparse - - diff --git a/spacy/kb.pxd b/spacy/kb.pxd index d5aa382b1..518ce0f4e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t from libc.stdio cimport FILE -from spacy.vocab cimport Vocab +from .vocab cimport Vocab from .typedefs cimport hash_t from .structs cimport KBEntryC, AliasC @@ -113,7 +113,7 @@ cdef class KnowledgeBase: return new_index cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil: - """ + """ Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. 
https://github.com/explosion/preshed/issues/17 @@ -169,4 +169,3 @@ cdef class Reader: cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 cdef int _read(self, void* value, size_t size) except -1 - diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py index dfd144de9..4b5a04a5e 100644 --- a/spacy/lang/af/stop_words.py +++ b/spacy/lang/af/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/stopwords-iso/stopwords-af STOP_WORDS = set( diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py index 45a252bc9..aae7692a2 100644 --- a/spacy/lang/bg/stop_words.py +++ b/spacy/lang/bg/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/Alir3z4/stop-words STOP_WORDS = set( diff --git a/spacy/lang/bn/examples.py b/spacy/lang/bn/examples.py index 051e59d84..c3be4c556 100644 --- a/spacy/lang/bn/examples.py +++ b/spacy/lang/bn/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py index 6bcd06b37..bf38e3254 100644 --- a/spacy/lang/bn/stop_words.py +++ b/spacy/lang/bn/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে diff --git a/spacy/lang/ca/examples.py b/spacy/lang/ca/examples.py index 3fbf1fb0a..ae6aa3e24 100644 --- a/spacy/lang/ca/examples.py +++ b/spacy/lang/ca/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py index e8171a7e5..70aab030b 100644 --- a/spacy/lang/cs/stop_words.py +++ b/spacy/lang/cs/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/Alir3z4/stop-words STOP_WORDS = set( diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py index e5c6448f0..80b2b925b 100644 --- a/spacy/lang/da/examples.py +++ b/spacy/lang/da/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py index 530ece629..735d1c316 100644 --- a/spacy/lang/de/examples.py +++ b/spacy/lang/de/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py index cc5aa0f3c..f52687eb9 100644 --- a/spacy/lang/de/stop_words.py +++ b/spacy/lang/de/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ á a ab aber ach acht achte achten achter achtes ag alle allein allem allen @@ -44,7 +43,7 @@ kleines kommen kommt können könnt konnte könnte konnten kurz lang lange leicht leider lieber los machen macht machte mag magst man manche manchem manchen mancher manches mehr -mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten +mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten mögen möglich mögt morgen muss muß müssen musst müsst musste mussten na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter diff --git a/spacy/lang/el/get_pos_from_wiktionary.py b/spacy/lang/el/get_pos_from_wiktionary.py index 01deb23a2..369973cc0 100644 --- a/spacy/lang/el/get_pos_from_wiktionary.py +++ b/spacy/lang/el/get_pos_from_wiktionary.py @@ -1,4 +1,3 @@ - def get_pos_from_wiktionary(): import re from gensim.corpora.wikicorpus import extract_pages diff --git a/spacy/lang/el/norm_exceptions.py b/spacy/lang/el/norm_exceptions.py index d540aae2c..aa774c19b 100644 --- 
a/spacy/lang/el/norm_exceptions.py +++ b/spacy/lang/el/norm_exceptions.py @@ -1,4 +1,3 @@ - # These exceptions are used to add NORM values based on a token's ORTH value. # Norms are only set if no alternative is provided in the tokenizer exceptions. diff --git a/spacy/lang/el/stop_words.py b/spacy/lang/el/stop_words.py index 8484826d1..7c436219f 100644 --- a/spacy/lang/el/stop_words.py +++ b/spacy/lang/el/stop_words.py @@ -1,4 +1,3 @@ - # Stop words # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0 STOP_WORDS = set( diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py index 0363a45e7..2cca9e05f 100644 --- a/spacy/lang/en/examples.py +++ b/spacy/lang/en/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py index 431d9c049..4125cd37b 100644 --- a/spacy/lang/en/norm_exceptions.py +++ b/spacy/lang/en/norm_exceptions.py @@ -1,4 +1,3 @@ - _exc = { # Slang and abbreviations "cos": "because", diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index 4573c9411..1ca5cbc16 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -1,4 +1,3 @@ - # Stop words STOP_WORDS = set( """ diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py index 1c1ad631b..a1db41a16 100644 --- a/spacy/lang/es/examples.py +++ b/spacy/lang/es/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py index 3d46a88cb..004df4fca 100644 --- a/spacy/lang/es/stop_words.py +++ b/spacy/lang/es/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py index 3b600a158..e1da1f14d 100644 --- a/spacy/lang/et/stop_words.py +++ b/spacy/lang/et/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/stopwords-iso/stopwords-et STOP_WORDS = set( diff --git a/spacy/lang/fa/examples.py b/spacy/lang/fa/examples.py index d89feb6c8..9c6fb0345 100644 --- a/spacy/lang/fa/examples.py +++ b/spacy/lang/fa/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py index 61586dc3f..62094c6de 100644 --- a/spacy/lang/fa/generate_verbs_exc.py +++ b/spacy/lang/fa/generate_verbs_exc.py @@ -1,4 +1,3 @@ - verb_roots = """ #هست آخت#آهنج diff --git a/spacy/lang/fa/stop_words.py b/spacy/lang/fa/stop_words.py index 372422b67..f462f2e7a 100644 --- a/spacy/lang/fa/stop_words.py +++ b/spacy/lang/fa/stop_words.py @@ -1,4 +1,3 @@ - # Stop words from HAZM package STOP_WORDS = set( """ diff --git a/spacy/lang/fi/stop_words.py b/spacy/lang/fi/stop_words.py index 642cfc369..8e8dcfa56 100644 --- a/spacy/lang/fi/stop_words.py +++ b/spacy/lang/fi/stop_words.py @@ -1,4 +1,3 @@ - # Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt # Reformatted with some minor corrections STOP_WORDS = set( diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py index 57d57f4a6..a74a62204 100644 --- a/spacy/lang/fr/examples.py +++ b/spacy/lang/fr/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index 9c12e49a3..a331f3c0f 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py index c8cd36835..d606da975 100644 --- a/spacy/lang/ga/irish_morphology_helpers.py +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -1,4 +1,3 @@ - # fmt: off consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"] broad_vowels = ["a", "á", "o", "ó", "u", "ú"] diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py index 29075c7d4..d54d2a145 100644 --- a/spacy/lang/he/examples.py +++ b/spacy/lang/he/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/hi/examples.py b/spacy/lang/hi/examples.py index 7639ff940..ecb0b328c 100644 --- a/spacy/lang/hi/examples.py +++ b/spacy/lang/hi/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py index 142fc6f47..475b07da1 100644 --- a/spacy/lang/hi/stop_words.py +++ b/spacy/lang/hi/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6 STOP_WORDS = set( diff --git a/spacy/lang/hu/examples.py b/spacy/lang/hu/examples.py index b60f752ec..711a438bd 100644 --- a/spacy/lang/hu/examples.py +++ b/spacy/lang/hu/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/hu/stop_words.py b/spacy/lang/hu/stop_words.py index 024af68f4..e39a26d35 100644 --- a/spacy/lang/hu/stop_words.py +++ b/spacy/lang/hu/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben diff --git a/spacy/lang/id/examples.py b/spacy/lang/id/examples.py index 2ce46ce5a..1069232ff 100644 --- a/spacy/lang/id/examples.py +++ b/spacy/lang/id/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py index 5b3ff2f5a..917fb6df4 100644 --- a/spacy/lang/is/stop_words.py +++ b/spacy/lang/is/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/Xangis/extra-stopwords STOP_WORDS = set( diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py index 30327bd14..506721276 100644 --- a/spacy/lang/it/examples.py +++ b/spacy/lang/it/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index 5cd1af137..e97613912 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl diff --git a/spacy/lang/ja/examples.py b/spacy/lang/ja/examples.py index 1d532ad77..c3a011862 100644 --- a/spacy/lang/ja/examples.py +++ b/spacy/lang/ja/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py index cfeb0e69d..dba9740af 100644 --- a/spacy/lang/kn/stop_words.py +++ b/spacy/lang/kn/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ ಹಲವು diff --git a/spacy/lang/lt/examples.py b/spacy/lang/lt/examples.py index b2889114c..eaf941f1a 100644 --- a/spacy/lang/lt/examples.py +++ b/spacy/lang/lt/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py index a9612f949..2685c2430 100644 --- a/spacy/lang/lv/stop_words.py +++ b/spacy/lang/lv/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/stopwords-iso/stopwords-lv STOP_WORDS = set( diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py index 0d7501461..9b0cee951 100644 --- a/spacy/lang/mr/stop_words.py +++ b/spacy/lang/mr/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json STOP_WORDS = set( """ diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py index 89e265951..b1a63ad74 100644 --- a/spacy/lang/nb/examples.py +++ b/spacy/lang/nb/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/nl/examples.py b/spacy/lang/nl/examples.py index fcefa9d62..8c8c50c60 100644 --- a/spacy/lang/nl/examples.py +++ b/spacy/lang/nl/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py index c194f05c7..f35f613b1 100644 --- a/spacy/lang/norm_exceptions.py +++ b/spacy/lang/norm_exceptions.py @@ -1,4 +1,3 @@ - # These exceptions are used to add NORM values based on a token's ORTH value. # Individual languages can also add their own exceptions and overwrite them - # for example, British vs. American spelling in English. 
diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py index 6eabe1843..b1ea5880f 100644 --- a/spacy/lang/pl/examples.py +++ b/spacy/lang/pl/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py index 7427f8b25..13f3512cf 100644 --- a/spacy/lang/pt/examples.py +++ b/spacy/lang/pt/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py index 8065fcda7..ff45ad3a7 100644 --- a/spacy/lang/pt/stop_words.py +++ b/spacy/lang/pt/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes diff --git a/spacy/lang/ro/examples.py b/spacy/lang/ro/examples.py index d472f0d6d..bfa258ffc 100644 --- a/spacy/lang/ro/examples.py +++ b/spacy/lang/ro/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ru/examples.py b/spacy/lang/ru/examples.py index 34cf5a1eb..adb007625 100644 --- a/spacy/lang/ru/examples.py +++ b/spacy/lang/ru/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py index c5d725031..0975bf5b8 100644 --- a/spacy/lang/ru/norm_exceptions.py +++ b/spacy/lang/ru/norm_exceptions.py @@ -1,4 +1,3 @@ - _exc = { # Slang "прив": "привет", diff --git a/spacy/lang/si/examples.py b/spacy/lang/si/examples.py index 0ff00e76e..b34051d00 100644 --- a/spacy/lang/si/examples.py +++ b/spacy/lang/si/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py index 49723c860..bde662bf7 100644 --- a/spacy/lang/si/stop_words.py +++ b/spacy/lang/si/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ අතර diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py index 269bdc58b..017e7beef 100644 --- a/spacy/lang/sk/stop_words.py +++ b/spacy/lang/sk/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/Ardevop-sk/stopwords-sk STOP_WORDS = set( diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index c8596ad0b..6fb01a183 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/stopwords-iso/stopwords-sl # TODO: probably needs to be tidied up – the list seems to have month names in # it, which shouldn't be considered stop words. diff --git a/spacy/lang/sq/examples.py b/spacy/lang/sq/examples.py index e1075f70a..06ed20fa1 100644 --- a/spacy/lang/sq/examples.py +++ b/spacy/lang/sq/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py index 58ee87d05..f2b1a4f4a 100644 --- a/spacy/lang/sq/stop_words.py +++ b/spacy/lang/sq/stop_words.py @@ -1,4 +1,3 @@ - # Source: https://github.com/andrixh/index-albanian STOP_WORDS = set( diff --git a/spacy/lang/sr/examples.py b/spacy/lang/sr/examples.py index 1ac867f4c..ec7f57ced 100644 --- a/spacy/lang/sr/examples.py +++ b/spacy/lang/sr/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py index add8350a0..723ab84c0 100644 --- a/spacy/lang/sr/norm_exceptions.py +++ b/spacy/lang/sr/norm_exceptions.py @@ -1,4 +1,3 @@ - _exc = { # Slang "ћале": "отац", diff --git a/spacy/lang/sr/stop_words.py b/spacy/lang/sr/stop_words.py index 488c82a75..5df5509d2 100644 --- a/spacy/lang/sr/stop_words.py +++ b/spacy/lang/sr/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ а diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py index 98eee700b..bc6cd7a54 100644 --- a/spacy/lang/sv/examples.py +++ b/spacy/lang/sv/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/sv/stop_words.py b/spacy/lang/sv/stop_words.py index 4d933a76d..2422b2a9e 100644 --- a/spacy/lang/sv/stop_words.py +++ b/spacy/lang/sv/stop_words.py @@ -1,4 +1,3 @@ - STOP_WORDS = set( """ aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 2590163cb..a53227220 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ta/stop_words.py b/spacy/lang/ta/stop_words.py index 83410d65e..abbff949d 100644 --- a/spacy/lang/ta/stop_words.py +++ b/spacy/lang/ta/stop_words.py @@ -1,4 +1,3 @@ - # Stop words STOP_WORDS = set( diff --git a/spacy/lang/te/examples.py b/spacy/lang/te/examples.py index 6162b231e..cff7d3cb0 100644 --- a/spacy/lang/te/examples.py +++ b/spacy/lang/te/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py index 98b878308..b8ddbab16 100644 --- a/spacy/lang/th/norm_exceptions.py +++ b/spacy/lang/th/norm_exceptions.py @@ -1,4 +1,3 @@ - _exc = { # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์) "สนุ๊กเกอร์": "สนุกเกอร์", diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index fa4e347fd..ee58a7b09 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -34,7 +34,7 @@ URL_PATTERN = ( r"|" # host & domain names # mods: match is case-sensitive, so include [A-Z] - "(?:" + "(?:" # noqa: E131 "(?:" "[A-Za-z0-9\u00a1-\uffff]" "[A-Za-z0-9\u00a1-\uffff_-]{0,62}" diff --git a/spacy/lang/tr/examples.py b/spacy/lang/tr/examples.py index a14d87a46..dfb324a4e 100644 --- a/spacy/lang/tr/examples.py +++ b/spacy/lang/tr/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.tr.examples import sentences diff --git a/spacy/lang/uk/examples.py b/spacy/lang/uk/examples.py index d17768ea6..f75d44488 100644 --- a/spacy/lang/uk/examples.py +++ b/spacy/lang/uk/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ur/examples.py b/spacy/lang/ur/examples.py index 7024483b5..e55b337be 100644 --- a/spacy/lang/ur/examples.py +++ b/spacy/lang/ur/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index 2af650703..347c624fd 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -1,4 +1,3 @@ - from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py index 15f5c4ff8..8d63c3c20 100644 --- a/spacy/lang/xx/examples.py +++ b/spacy/lang/xx/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/yo/examples.py b/spacy/lang/yo/examples.py index 9b875d09e..0a610f125 100644 --- a/spacy/lang/yo/examples.py +++ b/spacy/lang/yo/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/zh/examples.py b/spacy/lang/zh/examples.py index d0715eb0d..8be1336d2 100644 --- a/spacy/lang/zh/examples.py +++ b/spacy/lang/zh/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/language.py b/spacy/language.py index 3aaf0b327..1c6014cec 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -4,7 +4,7 @@ import weakref import functools from contextlib import contextmanager from copy import copy, deepcopy -from thinc.backends import get_current_ops +from thinc.api import get_current_ops import srsly import multiprocessing as mp from itertools import chain, cycle diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 1292a46bd..5910ebfe1 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -6,7 +6,7 @@ cimport numpy as np np.import_array() import numpy -from thinc.util import get_array_module +from thinc.api import get_array_module from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index 2ff67746f..b366f67c6 100644 --- a/spacy/ml/_character_embed.py +++ b/spacy/ml/_character_embed.py @@ -3,18 +3,20 @@ from thinc.api import Model def CharacterEmbed(nM, nC): # nM: Number of dimensions per character. nC: Number of characters. 
- nO = nM*nC if (nM is not None and nC is not None) else None + nO = nM * nC if (nM is not None and nC is not None) else None return Model( "charembed", forward, init=init, dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256}, - params={"E": None} + params={"E": None}, ).initialize() def init(model, X=None, Y=None): - vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")) + vectors_table = model.ops.alloc3f( + model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM") + ) model.set_param("E", vectors_table) diff --git a/spacy/ml/_layers.py b/spacy/ml/_layers.py index e6aa798e7..7e9150d8b 100644 --- a/spacy/ml/_layers.py +++ b/spacy/ml/_layers.py @@ -1,5 +1,4 @@ -from thinc.model import Model -from thinc.api import normal_init +from thinc.api import Model, normal_init def PrecomputableAffine(nO, nI, nF, nP): @@ -20,9 +19,7 @@ def forward(model, X, is_train): nP = model.get_dim("nP") nI = model.get_dim("nI") W = model.get_param("W") - Yf = model.ops.gemm( - X, W.reshape((nF * nO * nP, nI)), trans2=True - ) + Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True) Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) Yf = model.ops.xp.vstack((model.get_param("pad"), Yf)) @@ -37,14 +34,14 @@ def forward(model, X, is_train): # for b in range(nB): # for f in range(nF): # dYf[b, ids[b, f]] += dY[b] - # + # # However, we avoid building that array for efficiency -- and just pass # in the indices. 
dY, ids = dY_ids assert dY.ndim == 3 assert dY.shape[1] == nO, dY.shape assert dY.shape[2] == nP, dY.shape - nB = dY.shape[0] + # nB = dY.shape[0] model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids)) Xf = X[ids] Xf = Xf.reshape((Xf.shape[0], nF * nI)) @@ -83,12 +80,12 @@ def _backprop_precomputable_affine_padding(model, dY, ids): # for f in range(nF): # if ids[b, f] < 0: # d_padding[0, f] += dY[b] - # + # # Which can be rewritten as: # # for b in range(nB): # d_pad[0, ids[b] < 0] += dY[b] - # + # # I don't know how to avoid the loop without building a whole array :(. # Cursed numpy. d_pad = model.ops.alloc((1, nF, nO, nP)) @@ -118,7 +115,7 @@ def init(model, X=None, Y=None): pad = model.ops.alloc4f(1, nF, nO, nP) ops = model.ops - W = normal_init(ops, W.shape, fan_in=nF*nI) + W = normal_init(ops, W.shape, fan_in=nF * nI) model.set_param("W", W) model.set_param("b", b) model.set_param("pad", pad) diff --git a/spacy/ml/component_models.py b/spacy/ml/component_models.py index a24c2bfce..8c694f950 100644 --- a/spacy/ml/component_models.py +++ b/spacy/ml/component_models.py @@ -9,7 +9,7 @@ from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued -from thinc.api import zero_init, glorot_uniform_init +from thinc.api import zero_init def build_text_classifier(arch, config): @@ -33,10 +33,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO")) else: # TODO: experiment with init_w=zero_init - output_layer = ( - Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) - >> Logistic() - ) + output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic() model = tok2vec >> list2ragged() >> reduce_mean() >> 
output_layer model.set_ref("tok2vec", tok2vec) model.set_dim("nO", nr_class) @@ -149,13 +146,21 @@ def Tok2Vec( with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0) if subword_features: - prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0) - suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0) - shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0) + prefix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0 + ) + suffix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0 + ) + shape = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0 + ) else: prefix, suffix, shape = (None, None, None) if pretrained_vectors is not None: - glove = StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0) + glove = StaticVectors( + vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0 + ) if subword_features: embed = uniqued( diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py index 1ec5b5fc1..d4195b9a4 100644 --- a/spacy/ml/extract_ngrams.py +++ b/spacy/ml/extract_ngrams.py @@ -1,5 +1,5 @@ import numpy -from thinc.model import Model +from thinc.api import Model from ..attrs import LOWER @@ -26,9 +26,7 @@ def forward(self, docs, is_train: bool): # The dtype here matches what thinc is expecting -- which differs per # platform (by int definition). This should be fixed once the problem # is fixed on Thinc's side. 
- lengths = self.ops.asarray( - [arr.shape[0] for arr in batch_keys], dtype=numpy.int_ - ) + lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_) batch_keys = self.ops.xp.concatenate(batch_keys) batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f") @@ -36,4 +34,3 @@ def forward(self, docs, is_train: bool): return dY return (batch_keys, batch_vals, lengths), backprop - diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py index 102b88604..5e51bc47a 100644 --- a/spacy/ml/tok2vec.py +++ b/spacy/ml/tok2vec.py @@ -1,11 +1,8 @@ -from thinc.layers import chain, clone, concatenate, with_array, uniqued -from thinc.model import Model -from thinc.layers import noop, with_padded -from thinc.layers import Maxout, expand_window -from thinc.layers import HashEmbed, StaticVectors -from thinc.layers import residual, LayerNorm, FeatureExtractor +from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop +from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors +from thinc.api import residual, LayerNorm, FeatureExtractor -from spacy.ml import _character_embed +from ..ml import _character_embed from ..util import make_layer, registry @@ -93,8 +90,10 @@ def MaxoutWindowEncoder(config): nW = config["window_size"] nP = config["pieces"] depth = config["depth"] - - cnn = expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True) + cnn = ( + expand_window(window_size=nW), + Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True), + ) model = clone(residual(cnn), depth) model.set_dim("nO", nO) model.attrs["receptive_field"] = nW * depth @@ -103,13 +102,16 @@ def MaxoutWindowEncoder(config): @registry.architectures.register("spacy.MishWindowEncoder.v1") def MishWindowEncoder(config): - from thinc.layers import Mish + from thinc.api import Mish nO = config["width"] nW = config["window_size"] depth = config["depth"] - - cnn = 
chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO)) + cnn = chain( + expand_window(window_size=nW), + Mish(nO=nO, nI=nO * ((nW * 2) + 1)), + LayerNorm(nO), + ) model = clone(residual(cnn), depth) model.set_dim("nO", nO) return model @@ -118,14 +120,20 @@ def MishWindowEncoder(config): @registry.architectures.register("spacy.PretrainedVectors.v1") def PretrainedVectors(config): # TODO: actual vectors instead of name - return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0) + return StaticVectors( + vectors=config["vectors_name"], + nO=config["width"], + column=config["column"], + dropout=0.0, + ) @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") def TorchBiLSTMEncoder(config): import torch.nn - # TODO FIX - from thinc.layers import PyTorchRNNWrapper + + # TODO: FIX + from thinc.api import PyTorchRNNWrapper width = config["width"] depth = config["depth"] diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index 00c328e81..d48b04bd1 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -1,4 +1,4 @@ -from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity +from thinc.api import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity from .pipes import Pipe from ..language import component @@ -63,8 +63,7 @@ class SimilarityHook(Pipe): @classmethod def Model(cls, length): return siamese( - concatenate(reduce_max(), reduce_mean()), - CauchySimilarity(length * 2) + concatenate(reduce_max(), reduce_mean()), CauchySimilarity(length * 2) ) def __call__(self, doc): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 7b9e4b04e..999132b35 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -3,8 +3,8 @@ from collections import defaultdict import numpy cimport numpy as np -from thinc.layers import chain, list2array -from thinc.util import 
to_categorical, copy_array, get_array_module +from thinc.api import chain, list2array, to_categorical, get_array_module +from thinc.util import copy_array from .. import util from .pipes import Pipe diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index c77281b2c..ad75d2e78 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,11 +3,9 @@ import numpy import srsly import random -from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array -from thinc.initializers import zero_init -from thinc.loss import CosineDistance -from thinc.util import to_categorical, get_array_module -from thinc.model import set_dropout_rate +from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array +from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module +from thinc.api import set_dropout_rate from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 9857c87eb..8290468cf 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,3 +1,5 @@ +from thinc.api import Model, set_dropout_rate + from .pipes import Pipe from ..gold import Example from ..tokens import Doc @@ -5,8 +7,6 @@ from ..vocab import Vocab from ..language import component from ..util import link_vectors_to_models, minibatch, registry, eg2doc -from thinc.model import Model, set_dropout_rate - @component("tok2vec", assigns=["doc.tensor"]) class Tok2Vec(Pipe): @@ -39,7 +39,9 @@ class Tok2Vec(Pipe): self.listeners = [] def create_listener(self): - listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO")) + listener = Tok2VecListener( + upstream_name="tok2vec", width=self.model.get_dim("nO") + ) self.listeners.append(listener) def add_listener(self, listener): @@ -112,10 +114,10 @@ class Tok2Vec(Pipe): docs = [docs] set_dropout_rate(self.model, drop) tokvecs, bp_tokvecs = self.model.begin_update(docs) - + def 
capture_losses(d_tokvecs): """Accumulate tok2vec loss before doing backprop.""" - l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs) + l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs) if self.name in losses: losses[self.name] += l2_loss / len(d_tokvecs) else: @@ -133,7 +135,9 @@ class Tok2Vec(Pipe): def get_loss(self, docs, golds, scores): pass - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): + def begin_training( + self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs + ): """Allocate models and pre-process training data get_examples (function): Function returning example training data. @@ -151,6 +155,7 @@ class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, for instance from a component earlier in the pipeline. """ + name = "tok2vec-listener" def __init__(self, upstream_name, width): diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index cb8e1d127..442233f19 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -11,9 +11,7 @@ from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc from cymem.cymem cimport Pool from thinc.extra.search cimport Beam -from thinc.layers import Linear -from thinc.model import Model -from thinc.backends import CupyOps, NumpyOps, use_ops +from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops from thinc.backends.linalg cimport Vec, VecVec cimport blis.cy diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 8e55d3873..cf57e1cf6 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -1,11 +1,8 @@ # cython: infer_types=True # cython: cdivision=True # cython: boundscheck=False -import numpy cimport cython.parallel -import numpy.random cimport numpy as np -from itertools import islice from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from 
libc.math cimport exp @@ -14,15 +11,16 @@ from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free from cymem.cymem cimport Pool from thinc.extra.search cimport Beam -from thinc.layers import chain, clone, Linear, list2array -from thinc.backends import NumpyOps, CupyOps, use_ops -from thinc.util import get_array_module from thinc.backends.linalg cimport Vec, VecVec -from thinc.initializers import zero_init -from thinc.model import set_dropout_rate -import srsly -from spacy.gold import Example +from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops +from thinc.api import get_array_module, zero_init, set_dropout_rate +from itertools import islice +import srsly +import numpy.random +import numpy + +from ..gold import Example from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index f024c1f05..27516ffd9 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -6,7 +6,7 @@ scheme. 
""" from copy import copy -from spacy.gold import Example +from ..gold import Example from ..tokens.doc cimport Doc, set_children_from_heads from ..errors import Errors diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index b7627b175..4323bb736 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,4 +1,3 @@ - import pytest import numpy from spacy.tokens import Doc, Span @@ -274,7 +273,19 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] - deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + deps = [ + "ROOT", + "dep", + "dep", + "dep", + "dep", + "dep", + "ROOT", + "dep", + "dep", + "dep", + "dep", + ] doc = Doc(en_vocab, words=words) for i, (dep, head) in enumerate(zip(deps, heads)): doc[i].dep_ = dep diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 82fb549ba..221b6f683 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -29,7 +29,9 @@ def test_morph_props(i_has): def test_morph_iter(i_has): assert set(i_has[0].morph) == set(["PronType=prs"]) - assert set(i_has[1].morph) == set(["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"]) + assert set(i_has[1].morph) == set( + ["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"] + ) def test_morph_get(i_has): diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 17bcd2c64..5e564d1f2 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -8,7 +8,12 @@ from ..util import get_doc def test_doc_retokenize_merge(en_tokenizer): text = "WKRO played songs by the beach boys all night" - attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"} + attrs = { 
+ "tag": "NAMED", + "lemma": "LEMMA", + "ent_type": "TYPE", + "morph": "Number=Plur", + } doc = en_tokenizer(text) assert len(doc) == 9 with doc.retokenize() as retokenizer: diff --git a/spacy/tests/lang/ar/test_text.py b/spacy/tests/lang/ar/test_text.py index f4a8cc1e3..c5ab376f1 100644 --- a/spacy/tests/lang/ar/test_text.py +++ b/spacy/tests/lang/ar/test_text.py @@ -1,4 +1,3 @@ - def test_ar_tokenizer_handles_long_text(ar_tokenizer): text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين. ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها، diff --git a/spacy/tests/lang/en/test_indices.py b/spacy/tests/lang/en/test_indices.py index d50c75fc5..93daeec30 100644 --- a/spacy/tests/lang/en/test_indices.py +++ b/spacy/tests/lang/en/test_indices.py @@ -1,4 +1,3 @@ - def test_en_simple_punct(en_tokenizer): text = "to walk, do foo" tokens = en_tokenizer(text) diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index 02aa63207..bcd62f239 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -19,16 +19,10 @@ HYPHENATED_TESTS = [ ABBREVIATION_INFLECTION_TESTS = [ ( "VTT:ssa ennen v:ta 2010 suoritetut mittaukset", - ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"] + ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"], ), - ( - "ALV:n osuus on 24 %.", - ["ALV:n", "osuus", "on", "24", "%", "."] - ), - ( - "Hiihtäjä oli kilpailun 14:s.", - ["Hiihtäjä", "oli", "kilpailun", "14:s", "."] - ) + ("ALV:n osuus on 24 %.", ["ALV:n", "osuus", "on", "24", "%", "."]), + ("Hiihtäjä oli kilpailun 14:s.", ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]), ] diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py index d0d8c2268..fd3acd0a0 100644 --- a/spacy/tests/lang/hu/test_tokenizer.py +++ b/spacy/tests/lang/hu/test_tokenizer.py @@ -294,12 +294,7 @@ WIKI_TESTS = [ ] EXTRA_TESTS = ( - 
DOT_TESTS - + QUOTE_TESTS - + NUMBER_TESTS - + HYPHEN_TESTS - + WIKI_TESTS - + TYPO_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS ) # normal: default tests + 10% of extra tests @@ -308,7 +303,14 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0]) # slow: remaining 90% of extra tests SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0] -TESTS.extend([pytest.param(x[0], x[1], marks=pytest.mark.slow()) if not isinstance(x[0], tuple) else x for x in SLOW_TESTS]) +TESTS.extend( + [ + pytest.param(x[0], x[1], marks=pytest.mark.slow()) + if not isinstance(x[0], tuple) + else x + for x in SLOW_TESTS + ] +) @pytest.mark.parametrize("text,expected_tokens", TESTS) diff --git a/spacy/tests/lang/sv/test_text.py b/spacy/tests/lang/sv/test_text.py index dc4911ab6..1e26c45bc 100644 --- a/spacy/tests/lang/sv/test_text.py +++ b/spacy/tests/lang/sv/test_text.py @@ -1,4 +1,3 @@ - def test_sv_tokenizer_handles_long_text(sv_tokenizer): text = """Det var så härligt ute på landet. 
Det var sommar, majsen var gul, havren grön, höet var uppställt i stackar nere vid den gröna ängen, och där gick storken på sina långa, diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py index d48feaee5..d9a65732e 100644 --- a/spacy/tests/lang/zh/test_text.py +++ b/spacy/tests/lang/zh/test_text.py @@ -1,4 +1,3 @@ - import pytest diff --git a/spacy/tests/morphology/test_morph_converters.py b/spacy/tests/morphology/test_morph_converters.py index 3bff4f924..9486cad45 100644 --- a/spacy/tests/morphology/test_morph_converters.py +++ b/spacy/tests/morphology/test_morph_converters.py @@ -1,4 +1,3 @@ -import pytest from spacy.morphology import Morphology diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index 0d8d7dea9..f644a5867 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -24,13 +24,20 @@ def test_add_morphology_with_int_ids(morphology): morphology.strings.add("gen") morphology.strings.add("Number") morphology.strings.add("sing") - morphology.add({get_string_id("Case"): get_string_id("gen"), get_string_id("Number"): get_string_id("sing")}) + morphology.add( + { + get_string_id("Case"): get_string_id("gen"), + get_string_id("Number"): get_string_id("sing"), + } + ) def test_add_morphology_with_mix_strings_and_ints(morphology): morphology.strings.add("PunctSide") morphology.strings.add("ini") - morphology.add({get_string_id("PunctSide"): get_string_id("ini"), "VerbType": "aux"}) + morphology.add( + {get_string_id("PunctSide"): get_string_id("ini"), "VerbType": "aux"} + ) def test_morphology_tags_hash_distinctly(morphology): diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 25892ac71..fe847a6ae 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,6 +1,5 @@ import pytest -from thinc.optimizers import Adam -from thinc.backends import 
NumpyOps +from thinc.api import Adam, NumpyOps from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 8dda1f406..9a4d21a8d 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -10,7 +10,7 @@ from spacy.tokens import Doc TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), - ] +] @pytest.fixture diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 5e56442b5..c6c1240a8 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -1,6 +1,5 @@ import pytest -from thinc.optimizers import Adam -from thinc.backends import NumpyOps +from thinc.api import Adam from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 234603e94..b04569e22 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -149,10 +149,5 @@ def test_entity_ruler_validate(nlp): def test_entity_ruler_properties(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - assert sorted(ruler.labels) == sorted([ - "HELLO", - "BYE", - "COMPLEX", - "TECH_ORG" - ]) + assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"]) assert sorted(ruler.ent_ids) == ["a1", "a2"] diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 6a6ec8665..366cd4f1a 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,5 +1,4 @@ import pytest -import srsly from spacy.language import Language diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py 
index d9e1d663a..2bfdbd7c3 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -270,7 +270,9 @@ def test_issue1963(en_tokenizer): def test_issue1967(label): ner = EntityRecognizer(Vocab()) example = Example(doc=None) - example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]) + example.set_token_annotation( + ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label] + ) ner.moves.get_actions(gold_parses=[example]) diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py index fca884356..120cea1d2 100644 --- a/spacy/tests/regression/test_issue3611.py +++ b/spacy/tests/regression/test_issue3611.py @@ -39,8 +39,5 @@ def test_issue3611(): for batch in batches: nlp.update( - examples=batch, - sgd=optimizer, - drop=0.1, - losses=losses, + examples=batch, sgd=optimizer, drop=0.1, losses=losses, ) diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py index 7153594db..7158d9b21 100644 --- a/spacy/tests/regression/test_issue4030.py +++ b/spacy/tests/regression/test_issue4030.py @@ -39,10 +39,7 @@ def test_issue4030(): for batch in batches: nlp.update( - examples=batch, - sgd=optimizer, - drop=0.1, - losses=losses, + examples=batch, sgd=optimizer, drop=0.1, losses=losses, ) # processing of an empty doc should result in 0.0 for all categories diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py index 786e2cedf..31b2a2d2f 100644 --- a/spacy/tests/test_architectures.py +++ b/spacy/tests/test_architectures.py @@ -1,6 +1,6 @@ import pytest from spacy import registry -from thinc.layers import Linear +from thinc.api import Linear from catalogue import RegistryError diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 049858960..306adc881 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -65,8 +65,9 @@ 
def test_cli_converters_conllu2json_subtokens(): "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", ] input_data = "\n".join(lines) - converted = conllu2json(input_data, n_sents=1, merge_subtokens=True, - append_morphology=True) + converted = conllu2json( + input_data, n_sents=1, merge_subtokens=True, append_morphology=True + ) assert len(converted) == 1 assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 @@ -81,11 +82,16 @@ def test_cli_converters_conllu2json_subtokens(): "NOUN__Definite=Ind|Gender=Masc|Number=Sing", "PROPN_X__Gender=Fem,Masc|Tense=past", "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin", - "PUNCT" + "PUNCT", ] - assert [t["pos"] for t in tokens] == ['NOUN', 'PROPN', 'VERB', 'PUNCT'] - assert [t["morph"] for t in tokens] == ['Definite=Ind|Gender=Masc|Number=Sing', 'Gender=Fem,Masc|Tense=past', 'Mood=Ind|Tense=Pres|VerbForm=Fin', ''] - assert [t["lemma"] for t in tokens] == ['dommer', 'Finn Eilertsen', 'avstå', '$.'] + assert [t["pos"] for t in tokens] == ["NOUN", "PROPN", "VERB", "PUNCT"] + assert [t["morph"] for t in tokens] == [ + "Definite=Ind|Gender=Masc|Number=Sing", + "Gender=Fem,Masc|Tense=past", + "Mood=Ind|Tense=Pres|VerbForm=Fin", + "", + ] + assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] assert [t["head"] for t in tokens] == [1, 1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index 8276d7aea..9a98e049e 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -4,7 +4,9 @@ import pytest def test_tokenizer_handles_emoticons(tokenizer): # Tweebo challenge (CMU) - text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| :> ....""" + text = ( + """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| :> ....""" + ) tokens = 
tokenizer(text) assert tokens[0].text == ":o" assert tokens[1].text == ":/" diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 3dce1ae31..c035559b4 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -130,7 +130,19 @@ def test_tokenizer_special_cases_with_affixes(tokenizer): tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}]) tokenizer.add_special_case("A/B", [{"orth": "A/B"}]) doc = tokenizer(text) - assert [token.text for token in doc] == ["(", "(", "(", "_SPECIAL_", "A/B", ",", "A/B", "-", "A/B", '"', ")"] + assert [token.text for token in doc] == [ + "(", + "(", + "(", + "_SPECIAL_", + "A/B", + ",", + "A/B", + "-", + "A/B", + '"', + ")", + ] def test_tokenizer_special_cases_with_period(tokenizer): diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index ec7e8a9e8..337c154a2 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -4,8 +4,8 @@ from libc.string cimport memcpy, memset from libc.stdlib cimport malloc, free from cymem.cymem cimport Pool -from thinc.util import get_array_module +from thinc.api import get_array_module import numpy from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 4a18acd77..65b70d1b3 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,7 +1,7 @@ import numpy import zlib import srsly -from thinc.backends import NumpyOps +from thinc.api import NumpyOps from ..compat import copy_reg from ..tokens import Doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 16ef5f966..54d92f8b1 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -11,7 +11,8 @@ import numpy import numpy.linalg import struct import srsly -from thinc.util import get_array_module, copy_array +from thinc.api import get_array_module +from thinc.util import copy_array from 
.span cimport Span from .token cimport Token diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 796a5e674..d6b50b5f4 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -4,7 +4,7 @@ from libc.math cimport sqrt import numpy import numpy.linalg -from thinc.util import get_array_module +from thinc.api import get_array_module from collections import defaultdict from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index c241cd5ad..379da6c77 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -7,7 +7,7 @@ cimport numpy as np np.import_array() import numpy -from thinc.util import get_array_module +from thinc.api import get_array_module from ..typedefs cimport hash_t from ..lexeme cimport Lexeme diff --git a/spacy/util.py b/spacy/util.py index 0cc11cef7..995ff722f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -5,13 +5,9 @@ import re from pathlib import Path import random from typing import List - import thinc import thinc.config -from thinc.backends import NumpyOps, get_current_ops -from thinc.optimizers import Adam -from thinc.util import require_gpu - +from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu import functools import itertools import numpy.random @@ -797,5 +793,13 @@ def create_default_optimizer(): eps = env_opt("optimizer_eps", 1e-8) L2 = env_opt("L2_penalty", 1e-6) grad_clip = env_opt("grad_norm_clip", 1.0) - optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops, grad_clip=grad_clip) + optimizer = Adam( + learn_rate, + L2=L2, + beta1=beta1, + beta2=beta2, + eps=eps, + ops=ops, + grad_clip=grad_clip, + ) return optimizer diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index f812acac4..0ade8b280 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -5,8 +5,7 @@ from libcpp.set cimport set as cppset import functools import numpy import srsly -from thinc.util import get_array_module 
-from thinc.backends import get_current_ops +from thinc.api import get_array_module, get_current_ops from .strings cimport StringStore diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3da9978c4..a1929559f 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -2,7 +2,7 @@ from libc.string cimport memcpy import srsly -from thinc.util import get_array_module +from thinc.api import get_array_module from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme From 09cbeaef27c910a6f235c94641efee25c904b4e0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 18 Feb 2020 17:20:17 +0100 Subject: [PATCH 050/187] Remove symlinks, data dir and related stuff --- spacy/cli/__init__.py | 12 +++- spacy/cli/download.py | 33 ++------- spacy/cli/info.py | 27 +++---- spacy/cli/link.py | 73 ------------------- spacy/cli/validate.py | 147 ++++++++++++--------------------------- spacy/compat.py | 28 -------- spacy/data/__init__.py | 0 spacy/errors.py | 9 +-- spacy/tests/test_misc.py | 47 ------------- spacy/util.py | 53 ++++---------- 10 files changed, 82 insertions(+), 347 deletions(-) delete mode 100644 spacy/cli/link.py delete mode 100644 spacy/data/__init__.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 0f7677fd2..5f83b26c1 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,13 +1,21 @@ +from wasabi import msg + from .download import download # noqa: F401 from .info import info # noqa: F401 -from .link import link # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 from .train import train # noqa: F401 -from .train_from_config import train_from_config_cli # noqa: F401 +from .train_from_config import train_from_config_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: 
F401 + + +def link(*args, **kwargs): + msg.warn( + "As of spaCy v3.0, model symlinks are deprecated. You can load models " + "using their full names or from a directory path." + ) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 7388bf615..0230e272d 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -4,8 +4,6 @@ import subprocess import sys from wasabi import msg -from .link import link -from ..util import get_package_path from .. import about @@ -15,9 +13,9 @@ def download( *pip_args: ("Additional arguments to be passed to `pip install` on model install"), ): """ - Download compatible model from default download path using pip. Model - can be shortcut, model name or, if --direct flag is set, full model name - with version. For direct downloads, the compatibility check will be skipped. + Download compatible model from default download path using pip. If --direct + flag is set, the command expects the full model name with version. + For direct downloads, the compatibility check will be skipped. """ if not require_package("spacy") and "--no-deps" not in pip_args: msg.warn( @@ -47,28 +45,6 @@ def download( "Download and installation successful", f"You can now load the model via spacy.load('{model_name}')", ) - # Only create symlink if the model is installed via a shortcut like 'en'. - # There's no real advantage over an additional symlink for en_core_web_sm - # and if anything, it's more error prone and causes more confusion. 
- if model in shortcuts: - try: - # Get package path here because link uses - # pip.get_installed_distributions() to check if model is a - # package, which fails if model was just installed via - # subprocess - package_path = get_package_path(model_name) - link(model_name, model, force=True, model_path=package_path) - except: # noqa: E722 - # Dirty, but since spacy.download and the auto-linking is - # mostly a convenience wrapper, it's best to show a success - # message and loading instructions, even if linking fails. - msg.warn( - "Download successful but linking failed", - f"Creating a shortcut link for '{model}' didn't work (maybe you " - f"don't have admin permissions?), but you can still load " - f"the model via its full package name: " - f"nlp = spacy.load('{model_name}')", - ) # If a model is downloaded and then loaded within the same process, our # is_package check currently fails, because pkg_resources.working_set # is not refreshed automatically (see #3923). We're trying to work @@ -114,8 +90,7 @@ def get_version(model, comp): model = model.rsplit(".dev", 1)[0] if model not in comp: msg.fail( - f"No compatible model found for '{model}' " - f"(spaCy v{about.__version__}).", + f"No compatible model found for '{model}' (spaCy v{about.__version__})", exits=1, ) return comp[model][0] diff --git a/spacy/cli/info.py b/spacy/cli/info.py index fc8764ca8..23f766368 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -3,25 +3,26 @@ from pathlib import Path from wasabi import msg import srsly +from .validate import get_model_pkgs from .. import util from .. import about def info( - model: ("Optional shortcut link of model", "positional", None, str) = None, + model: ("Optional model name", "positional", None, str) = None, markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False, silent: ("Don't print anything (just return)", "flag", "s") = False, ): """ - Print info about spaCy installation. 
If a model shortcut link is - speficied as an argument, print model information. Flag --markdown - prints details in Markdown for easy copy-pasting to GitHub issues. + Print info about spaCy installation. If a model is speficied as an argument, + print model information. Flag --markdown prints details in Markdown for easy + copy-pasting to GitHub issues. """ if model: if util.is_package(model): model_path = util.get_package_path(model) else: - model_path = util.get_data_path() / model + model_path = model meta_path = model_path / "meta.json" if not meta_path.is_file(): msg.fail("Can't find model meta.json", meta_path, exits=1) @@ -41,12 +42,13 @@ def info( else: msg.table(model_meta, title=title) return meta + all_models, _ = get_model_pkgs() data = { "spaCy version": about.__version__, "Location": str(Path(__file__).parent.parent), "Platform": platform.platform(), "Python version": platform.python_version(), - "Models": list_models(), + "Models": ", ".join(model["name"] for model in all_models.values()), } if not silent: title = "Info about spaCy" @@ -57,19 +59,6 @@ def info( return data -def list_models(): - def exclude_dir(dir_name): - # exclude common cache directories and hidden directories - exclude = ("cache", "pycache", "__pycache__") - return dir_name in exclude or dir_name.startswith(".") - - data_path = util.get_data_path() - if data_path: - models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()] - return ", ".join([m for m in models if not exclude_dir(m)]) - return "-" - - def print_markdown(data, title=None): """Print data in GitHub-flavoured Markdown format for issues etc. diff --git a/spacy/cli/link.py b/spacy/cli/link.py deleted file mode 100644 index d8af469dc..000000000 --- a/spacy/cli/link.py +++ /dev/null @@ -1,73 +0,0 @@ -from pathlib import Path -from wasabi import msg - -from ..compat import symlink_to -from .. 
import util - - -def link( - origin: ("package name or local path to model", "positional", None, str), - link_name: ("name of shortuct link to create", "positional", None, str), - force: ("force overwriting of existing link", "flag", "f", bool) = False, - model_path=None, -): - """ - Create a symlink for models within the spacy/data directory. Accepts - either the name of a pip package, or the local path to the model data - directory. Linking models allows loading them via spacy.load(link_name). - """ - if util.is_package(origin): - model_path = util.get_package_path(origin) - else: - model_path = Path(origin) if model_path is None else Path(model_path) - if not model_path.exists(): - msg.fail( - "Can't locate model data", - f"The data should be located in {model_path}", - exits=1, - ) - data_path = util.get_data_path() - if not data_path or not data_path.exists(): - spacy_loc = Path(__file__).parent.parent - msg.fail( - f"Can't find the spaCy data path to create model symlink", - f"Make sure a directory `/data` exists within your spaCy " - f"installation and try again. The data directory should be located " - f"here: {spacy_loc}", - exits=1, - ) - link_path = util.get_data_path() / link_name - if link_path.is_symlink() and not force: - msg.fail( - f"Link '{link_name}' already exists", - "To overwrite an existing link, use the --force flag", - exits=1, - ) - elif link_path.is_symlink(): # does a symlink exist? - # NB: It's important to check for is_symlink here and not for exists, - # because invalid/outdated symlinks would return False otherwise. - link_path.unlink() - elif link_path.exists(): # does it exist otherwise? - # NB: Check this last because valid symlinks also "exist". 
- msg.fail( - f"Can't overwrite symlink '{link_name}'", - "This can happen if your data directory contains a directory or " - "file of the same name.", - exits=1, - ) - details = f"{model_path} --> {link_path}" - try: - symlink_to(link_path, model_path) - except: # noqa: E722 - # This is quite dirty, but just making sure other errors are caught. - msg.fail( - f"Couldn't link model to '{link_name}'", - "Creating a symlink in spacy/data failed. Make sure you have the " - "required permissions and try re-running the command as admin, or " - "use a virtualenv. You can still import the model as a module and " - "call its load() method, or create the symlink manually.", - ) - msg.text(details) - raise - msg.good("Linking successful", details) - msg.text(f"You can now load the model via spacy.load('{link_name}')") diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index b4d217f2f..a23ce3453 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -1,10 +1,8 @@ from pathlib import Path import sys import requests -import srsly from wasabi import msg -from ..util import get_data_path from .. import about @@ -13,6 +11,50 @@ def validate(): Validate that the currently installed version of spaCy is compatible with the installed models. Should be run after `pip install -U spacy`. 
""" + model_pkgs, compat = get_model_pkgs() + spacy_version = about.__version__.rsplit(".dev", 1)[0] + current_compat = compat.get(spacy_version, {}) + if not current_compat: + msg.warn(f"No compatible models found for v{spacy_version} of spaCy") + incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]} + na_models = [m for m in incompat_models if m not in current_compat] + update_models = [m for m in incompat_models if m in current_compat] + spacy_dir = Path(__file__).parent.parent + + msg.divider(f"Installed models (spaCy v{about.__version__})") + msg.info(f"spaCy installation: {spacy_dir}") + + if model_pkgs: + header = ("NAME", "VERSION", "") + rows = [] + for name, data in model_pkgs.items(): + if data["compat"]: + comp = msg.text("", color="green", icon="good", no_print=True) + version = msg.text(data["version"], color="green", no_print=True) + else: + version = msg.text(data["version"], color="red", no_print=True) + comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" + rows.append((data["name"], version, comp)) + msg.table(rows, header=header) + else: + msg.text("No models found in your current environment.", exits=0) + if update_models: + msg.divider("Install updates") + msg.text("Use the following commands to update the model packages:") + cmd = "python -m spacy download {}" + print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") + if na_models: + msg.warn( + f"The following models are not available for spaCy v{about.__version__}:", + ", ".join(na_models), + ) + if incompat_models: + sys.exit(1) + + +def get_model_pkgs(): + import pkg_resources + with msg.loading("Loading compatibility table..."): r = requests.get(about.__compatibility__) if r.status_code != 200: @@ -23,88 +65,11 @@ def validate(): ) msg.good("Loaded compatibility table") compat = r.json()["spacy"] - version = about.__version__ - version = version.rsplit(".dev", 1)[0] - current_compat = compat.get(version) - if not current_compat: - msg.fail( - 
f"Can't find spaCy v{version} in compatibility table", - about.__compatibility__, - exits=1, - ) all_models = set() for spacy_v, models in dict(compat).items(): all_models.update(models.keys()) for model, model_vs in models.items(): compat[spacy_v][model] = [reformat_version(v) for v in model_vs] - model_links = get_model_links(current_compat) - model_pkgs = get_model_pkgs(current_compat, all_models) - incompat_links = {l for l, d in model_links.items() if not d["compat"]} - incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]} - incompat_models.update( - [d["name"] for _, d in model_links.items() if not d["compat"]] - ) - na_models = [m for m in incompat_models if m not in current_compat] - update_models = [m for m in incompat_models if m in current_compat] - spacy_dir = Path(__file__).parent.parent - - msg.divider(f"Installed models (spaCy v{about.__version__})") - msg.info(f"spaCy installation: {spacy_dir}") - - if model_links or model_pkgs: - header = ("TYPE", "NAME", "MODEL", "VERSION", "") - rows = [] - for name, data in model_pkgs.items(): - rows.append(get_model_row(current_compat, name, data, msg)) - for name, data in model_links.items(): - rows.append(get_model_row(current_compat, name, data, msg, "link")) - msg.table(rows, header=header) - else: - msg.text("No models found in your current environment.", exits=0) - if update_models: - msg.divider("Install updates") - msg.text("Use the following commands to update the model packages:") - cmd = "python -m spacy download {}" - print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") - if na_models: - msg.text( - f"The following models are not available for spaCy " - f"v{about.__version__}: {', '.join(na_models)}" - ) - if incompat_links: - msg.text( - f"You may also want to overwrite the incompatible links using the " - f"`python -m spacy link` command with `--force`, or remove them " - f"from the data directory. 
" - f"Data path: {get_data_path()}" - ) - if incompat_models or incompat_links: - sys.exit(1) - - -def get_model_links(compat): - links = {} - data_path = get_data_path() - if data_path: - models = [p for p in data_path.iterdir() if is_model_path(p)] - for model in models: - meta_path = Path(model) / "meta.json" - if not meta_path.exists(): - continue - meta = srsly.read_json(meta_path) - link = model.parts[-1] - name = meta["lang"] + "_" + meta["name"] - links[link] = { - "name": name, - "version": meta["version"], - "compat": is_compat(compat, name, meta["version"]), - } - return links - - -def get_model_pkgs(compat, all_models): - import pkg_resources - pkgs = {} for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): package = pkg_name.replace("-", "_") @@ -113,29 +78,9 @@ def get_model_pkgs(compat, all_models): pkgs[pkg_name] = { "name": package, "version": version, - "compat": is_compat(compat, package, version), + "compat": package in compat and version in compat[package], } - return pkgs - - -def get_model_row(compat, name, data, msg, model_type="package"): - if data["compat"]: - comp = msg.text("", color="green", icon="good", no_print=True) - version = msg.text(data["version"], color="green", no_print=True) - else: - version = msg.text(data["version"], color="red", no_print=True) - comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" - return (model_type, name, data["name"], version, comp) - - -def is_model_path(model_path): - exclude = ["cache", "pycache", "__pycache__"] - name = model_path.parts[-1] - return model_path.is_dir() and name not in exclude and not name.startswith(".") - - -def is_compat(compat, name, version): - return name in compat and version in compat[name] + return pkgs, compat def reformat_version(version): diff --git a/spacy/compat.py b/spacy/compat.py index 6fa49353e..be6cdb8a1 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -5,7 +5,6 @@ e.g. `unicode_`. 
DOCS: https://spacy.io/api/top-level#compat """ -import os import sys from thinc.util import copy_array @@ -43,33 +42,6 @@ is_linux = sys.platform.startswith("linux") is_osx = sys.platform == "darwin" -def symlink_to(orig, dest): - """Create a symlink. Used for model shortcut links. - - orig (unicode / Path): The origin path. - dest (unicode / Path): The destination path of the symlink. - """ - if is_windows: - import subprocess - - subprocess.check_call(["mklink", "/d", str(orig), str(dest)], shell=True) - else: - orig.symlink_to(dest) - - -def symlink_remove(link): - """Remove a symlink. Used for model shortcut links. - - link (unicode / Path): The path to the symlink. - """ - # https://stackoverflow.com/q/26554135/6400719 - if os.path.isdir(str(link)) and is_windows: - # this should only be on Py2.7 and windows - os.rmdir(str(link)) - else: - os.unlink(str(link)) - - def is_config(windows=None, linux=None, osx=None, **kwargs): """Check if a specific configuration of Python version and operating system matches the user's setup. Mostly used to display targeted error messages. diff --git a/spacy/data/__init__.py b/spacy/data/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/errors.py b/spacy/errors.py index e00df2c51..6947dbbd5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -224,13 +224,8 @@ class Errors(object): E047 = ("Can't assign a value to unregistered extension attribute " "'{name}'. Did you forget to call the `set_extension` method?") E048 = ("Can't import language {lang} from spacy.lang: {err}") - E049 = ("Can't find spaCy data directory: '{path}'. Check your " - "installation and permissions, or use spacy.util.set_data_path " - "to customise the location if necessary.") - E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut " - "link, a Python package or a valid path to a data directory.") - E051 = ("Cant' load '{name}'. 
If you're using a shortcut link, make sure " - "it points to a valid package (not just a data directory).") + E050 = ("Can't find model '{name}'. It doesn't seem to be a Python " + "package or a valid path to a data directory.") E052 = ("Can't find model directory: {path}") E053 = ("Could not read meta.json from {path}") E054 = ("No valid '{setting}' setting found in model meta.json.") diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index a6bcdb50c..6d4e75a31 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -4,36 +4,8 @@ import ctypes from pathlib import Path from spacy import util from spacy import prefer_gpu, require_gpu -from spacy.compat import symlink_to, symlink_remove, is_windows from spacy.ml._layers import PrecomputableAffine from spacy.ml._layers import _backprop_precomputable_affine_padding -from subprocess import CalledProcessError - - -@pytest.fixture -def symlink_target(): - return Path("./foo-target") - - -@pytest.fixture -def symlink(): - return Path("./foo-symlink") - - -@pytest.fixture(scope="function") -def symlink_setup_target(request, symlink_target, symlink): - if not symlink_target.exists(): - os.mkdir(str(symlink_target)) - # yield -- need to cleanup even if assertion fails - # https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240 - - def cleanup(): - # Remove symlink only if it was created - if symlink.exists(): - symlink_remove(symlink) - os.rmdir(str(symlink_target)) - - request.addfinalizer(cleanup) @pytest.fixture @@ -109,25 +81,6 @@ def test_require_gpu(): require_gpu() -def test_create_symlink_windows( - symlink_setup_target, symlink_target, symlink, is_admin -): - """Test the creation of symlinks on windows. 
If run as admin or not on windows it should succeed, otherwise a CalledProcessError should be raised.""" - assert symlink_target.exists() - - if is_admin or not is_windows: - try: - symlink_to(symlink, symlink_target) - assert symlink.exists() - except CalledProcessError as e: - pytest.fail(e) - else: - with pytest.raises(CalledProcessError): - symlink_to(symlink, symlink_target) - - assert not symlink.exists() - - def test_ascii_filenames(): """Test that all filenames in the project are ASCII. See: https://twitter.com/_inesmontani/status/1177941471632211968 diff --git a/spacy/util.py b/spacy/util.py index 0cc11cef7..6067333f7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -29,7 +29,6 @@ from .symbols import ORTH from .compat import cupy, CudaStream from .errors import Errors, Warnings, deprecation_warning, user_warning -_data_path = Path(__file__).parent / "data" _PRINT_ENV = False @@ -84,27 +83,6 @@ def set_lang_class(name, cls): registry.languages.register(name, func=cls) -def get_data_path(require_exists=True): - """Get path to spaCy data directory. - - require_exists (bool): Only return path if it exists, otherwise None. - RETURNS (Path or None): Data path or None. - """ - if not require_exists: - return _data_path - else: - return _data_path if _data_path.exists() else None - - -def set_data_path(path): - """Set path to spaCy data directory. - - path (unicode or Path): Path to new data directory. - """ - global _data_path - _data_path = ensure_path(path) - - def make_layer(arch_config): arch_func = registry.architectures.get(arch_config["arch"]) return arch_func(arch_config["config"]) @@ -145,18 +123,13 @@ def get_module_path(module): def load_model(name, **overrides): - """Load a model from a shortcut link, package or data path. + """Load a model from a package or data path. - name (unicode): Package name, shortcut link or model path. + name (unicode): Package name or model path. **overrides: Specific overrides, like pipeline components to disable. 
RETURNS (Language): `Language` class with the loaded model. """ - data_path = get_data_path() - if not data_path or not data_path.exists(): - raise IOError(Errors.E049.format(path=data_path)) - if isinstance(name, str): # in data dir / shortcut - if name in set([d.name for d in data_path.iterdir()]): - return load_model_from_link(name, **overrides) + if isinstance(name, str): # name or string path if is_package(name): # installed as package return load_model_from_package(name, **overrides) if Path(name).exists(): # path to model data directory @@ -166,16 +139,6 @@ def load_model(name, **overrides): raise IOError(Errors.E050.format(name=name)) -def load_model_from_link(name, **overrides): - """Load a model from a shortcut link, or directory in spaCy data path.""" - path = get_data_path() / name / "__init__.py" - try: - cls = import_file(name, path) - except AttributeError: - raise IOError(Errors.E051.format(name=name)) - return cls.load(**overrides) - - def load_model_from_package(name, **overrides): """Load a model from an installed package.""" cls = importlib.import_module(name) @@ -797,5 +760,13 @@ def create_default_optimizer(): eps = env_opt("optimizer_eps", 1e-8) L2 = env_opt("L2_penalty", 1e-6) grad_clip = env_opt("grad_norm_clip", 1.0) - optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops, grad_clip=grad_clip) + optimizer = Adam( + learn_rate, + L2=L2, + beta1=beta1, + beta2=beta2, + eps=eps, + ops=ops, + grad_clip=grad_clip, + ) return optimizer From b20351792acba1bcd28998bed80171f5b6caa59f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Feb 2020 15:51:53 +0200 Subject: [PATCH 051/187] assert prints for more clarity --- spacy/tests/test_requirements.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py index 320fc5763..aaa562722 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -32,8 +32,9 @@ def 
test_build_dependencies(en_vocab): lib, v = _parse_req(line) if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup: req_v = req_dict.get(lib, None) - assert req_v is not None # if fail: setup.cfg contains a lib not in requirements.txt - assert (lib+v) == (lib+req_v) # if fail: setup.cfg & requirements.txt have conflicting versions + assert req_v is not None, "{} in setup.cfg but not in requirements.txt".format(lib) + assert (lib+v) == (lib+req_v), "{} has different version in setup.cfg and in requirements.txt: " \ + "{} and {} respectively".format(lib, v, req_v) setup_keys.add(lib) assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg @@ -48,7 +49,8 @@ def test_build_dependencies(en_vocab): lib, v = _parse_req(line) if lib: req_v = req_dict.get(lib, None) - assert (lib+v) == (lib+req_v) # if fail: pyproject.toml & requirements.txt have conflicting versions + assert (lib+v) == (lib+req_v), "{} has different version in pyproject.toml and in requirements.txt: " \ + "{} and {} respectively".format(lib, v, req_v) def _parse_req(line): From 303c4bcd4ca50569f7987c980ff2e4eb7e9c8a63 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Feb 2020 15:52:55 +0200 Subject: [PATCH 052/187] include requirements in manifest --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 1947b9140..64886cd19 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,5 +4,6 @@ include LICENSE include README.md include bin/spacy include pyproject.toml +include requirements.txt recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz From 5c2f6454706b4522cc58efc8ffca132caeba27f9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Feb 2020 16:15:56 +0200 Subject: [PATCH 053/187] root dir one level up --- spacy/tests/test_requirements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py 
index aaa562722..5bbccf362 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -8,7 +8,7 @@ def test_build_dependencies(en_vocab): libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] # check requirements.txt - root_dir = Path(__file__).parent.parent.parent + root_dir = Path(__file__).parent.parent req_file = root_dir / "requirements.txt" req_dict = {} with req_file.open() as f: From 9834527f2c373708252c37998b7573291fc9da63 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Feb 2020 16:22:48 +0200 Subject: [PATCH 054/187] hack to switch between CLI folder setup and local setup --- spacy/tests/test_requirements.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py index 5bbccf362..a6fa20d6b 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -8,11 +8,21 @@ def test_build_dependencies(en_vocab): libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] # check requirements.txt - root_dir = Path(__file__).parent.parent - req_file = root_dir / "requirements.txt" - req_dict = {} - with req_file.open() as f: - lines = f.readlines() + try: + # for CLI usage + root_dir = Path(__file__).parent.parent + req_file = root_dir / "requirements.txt" + req_dict = {} + with req_file.open() as f: + lines = f.readlines() + except FileNotFoundError as e: + # for local usage + root_dir = Path(__file__).parent.parent.parent + req_file = root_dir / "requirements.txt" + req_dict = {} + with req_file.open() as f: + lines = f.readlines() + for line in lines: line = line.strip() if not line.startswith("#"): From 9f1447bf7160dfdc354d8eb386ee169a330dbbca Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Feb 2020 17:09:29 +0200 Subject: [PATCH 055/187] where areth thou, file ? 
--- spacy/tests/test_requirements.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py index a6fa20d6b..23ba792df 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -8,20 +8,26 @@ def test_build_dependencies(en_vocab): libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] # check requirements.txt + req_dict = {} try: # for CLI usage root_dir = Path(__file__).parent.parent req_file = root_dir / "requirements.txt" - req_dict = {} with req_file.open() as f: lines = f.readlines() except FileNotFoundError as e: - # for local usage - root_dir = Path(__file__).parent.parent.parent - req_file = root_dir / "requirements.txt" - req_dict = {} - with req_file.open() as f: - lines = f.readlines() + try: + # for local usage + root_dir = Path(__file__).parent.parent.parent + req_file = root_dir / "requirements.txt" + with req_file.open() as f: + lines = f.readlines() + except FileNotFoundError as e: + # where areth thou ? 
+ root_dir = Path(__file__).parent.parent.parent.parent + req_file = root_dir / "requirements.txt" + with req_file.open() as f: + lines = f.readlines() for line in lines: line = line.strip() From 783da088eac9429852b48af38e32c4e219a95d57 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 23 Feb 2020 16:21:21 +0100 Subject: [PATCH 056/187] avoid try except --- spacy/tests/test_requirements.py | 40 ++++++++++++-------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py index 23ba792df..644e6f8f9 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -9,32 +9,22 @@ def test_build_dependencies(en_vocab): # check requirements.txt req_dict = {} - try: - # for CLI usage - root_dir = Path(__file__).parent.parent - req_file = root_dir / "requirements.txt" - with req_file.open() as f: - lines = f.readlines() - except FileNotFoundError as e: - try: - # for local usage - root_dir = Path(__file__).parent.parent.parent - req_file = root_dir / "requirements.txt" - with req_file.open() as f: - lines = f.readlines() - except FileNotFoundError as e: - # where areth thou ? - root_dir = Path(__file__).parent.parent.parent.parent - req_file = root_dir / "requirements.txt" - with req_file.open() as f: - lines = f.readlines() - for line in lines: - line = line.strip() - if not line.startswith("#"): - lib, v = _parse_req(line) - if lib and lib not in libs_ignore_requirements: - req_dict[lib] = v + root_dir = None + # when running tests locally, the file is 3 levels up. On the CI, it's 2 levels up. 
+ roots = [Path(__file__).parent.parent, Path(__file__).parent.parent.parent] # or whatever + for r in roots: + req_file = root_dir / "requirements.txt" + if req_file.exists(): + root_dir = r + with req_file.open() as f: + lines = f.readlines() + for line in lines: + line = line.strip() + if not line.startswith("#"): + lib, v = _parse_req(line) + if lib and lib not in libs_ignore_requirements: + req_dict[lib] = v # check setup.cfg and compare to requirements.txt # also fails when there are missing or additional libs From 0f55e5170414d90b048609eb44fbf8f27d085074 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 23 Feb 2020 16:33:58 +0100 Subject: [PATCH 057/187] assert we found the root_dir --- spacy/tests/test_requirements.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py index 644e6f8f9..21636766d 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -26,6 +26,8 @@ def test_build_dependencies(en_vocab): if lib and lib not in libs_ignore_requirements: req_dict[lib] = v + assert root_dir is not None, "Could not find the root directory of requirements.txt" + # check setup.cfg and compare to requirements.txt # also fails when there are missing or additional libs setup_file = root_dir / "setup.cfg" From 58568bd0cd96b2f72f8b4ea81cfcc269ad93d1f5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 23 Feb 2020 16:45:37 +0100 Subject: [PATCH 058/187] fix --- spacy/tests/test_requirements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py index 21636766d..7922f1f18 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -14,7 +14,7 @@ def test_build_dependencies(en_vocab): # when running tests locally, the file is 3 levels up. On the CI, it's 2 levels up. 
roots = [Path(__file__).parent.parent, Path(__file__).parent.parent.parent] # or whatever for r in roots: - req_file = root_dir / "requirements.txt" + req_file = r / "requirements.txt" if req_file.exists(): root_dir = r with req_file.open() as f: From d821c95eb05f3ad0b82601487093559d1d686a2c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 23 Feb 2020 17:38:33 +0100 Subject: [PATCH 059/187] debugging prints --- spacy/tests/test_requirements.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py index 7922f1f18..fc5aeeddd 100644 --- a/spacy/tests/test_requirements.py +++ b/spacy/tests/test_requirements.py @@ -13,7 +13,9 @@ def test_build_dependencies(en_vocab): root_dir = None # when running tests locally, the file is 3 levels up. On the CI, it's 2 levels up. roots = [Path(__file__).parent.parent, Path(__file__).parent.parent.parent] # or whatever + print() for r in roots: + print("inspecting dir", r, "-->", [f.name for f in r.glob(pattern="*.*")]) req_file = r / "requirements.txt" if req_file.exists(): root_dir = r @@ -33,6 +35,18 @@ def test_build_dependencies(en_vocab): setup_file = root_dir / "setup.cfg" with setup_file.open() as f: lines = f.readlines() + + # import configparser + # config = configparser.ConfigParser() + # config.read(setup_file) + # print("SECTIONS", config.sections()) + # print("options", config['options']) + # for key in config['options']: + # print("key", key) + # print("setup_requires *", config['options']['setup_requires'], "*") + # lines = config['options']['setup_requires'] + # lines += config['options']['install_requires'] + setup_keys = set() for line in lines: line = line.strip() From 6f846c2cbf1a0a2b4ceaffb83dd8e3d43a22e03e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 24 Feb 2020 09:19:08 +0100 Subject: [PATCH 060/187] removing --pyargs for testing purposes --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/azure-pipelines.yml b/azure-pipelines.yml index 054365336..316ac0c68 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -99,5 +99,5 @@ jobs: pip install dist/$SDIST displayName: 'Install from sdist' - - script: python -m pytest --pyargs spacy + - script: python -m pytest spacy displayName: 'Run tests' From 217c16c7a9f6c08c078d56fb34bc6497e8c38131 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 24 Feb 2020 09:38:43 +0100 Subject: [PATCH 061/187] running tests BEFORE deleting them ? --- azure-pipelines.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 316ac0c68..2ebc381cd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -84,6 +84,9 @@ jobs: pip install -r requirements.txt displayName: 'Install dependencies' + - script: python -m pytest spacy + displayName: 'Run tests' + - script: | python setup.py build_ext --inplace python setup.py sdist --formats=gztar @@ -99,5 +102,3 @@ jobs: pip install dist/$SDIST displayName: 'Install from sdist' - - script: python -m pytest spacy - displayName: 'Run tests' From d5bfebe1c5d4772004965a450b57a3ca3119bcd2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 24 Feb 2020 10:04:24 +0100 Subject: [PATCH 062/187] it's moving day --- azure-pipelines.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2ebc381cd..779037c96 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -84,21 +84,20 @@ jobs: pip install -r requirements.txt displayName: 'Install dependencies' - - script: python -m pytest spacy - displayName: 'Run tests' - - script: | python setup.py build_ext --inplace python setup.py sdist --formats=gztar displayName: 'Compile and build sdist' - - task: DeleteFiles@1 - inputs: - contents: 'spacy' - displayName: 'Delete source directory' - - bash: | SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) pip install dist/$SDIST 
displayName: 'Install from sdist' + - script: python -m pytest spacy + displayName: 'Run tests' + + - task: DeleteFiles@1 + inputs: + contents: 'spacy' + displayName: 'Delete source directory' From c1a5ece65f18b4955c8e7e72bdf815c78290d6f4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Feb 2020 15:46:39 +0100 Subject: [PATCH 063/187] Tidy up setup and update requirements tests --- .gitignore | 5 ++ pyproject.toml | 2 +- requirements.txt | 2 +- setup.py | 26 ++++++-- spacy/tests/package/test_requirements.py | 76 ++++++++++++++++++++++ spacy/tests/test_requirements.py | 83 ------------------------ 6 files changed, 102 insertions(+), 92 deletions(-) create mode 100644 spacy/tests/package/test_requirements.py delete mode 100644 spacy/tests/test_requirements.py diff --git a/.gitignore b/.gitignore index a0af6d4d2..f39607b76 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ corpora/ keys/ *.json.gz +# Tests +spacy/tests/package/setup.cfg +spacy/tests/package/pyproject.toml +spacy/tests/package/requirements.txt + # Website website/.cache/ website/public/ diff --git a/pyproject.toml b/pyproject.toml index 8a6ababf3..8d3652a2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,6 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==7.4.0.dev0", + "thinc==8.0.0a0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index bb6bf9804..f3a7cc162 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 # Optional dependencies jsonschema>=2.6.0,<3.1.0 -pydantic>=1.0.0,<2.0.0 +pydantic>=1.3.0,<2.0.0 # Development dependencies cython>=0.25 pytest>=4.6.5 diff --git a/setup.py b/setup.py index 31f22ba3f..d850a74ac 100755 --- a/setup.py +++ b/setup.py @@ -7,15 +7,19 @@ from distutils import ccompiler, msvccompiler from setuptools import Extension, setup, find_packages import numpy from pathlib import Path +import shutil 
from Cython.Build import cythonize from Cython.Compiler import Options +ROOT = Path(__file__).parent +PACKAGE_ROOT = ROOT / "spacy" + + # Preserve `__doc__` on functions and classes # http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options Options.docstrings = True - PACKAGES = find_packages() MOD_NAMES = [ "spacy.parts_of_speech", @@ -60,6 +64,12 @@ COMPILER_DIRECTIVES = { "embedsignature": True, "annotation_typing": False, } +# Files to copy into the package that are otherwise not included +COPY_FILES = { + ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package", + ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", + ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", +} def is_new_osx(): @@ -115,25 +125,27 @@ def clean(path): def setup_package(): - root = Path(__file__).parent - if len(sys.argv) > 1 and sys.argv[1] == "clean": - return clean(root / "spacy") + return clean(PACKAGE_ROOT) - with (root / "spacy" / "about.py").open("r") as f: + with (PACKAGE_ROOT / "about.py").open("r") as f: about = {} exec(f.read(), about) + for copy_file, target_dir in COPY_FILES.items(): + shutil.copy(str(copy_file), str(target_dir)) + print(f"Copied {copy_file} -> {target_dir}") + include_dirs = [ get_python_inc(plat_specific=True), numpy.get_include(), - str(root / "include"), + str(ROOT / "include"), ] if ( ccompiler.new_compiler().compiler_type == "msvc" and msvccompiler.get_build_version() == 9 ): - include_dirs.append(str(root / "include" / "msvc9")) + include_dirs.append(str(ROOT / "include" / "msvc9")) ext_modules = [] for name in MOD_NAMES: mod_path = name.replace(".", "/") + ".pyx" diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py new file mode 100644 index 000000000..59a8569ee --- /dev/null +++ b/spacy/tests/package/test_requirements.py @@ -0,0 +1,76 @@ +import re +from pathlib import Path + + +def test_build_dependencies(): + # Check that library requirements 
are pinned exactly the same across different setup files. + libs_ignore_requirements = [ + "pytest", + "pytest-timeout", + "mock", + "flake8", + "jsonschema", + ] + libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] + + # check requirements.txt + req_dict = {} + + root_dir = Path(__file__).parent + req_file = root_dir / "requirements.txt" + with req_file.open() as f: + lines = f.readlines() + for line in lines: + line = line.strip() + if not line.startswith("#"): + lib, v = _parse_req(line) + if lib and lib not in libs_ignore_requirements: + req_dict[lib] = v + # check setup.cfg and compare to requirements.txt + # also fails when there are missing or additional libs + setup_file = root_dir / "setup.cfg" + with setup_file.open() as f: + lines = f.readlines() + + setup_keys = set() + for line in lines: + line = line.strip() + if not line.startswith("#"): + lib, v = _parse_req(line) + if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup: + req_v = req_dict.get(lib, None) + assert ( + req_v is not None + ), "{} in setup.cfg but not in requirements.txt".format(lib) + assert (lib + v) == (lib + req_v), ( + "{} has different version in setup.cfg and in requirements.txt: " + "{} and {} respectively".format(lib, v, req_v) + ) + setup_keys.add(lib) + assert sorted(setup_keys) == sorted( + req_dict.keys() + ) # if fail: requirements.txt contains a lib not in setup.cfg + + # check pyproject.toml and compare the versions of the libs to requirements.txt + # does not fail when there are missing or additional libs + toml_file = root_dir / "pyproject.toml" + with toml_file.open() as f: + lines = f.readlines() + for line in lines: + line = line.strip().strip(",").strip('"') + if not line.startswith("#"): + lib, v = _parse_req(line) + if lib: + req_v = req_dict.get(lib, None) + assert (lib + v) == (lib + req_v), ( + "{} has different version in pyproject.toml and in requirements.txt: " + "{} and {} respectively".format(lib, v, req_v) + ) + + +def 
_parse_req(line): + lib = re.match(r"^[a-z0-9\-]*", line).group(0) + v = line.replace(lib, "").strip() + if not re.match(r"^[<>=][<>=].*", v): + return None, None + return lib, v diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py deleted file mode 100644 index fc5aeeddd..000000000 --- a/spacy/tests/test_requirements.py +++ /dev/null @@ -1,83 +0,0 @@ -import re -from pathlib import Path - - -def test_build_dependencies(en_vocab): - # Check that library requirements are pinned exactly the same across different setup files. - libs_ignore_requirements = ["pytest", "pytest-timeout", "mock", "flake8", "jsonschema"] - libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] - - # check requirements.txt - req_dict = {} - - root_dir = None - # when running tests locally, the file is 3 levels up. On the CI, it's 2 levels up. - roots = [Path(__file__).parent.parent, Path(__file__).parent.parent.parent] # or whatever - print() - for r in roots: - print("inspecting dir", r, "-->", [f.name for f in r.glob(pattern="*.*")]) - req_file = r / "requirements.txt" - if req_file.exists(): - root_dir = r - with req_file.open() as f: - lines = f.readlines() - for line in lines: - line = line.strip() - if not line.startswith("#"): - lib, v = _parse_req(line) - if lib and lib not in libs_ignore_requirements: - req_dict[lib] = v - - assert root_dir is not None, "Could not find the root directory of requirements.txt" - - # check setup.cfg and compare to requirements.txt - # also fails when there are missing or additional libs - setup_file = root_dir / "setup.cfg" - with setup_file.open() as f: - lines = f.readlines() - - # import configparser - # config = configparser.ConfigParser() - # config.read(setup_file) - # print("SECTIONS", config.sections()) - # print("options", config['options']) - # for key in config['options']: - # print("key", key) - # print("setup_requires *", config['options']['setup_requires'], "*") - # lines = config['options']['setup_requires'] 
- # lines += config['options']['install_requires'] - - setup_keys = set() - for line in lines: - line = line.strip() - if not line.startswith("#"): - lib, v = _parse_req(line) - if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup: - req_v = req_dict.get(lib, None) - assert req_v is not None, "{} in setup.cfg but not in requirements.txt".format(lib) - assert (lib+v) == (lib+req_v), "{} has different version in setup.cfg and in requirements.txt: " \ - "{} and {} respectively".format(lib, v, req_v) - setup_keys.add(lib) - assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg - - # check pyproject.toml and compare the versions of the libs to requirements.txt - # does not fail when there are missing or additional libs - toml_file = root_dir / "pyproject.toml" - with toml_file.open() as f: - lines = f.readlines() - for line in lines: - line = line.strip().strip(",").strip("\"") - if not line.startswith("#"): - lib, v = _parse_req(line) - if lib: - req_v = req_dict.get(lib, None) - assert (lib+v) == (lib+req_v), "{} has different version in pyproject.toml and in requirements.txt: " \ - "{} and {} respectively".format(lib, v, req_v) - - -def _parse_req(line): - lib = re.match(r"^[a-z0-9\-]*", line).group(0) - v = line.replace(lib, "").strip() - if not re.match(r"^[<>=][<>=].*", v): - return None, None - return lib, v From 436b26fe0fb28118d41351aa4f94bb4bb9932cd0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Feb 2020 15:48:29 +0100 Subject: [PATCH 064/187] Revert other changes --- MANIFEST.in | 1 - azure-pipelines.yml | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 64886cd19..1947b9140 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,6 +4,5 @@ include LICENSE include README.md include bin/spacy include pyproject.toml -include requirements.txt recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz diff 
--git a/azure-pipelines.yml b/azure-pipelines.yml index 5a5e8f03a..d34da39f7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -83,15 +83,15 @@ jobs: python setup.py sdist --formats=gztar displayName: 'Compile and build sdist' + - task: DeleteFiles@1 + inputs: + contents: 'spacy' + displayName: 'Delete source directory' + - bash: | SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) pip install dist/$SDIST displayName: 'Install from sdist' - - script: python -m pytest spacy + - script: python -m pytest --pyargs spacy displayName: 'Run tests' - - - task: DeleteFiles@1 - inputs: - contents: 'spacy' - displayName: 'Delete source directory' From 912572e04a6fe15c515c005847f054c989d5e6f1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Feb 2020 16:01:58 +0100 Subject: [PATCH 065/187] Only copy if file exists (not if installed from sdist etc.) --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d850a74ac..d9021836f 100755 --- a/setup.py +++ b/setup.py @@ -133,8 +133,9 @@ def setup_package(): exec(f.read(), about) for copy_file, target_dir in COPY_FILES.items(): - shutil.copy(str(copy_file), str(target_dir)) - print(f"Copied {copy_file} -> {target_dir}") + if copy_file.exists(): + shutil.copy(str(copy_file), str(target_dir)) + print(f"Copied {copy_file} -> {target_dir}") include_dirs = [ get_python_inc(plat_specific=True), From b6a6cff70857b9edd0a1d2fa6a2fe62deb7a4290 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Feb 2020 16:17:23 +0100 Subject: [PATCH 066/187] Add blis to pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 8d3652a2f..71e523c7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,5 +7,6 @@ requires = [ "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", "thinc==8.0.0a0", + "blis>=0.4.0,<0.5.0" ] build-backend = "setuptools.build_meta" From 
fc6e34c3a13b93caaed7b2c0cf60dcc0df59c0f4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Feb 2020 08:44:22 +0100 Subject: [PATCH 067/187] fix bugs from porting master to develop --- .../wikidata_train_entity_linker.py | 4 +--- spacy/cli/train.py | 22 +++---------------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py index 386af7d4d..af0e68768 100644 --- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py +++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py @@ -175,12 +175,10 @@ def main( kb=kb, labels_discard=labels_discard, ) - docs, golds = zip(*train_batch) try: with nlp.disable_pipes(*other_pipes): nlp.update( - docs=docs, - golds=golds, + examples=train_batch, sgd=optimizer, drop=dropout, losses=losses, diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d8514095b..92f94b53d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -28,13 +28,6 @@ def train( pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner", vectors: ("Model to load vectors from", "option", "v", str) = None, replace_components: ("Replace components from base model", "flag", "R", bool) = False, - width: ("Width of CNN layers of Tok2Vec component", "option", "cw", int) = 96, - conv_depth: ("Depth of CNN layers of Tok2Vec component", "option", "cd", int) = 4, - cnn_window: ("Window size for CNN layers of Tok2Vec component", "option", "cW", int) = 1, - cnn_pieces: ("Maxout size for CNN layers of Tok2Vec component. 
1 for Mish", "option", "cP", int) = 3, - use_chars: ("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool) = False, - bilstm_depth: ("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int) = 0, - embed_rows: ("Number of embedding rows of Tok2Vec component", "option", "er", int) = 2000, n_iter: ("Number of iterations", "option", "n", int) = 30, n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None, n_examples: ("Number of examples", "option", "ns", int) = 0, @@ -232,14 +225,7 @@ def train( else: # Start with a blank model, call begin_training cfg = {"device": use_gpu} - cfg["conv_depth"] = conv_depth - cfg["token_vector_width"] = width - cfg["bilstm_depth"] = bilstm_depth - cfg["cnn_maxout_pieces"] = cnn_pieces - cfg["embed_size"] = embed_rows - cfg["conv_window"] = cnn_window - cfg["subword_features"] = not use_chars - optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg) + optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg) nlp._optimizer = None # Load in pretrained weights @@ -362,11 +348,9 @@ def train( for batch in util.minibatch_by_words(train_data, size=batch_sizes): if not batch: continue - docs, golds = zip(*batch) try: nlp.update( - docs, - golds, + batch, sgd=optimizer, drop=next(dropout_rates), losses=losses, @@ -609,7 +593,7 @@ def _get_metrics(component): elif component == "tagger": return ("tags_acc",) elif component == "ner": - return ("ents_f", "ents_p", "ents_r", "enty_per_type") + return ("ents_f", "ents_p", "ents_r", "ents_per_type") elif component == "sentrec": return ("sent_f", "sent_p", "sent_r") elif component == "textcat": From 06f0a8daa0b919edbafa966db42fc74dce5cab02 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 27 Feb 2020 18:42:27 +0100 Subject: [PATCH 068/187] Default settings to configurations (#4995) * fix grad_clip naming * cleaning up pretrained_vectors out of 
cfg * further refactoring Model init's * move Model building out of pipes * further refactor to require a model config when creating a pipe * small fixes * making cfg in nn_parser more consistent * fixing nr_class for parser * fixing nn_parser's nO * fix printing of loss * architectures in own file per type, consistent naming * convenience methods default_tagger_config and default_tok2vec_config * let create_pipe access default config if available for that component * default_parser_config * move defaults to separate folder * allow reading nlp from package or dir with argument 'name' * architecture spacy.VocabVectors.v1 to read static vectors from file * cleanup * default configs for nel, textcat, morphologizer, tensorizer * fix imports * fixing unit tests * fixes and clean up * fixing defaults, nO, fix unit tests * restore parser IO * fix IO * 'fix' serialization test * add *.cfg to manifest * fix example configs with additional arguments * replace Morpohologizer with Tagger * add IO bit when testing overfitting of tagger (currently failing) * fix IO - don't initialize when reading from disk * expand overfitting tests to also check IO goes OK * remove dropout from HashEmbed to fix Tagger performance * add defaults for sentrec * update thinc * always pass a Model instance to a Pipe * fix piped_added statement * remove obsolete W029 * remove obsolete errors * restore byte checking tests (work again) * clean up test * further test cleanup * convert from config to Model in create_pipe * bring back error when component is not initialized * cleanup * remove calls for nlp2.begin_training * use thinc.api in imports * allow setting charembed's nM and nC * fix for hardcoded nM/nC + unit test * formatting fixes * trigger build --- MANIFEST.in | 2 +- bin/ud/ud_train.py | 4 +- bin/wiki_entity_linking/train_descriptions.py | 6 +- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 12 +- .../ptb-joint-pos-dep/defaults.cfg | 11 +- examples/training/pretrain_textcat.py | 17 +- 
examples/training/train_textcat.py | 2 +- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 +- spacy/cli/pretrain.py | 17 +- spacy/cli/train.py | 88 ++-- spacy/cli/train_from_config.py | 139 ++----- spacy/errors.py | 15 +- spacy/language.py | 75 +++- spacy/ml/component_models.py | 227 ---------- spacy/ml/models/__init__.py | 6 + spacy/ml/models/defaults/__init__.py | 93 +++++ .../defaults/entity_linker_defaults.cfg | 12 + .../defaults/morphologizer_defaults.cfg | 14 + spacy/ml/models/defaults/ner_defaults.cfg | 15 + spacy/ml/models/defaults/parser_defaults.cfg | 15 + spacy/ml/models/defaults/sentrec_defaults.cfg | 14 + spacy/ml/models/defaults/tagger_defaults.cfg | 12 + .../models/defaults/tensorizer_defaults.cfg | 4 + spacy/ml/models/defaults/textcat_defaults.cfg | 13 + spacy/ml/models/defaults/tok2vec_defaults.cfg | 9 + spacy/ml/models/entity_linker.py | 23 ++ spacy/ml/models/multi_task.py | 29 ++ spacy/ml/models/parser.py | 33 ++ spacy/ml/models/tagger.py | 16 + spacy/ml/models/tensorizer.py | 10 + spacy/ml/models/textcat.py | 42 ++ spacy/ml/models/tok2vec.py | 390 ++++++++++++++++++ spacy/ml/tok2vec.py | 178 -------- spacy/pipeline/entityruler.py | 2 +- spacy/pipeline/hooks.py | 2 - spacy/pipeline/morphologizer.pyx | 26 +- spacy/pipeline/pipes.pyx | 356 +++------------- spacy/pipeline/tok2vec.py | 21 +- spacy/syntax/_parser_model.pyx | 65 ++- spacy/syntax/nn_parser.pyx | 221 +++------- spacy/tests/doc/test_add_entities.py | 5 +- spacy/tests/parser/test_add_label.py | 18 +- spacy/tests/parser/test_arc_eager_oracle.py | 4 +- spacy/tests/parser/test_ner.py | 57 +-- spacy/tests/parser/test_neural_parser.py | 20 +- spacy/tests/parser/test_nn_beam.py | 3 +- spacy/tests/parser/test_parse.py | 15 +- spacy/tests/parser/test_preset_sbd.py | 4 +- spacy/tests/pipeline/test_analysis.py | 3 +- spacy/tests/pipeline/test_tagger.py | 19 +- spacy/tests/pipeline/test_textcat.py | 22 +- spacy/tests/regression/test_issue1501-2000.py | 8 +- 
spacy/tests/regression/test_issue2001-2500.py | 3 + spacy/tests/regression/test_issue3001-3500.py | 4 +- spacy/tests/regression/test_issue3830.py | 6 +- spacy/tests/regression/test_issue4042.py | 3 +- spacy/tests/regression/test_issue4313.py | 3 +- .../tests/serialize/test_serialize_config.py | 126 ++++++ .../serialize/test_serialize_language.py | 3 +- .../serialize/test_serialize_pipeline.py | 71 ++-- spacy/tests/test_tok2vec.py | 36 +- spacy/util.py | 47 ++- 64 files changed, 1511 insertions(+), 1213 deletions(-) delete mode 100644 spacy/ml/component_models.py create mode 100644 spacy/ml/models/__init__.py create mode 100644 spacy/ml/models/defaults/__init__.py create mode 100644 spacy/ml/models/defaults/entity_linker_defaults.cfg create mode 100644 spacy/ml/models/defaults/morphologizer_defaults.cfg create mode 100644 spacy/ml/models/defaults/ner_defaults.cfg create mode 100644 spacy/ml/models/defaults/parser_defaults.cfg create mode 100644 spacy/ml/models/defaults/sentrec_defaults.cfg create mode 100644 spacy/ml/models/defaults/tagger_defaults.cfg create mode 100644 spacy/ml/models/defaults/tensorizer_defaults.cfg create mode 100644 spacy/ml/models/defaults/textcat_defaults.cfg create mode 100644 spacy/ml/models/defaults/tok2vec_defaults.cfg create mode 100644 spacy/ml/models/entity_linker.py create mode 100644 spacy/ml/models/multi_task.py create mode 100644 spacy/ml/models/parser.py create mode 100644 spacy/ml/models/tagger.py create mode 100644 spacy/ml/models/tensorizer.py create mode 100644 spacy/ml/models/textcat.py create mode 100644 spacy/ml/models/tok2vec.py create mode 100644 spacy/tests/serialize/test_serialize_config.py diff --git a/MANIFEST.in b/MANIFEST.in index 1947b9140..e6d25284f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ recursive-include include *.h -recursive-include spacy *.txt *.pyx *.pxd +recursive-include spacy *.pyx *.pxd *.txt *.cfg include LICENSE include README.md include bin/spacy diff --git a/bin/ud/ud_train.py 
b/bin/ud/ud_train.py index bda22088d..aa5050f3a 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -386,8 +386,8 @@ def _load_pretrained_tok2vec(nlp, loc): weights_data = file_.read() loaded = [] for name, component in nlp.pipeline: - if hasattr(component, "model") and hasattr(component.model, "tok2vec"): - component.tok2vec.from_bytes(weights_data) + if hasattr(component, "model") and component.model.has_ref("tok2vec"): + component.get_ref("tok2vec").from_bytes(weights_data) loaded.append(name) return loaded diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index d98bba565..b0cfbb4c6 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -1,13 +1,9 @@ -# coding: utf-8 from random import shuffle import logging import numpy as np -from thinc.model import Model -from thinc.api import chain -from thinc.loss import CosineDistance -from thinc.layers import Linear +from thinc.api import Model, chain, CosineDistance, Linear from spacy.util import create_default_optimizer diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 8cd150868..4f1a915c5 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -39,25 +39,27 @@ factory = "tagger" factory = "parser" [nlp.pipeline.tagger.model] -@architectures = "tagger_model.v1" +@architectures = "spacy.Tagger.v1" [nlp.pipeline.tagger.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = "spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.parser.model] -@architectures = "transition_based_parser.v1" +@architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 8 hidden_width = 64 maxout_pieces = 3 [nlp.pipeline.parser.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = 
"spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.tok2vec.model] -@architectures = "hash_embed_bilstm.v1" +@architectures = "spacy.HashEmbedBiLSTM.v1" pretrained_vectors = ${nlp:vectors} width = 96 depth = 4 embed_size = 2000 +subword_features = true +char_embed = false diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 6735284a7..2ceaab0be 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -39,27 +39,28 @@ factory = "tagger" factory = "parser" [nlp.pipeline.tagger.model] -@architectures = "tagger_model.v1" +@architectures = "spacy.Tagger.v1" [nlp.pipeline.tagger.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = "spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.parser.model] -@architectures = "transition_based_parser.v1" +@architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 8 hidden_width = 64 maxout_pieces = 3 [nlp.pipeline.parser.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = "spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.tok2vec.model] -@architectures = "hash_embed_cnn.v1" +@architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = ${nlp:vectors} width = 96 depth = 4 window_size = 1 embed_size = 2000 maxout_pieces = 3 +subword_features = true diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index 85d36fd66..0aefec9ef 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -20,9 +20,9 @@ import random import ml_datasets import spacy -from spacy.util import minibatch, use_gpu, compounding +from spacy.util import minibatch from spacy.pipeline import TextCategorizer -from spacy.ml.tok2vec import Tok2Vec +from spacy.ml.models.tok2vec import build_Tok2Vec_model import 
numpy @@ -65,9 +65,7 @@ def prefer_gpu(): def build_textcat_model(tok2vec, nr_class, width): - from thinc.model import Model - from thinc.layers import Softmax, chain, reduce_mean - from thinc.layers import list2ragged + from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged with Model.define_operators({">>": chain}): model = ( @@ -76,7 +74,7 @@ def build_textcat_model(tok2vec, nr_class, width): >> reduce_mean() >> Softmax(nr_class, width) ) - model.tok2vec = tok2vec + model.set_ref("tok2vec", tok2vec) return model @@ -97,8 +95,9 @@ def create_pipeline(width, embed_size, vectors_model): textcat = TextCategorizer( nlp.vocab, labels=["POSITIVE", "NEGATIVE"], + # TODO: replace with config version model=build_textcat_model( - Tok2Vec(width=width, embed_size=embed_size), 2, width + build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width ), ) @@ -121,7 +120,7 @@ def train_tensorizer(nlp, texts, dropout, n_iter): def train_textcat(nlp, n_texts, n_iter=10): textcat = nlp.get_pipe("textcat") - tok2vec_weights = textcat.model.tok2vec.to_bytes() + tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes() (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) print( "Using {} examples ({} training, {} evaluation)".format( @@ -135,7 +134,7 @@ def train_textcat(nlp, n_texts, n_iter=10): other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] with nlp.disable_pipes(*other_pipes): # only train textcat optimizer = nlp.begin_training() - textcat.model.tok2vec.from_bytes(tok2vec_weights) + textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights) print("Training the model...") print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) for i in range(n_iter): diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 4d402e04d..50c852ac1 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -74,7 +74,7 @@ def main(model=None, 
output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None optimizer = nlp.begin_training() if init_tok2vec is not None: with init_tok2vec.open("rb") as file_: - textcat.model.tok2vec.from_bytes(file_.read()) + textcat.model.get_ref("tok2vec").from_bytes(file_.read()) print("Training the model...") print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) batch_sizes = compounding(4.0, 32.0, 1.001) diff --git a/pyproject.toml b/pyproject.toml index 71e523c7c..ee28d5d42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==8.0.0a0", + "thinc==8.0.0a1", "blis>=0.4.0,<0.5.0" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index f3a7cc162..09998cdc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==8.0.0a0 +thinc==8.0.0a1 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 980269c35..7b3a468b6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,13 +36,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==8.0.0a0 + thinc==8.0.0a1 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==8.0.0a0 + thinc==8.0.0a1 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 690e3107d..95d549254 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -11,10 +11,10 @@ import srsly from ..gold import Example from ..errors import Errors +from ..ml.models.multi_task import build_masked_language_model from ..tokens import Doc from ..attrs import ID, HEAD -from ..ml.component_models import Tok2Vec -from ..ml.component_models import masked_language_model +from ..ml.models.tok2vec import build_Tok2Vec_model from .. 
import util from ..util import create_default_optimizer from .train import _load_pretrained_tok2vec @@ -108,14 +108,19 @@ def pretrain( pretrained_vectors = None if not use_vectors else nlp.vocab.vectors model = create_pretraining_model( nlp, - Tok2Vec( + # TODO: replace with config + build_Tok2Vec_model( width, embed_rows, conv_depth=conv_depth, pretrained_vectors=pretrained_vectors, bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. subword_features=not use_chars, # Set to False for Chinese etc - cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. + maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. + window_size=1, + char_embed=False, + nM=64, + nC=8 ), ) # Load in pretrained weights @@ -152,7 +157,7 @@ def pretrain( is_temp_str = ".temp" if is_temp else "" with model.use_params(optimizer.averages): with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: - file_.write(model.tok2vec.to_bytes()) + file_.write(model.get_ref("tok2vec").to_bytes()) log = { "nr_word": tracker.nr_word, "loss": tracker.loss, @@ -284,7 +289,7 @@ def create_pretraining_model(nlp, tok2vec): # "tok2vec" has to be the same set of processes as what the components do. 
tok2vec = chain(tok2vec, list2array()) model = chain(tok2vec, output_layer) - model = masked_language_model(nlp.vocab, model) + model = build_masked_language_model(nlp.vocab, model) model.set_ref("tok2vec", tok2vec) model.set_ref("output_layer", output_layer) model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 92f94b53d..5667bb905 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -9,7 +9,7 @@ from wasabi import msg import contextlib import random -from ..util import create_default_optimizer +from ..util import create_default_optimizer, registry from ..util import use_gpu as set_gpu from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import GoldCorpus @@ -111,6 +111,8 @@ def train( eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] + default_dir = Path(__file__).parent.parent / "ml" / "models" / "defaults" + # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. @@ -118,7 +120,6 @@ def train( msg.text(f"Training pipeline: {pipeline}") disabled_pipes = None pipes_added = False - msg.text(f"Training pipeline: {pipeline}") if use_gpu >= 0: activated_gpu = None try: @@ -140,16 +141,36 @@ def train( f"specified as `lang` argument ('{lang}') ", exits=1, ) + if vectors: + msg.text(f"Loading vectors from model '{vectors}'") + + nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline]) for pipe in pipeline: - pipe_cfg = {} + # first, create the model. 
+ # Bit of a hack after the refactor to get the vectors into a default config + # use train-from-config instead :-) if pipe == "parser": - pipe_cfg = {"learn_tokens": learn_tokens} + config_loc = default_dir / "parser_defaults.cfg" + elif pipe == "tagger": + config_loc = default_dir / "tagger_defaults.cfg" + elif pipe == "ner": + config_loc = default_dir / "ner_defaults.cfg" elif pipe == "textcat": - pipe_cfg = { - "exclusive_classes": not textcat_multilabel, - "architecture": textcat_arch, - "positive_label": textcat_positive_label, - } + config_loc = default_dir / "textcat_defaults.cfg" + else: + raise ValueError(f"Component {pipe} currently not supported.") + pipe_cfg = util.load_config(config_loc, create_objects=False) + if vectors: + pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors} + pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config + + if pipe == "parser": + pipe_cfg["learn_tokens"] = learn_tokens + elif pipe == "textcat": + pipe_cfg["exclusive_classes"] = not textcat_multilabel + pipe_cfg["architecture"] = textcat_arch + pipe_cfg["positive_label"] = textcat_positive_label + if pipe not in nlp.pipe_names: msg.text(f"Adding component to base model '{pipe}'") nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) @@ -181,26 +202,42 @@ def train( msg.text(f"Starting with blank model '{lang}'") lang_cls = util.get_lang_class(lang) nlp = lang_cls() + + if vectors: + msg.text(f"Loading vectors from model '{vectors}'") + for pipe in pipeline: + # first, create the model. 
+ # Bit of a hack after the refactor to get the vectors into a default config + # use train-from-config instead :-) if pipe == "parser": - pipe_cfg = {"learn_tokens": learn_tokens} + config_loc = default_dir / "parser_defaults.cfg" + elif pipe == "tagger": + config_loc = default_dir / "tagger_defaults.cfg" + elif pipe == "ner": + config_loc = default_dir / "ner_defaults.cfg" elif pipe == "textcat": - pipe_cfg = { - "exclusive_classes": not textcat_multilabel, - "architecture": textcat_arch, - "positive_label": textcat_positive_label, - } + config_loc = default_dir / "textcat_defaults.cfg" else: - pipe_cfg = {} - nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) + raise ValueError(f"Component {pipe} currently not supported.") + pipe_cfg = util.load_config(config_loc, create_objects=False) + if vectors: + pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors} + pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config + + if pipe == "parser": + pipe_cfg["learn_tokens"] = learn_tokens + elif pipe == "textcat": + pipe_cfg["exclusive_classes"] = not textcat_multilabel + pipe_cfg["architecture"] = textcat_arch + pipe_cfg["positive_label"] = textcat_positive_label + + pipe = nlp.create_pipe(pipe, config=pipe_cfg) + nlp.add_pipe(pipe) # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) - if vectors: - msg.text(f"Loading vector from model '{vectors}'") - _load_vectors(nlp, vectors) - # Multitask objectives multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] for pipe_name, multitasks in multitask_options: @@ -228,7 +265,7 @@ def train( optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg) nlp._optimizer = None - # Load in pretrained weights + # Load in pretrained weights (TODO: this may be broken in the config rewrite) if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text(f"Loaded pretrained tok2vec for: {components}") 
@@ -531,7 +568,7 @@ def _create_progress_bar(total): def _load_vectors(nlp, vectors): - util.load_model(vectors, vocab=nlp.vocab) + loaded_model = util.load_model(vectors, vocab=nlp.vocab) for lex in nlp.vocab: values = {} for attr, func in nlp.vocab.lex_attr_getters.items(): @@ -541,6 +578,7 @@ def _load_vectors(nlp, vectors): values[lex.vocab.strings[attr]] = func(lex.orth_) lex.set_attrs(**values) lex.is_oov = False + return loaded_model def _load_pretrained_tok2vec(nlp, loc): @@ -551,8 +589,8 @@ def _load_pretrained_tok2vec(nlp, loc): weights_data = file_.read() loaded = [] for name, component in nlp.pipeline: - if hasattr(component, "model") and hasattr(component.model, "tok2vec"): - component.tok2vec.from_bytes(weights_data) + if hasattr(component, "model") and component.model.has_ref("tok2vec"): + component.get_ref("tok2vec").from_bytes(weights_data) loaded.append(name) return loaded diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 9150da356..0dba8a962 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,19 +1,17 @@ from typing import Optional, Dict, List, Union, Sequence +from pydantic import BaseModel, FilePath, StrictInt + import plac -from wasabi import msg +import tqdm from pathlib import Path + +from wasabi import msg import thinc import thinc.schedules from thinc.api import Model -from pydantic import BaseModel, FilePath, StrictInt -import tqdm -# TODO: relative imports? -import spacy -from spacy.gold import GoldCorpus -from spacy.pipeline.tok2vec import Tok2VecListener -from spacy.ml import component_models -from spacy import util +from ..gold import GoldCorpus +from .. 
import util registry = util.registry @@ -57,23 +55,24 @@ factory = "tok2vec" factory = "ner" [nlp.pipeline.ner.model] -@architectures = "transition_based_ner.v1" +@architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 3 hidden_width = 64 maxout_pieces = 3 [nlp.pipeline.ner.model.tok2vec] -@architectures = "tok2vec_tensors.v1" +@architectures = "spacy.Tok2VecTensors.v1" width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.tok2vec.model] -@architectures = "hash_embed_cnn.v1" +@architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = ${nlp:vectors} width = 128 depth = 4 window_size = 1 embed_size = 10000 maxout_pieces = 3 +subword_features = true """ @@ -113,65 +112,6 @@ class ConfigSchema(BaseModel): extra = "allow" -# Of course, these would normally decorate the functions where they're defined. -# But for now... -@registry.architectures.register("hash_embed_cnn.v1") -def hash_embed_cnn( - pretrained_vectors, width, depth, embed_size, maxout_pieces, window_size -): - return component_models.Tok2Vec( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - conv_depth=depth, - cnn_maxout_pieces=maxout_pieces, - bilstm_depth=0, - window_size=window_size, - ) - - -@registry.architectures.register("hash_embed_bilstm.v1") -def hash_embed_bilstm_v1(pretrained_vectors, width, depth, embed_size): - return component_models.Tok2Vec( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - bilstm_depth=depth, - conv_depth=0, - cnn_maxout_pieces=0, - ) - - -@registry.architectures.register("tagger_model.v1") -def build_tagger_model_v1(tok2vec): - return component_models.build_tagger_model(nr_class=None, tok2vec=tok2vec) - - -@registry.architectures.register("transition_based_parser.v1") -def create_tb_parser_model( - tok2vec: Model, - nr_feature_tokens: StrictInt = 3, - hidden_width: StrictInt = 64, - maxout_pieces: StrictInt = 3, -): - from thinc.api import Linear, chain, list2array, use_ops, zero_init - 
from spacy.ml._layers import PrecomputableAffine - from spacy.syntax._parser_model import ParserModel - - token_vector_width = tok2vec.get_dim("nO") - tok2vec = chain(tok2vec, list2array()) - tok2vec.set_dim("nO", token_vector_width) - - lower = PrecomputableAffine( - hidden_width, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), nP=maxout_pieces - ) - lower.set_dim("nP", maxout_pieces) - with use_ops("numpy"): - # Initialize weights at zero, as it's a classification layer. - upper = Linear(init_W=zero_init) - return ParserModel(tok2vec, lower, upper) - - @plac.annotations( # fmt: off train_path=("Location of JSON-formatted training data", "positional", None, Path), @@ -224,23 +164,25 @@ def train_from_config( config_path, data_paths, raw_text=None, meta_path=None, output_path=None, ): msg.info(f"Loading config from: {config_path}") - config = util.load_from_config(config_path, create_objects=True) + config = util.load_config(config_path, create_objects=True) use_gpu = config["training"]["use_gpu"] if use_gpu >= 0: msg.info("Using GPU") else: msg.info("Using CPU") msg.info("Creating nlp from config") - nlp = create_nlp_from_config(**config["nlp"]) + nlp_config = util.load_config(config_path, create_objects=False)["nlp"] + nlp = util.load_model_from_config(nlp_config) optimizer = config["optimizer"] - limit = config["training"]["limit"] + training = config["training"] + limit = training["limit"] msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) msg.info("Initializing the nlp pipeline") nlp.begin_training(lambda: corpus.train_examples, device=use_gpu) - train_batches = create_train_batches(nlp, corpus, config["training"]) - evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"]) + train_batches = create_train_batches(nlp, corpus, training) + evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) # Create iterator, which yields out info after each optimization step. 
msg.info("Start training") @@ -249,16 +191,16 @@ def train_from_config( optimizer, train_batches, evaluate, - config["training"]["dropout"], - config["training"]["patience"], - config["training"]["eval_frequency"], + training["dropout"], + training["patience"], + training["eval_frequency"], ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") - print_row = setup_printer(config) + print_row = setup_printer(training, nlp) try: - progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False) + progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) for batch, info, is_best_checkpoint in training_step_iterator: progress.update(1) if is_best_checkpoint is not None: @@ -266,9 +208,7 @@ def train_from_config( print_row(info) if is_best_checkpoint and output_path is not None: nlp.to_disk(output_path) - progress = tqdm.tqdm( - total=config["training"]["eval_frequency"], leave=False - ) + progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) finally: if output_path is not None: with nlp.use_params(optimizer.averages): @@ -280,18 +220,6 @@ def train_from_config( # msg.good("Created best model", best_model_path) -def create_nlp_from_config(lang, vectors, pipeline): - lang_class = spacy.util.get_lang_class(lang) - nlp = lang_class() - if vectors is not None: - spacy.cli.train._load_vectors(nlp, vectors) - for name, component_cfg in pipeline.items(): - factory = component_cfg.pop("factory") - component = nlp.create_pipe(factory, config=component_cfg) - nlp.add_pipe(component, name=name) - return nlp - - def create_train_batches(nlp, corpus, cfg): while True: train_examples = corpus.train_dataset( @@ -405,10 +333,10 @@ def subdivide_batch(batch): return [batch] -def setup_printer(config): - score_cols = config["training"]["scores"] +def setup_printer(training, nlp): + score_cols = training["scores"] score_widths = [max(len(col), 6) for col in score_cols] - loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]] + 
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] loss_widths = [max(len(col), 8) for col in loss_cols] table_header = ["#"] + loss_cols + score_cols + ["Score"] table_header = [col.upper() for col in table_header] @@ -420,20 +348,13 @@ def setup_printer(config): def print_row(info): losses = [ - "{0:.2f}".format(info["losses"].get(col, 0.0)) - for col in config["nlp"]["pipeline"] + "{0:.2f}".format(info["losses"].get(pipe_name, 0.0)) + for pipe_name in nlp.pipe_names ] scores = [ - "{0:.2f}".format(info["other_scores"].get(col, 0.0)) - for col in config["training"]["scores"] + "{0:.2f}".format(info["other_scores"].get(col, 0.0)) for col in score_cols ] data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])] msg.row(data, widths=table_widths, aligns=table_aligns) return print_row - - -@registry.architectures.register("tok2vec_tensors.v1") -def tok2vec_tensors_v1(width): - tok2vec = Tok2VecListener("tok2vec", width=width) - return tok2vec diff --git a/spacy/errors.py b/spacy/errors.py index 7a4953cce..6afbfc3c6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -106,6 +106,12 @@ class Warnings(object): "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " "string \"Field1=Value1,Value2|Field2=Value3\".") + # TODO: fix numbering after merging develop into master + W098 = ("No Model config was provided to create the '{name}' component, " + "so a default configuration was used.") + W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', " + "but got '{type}' instead, so ignoring it.") + @add_codes class Errors(object): @@ -227,7 +233,7 @@ class Errors(object): E050 = ("Can't find model '{name}'. 
It doesn't seem to be a Python " "package or a valid path to a data directory.") E052 = ("Can't find model directory: {path}") - E053 = ("Could not read meta.json from {path}") + E053 = ("Could not read {name} from {path}") E054 = ("No valid '{setting}' setting found in model meta.json.") E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}") E056 = ("Invalid tokenizer exception: ORTH values combined don't match " @@ -345,8 +351,8 @@ class Errors(object): E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated " "in favor of the pipe name `sentencizer`, which does the same " "thing. For example, use `nlp.create_pipeline('sentencizer')`") - E109 = ("Model for component '{name}' not initialized. Did you forget to " - "load a model, or forget to call begin_training()?") + E109 = ("Component '{name}' could not be run. Did you forget to " + "call begin_training()?") E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}") E111 = ("Pickling a token is not supported, because tokens are only views " "of the parent Doc and can't exist on their own. A pickled token " @@ -532,6 +538,9 @@ class Errors(object): "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") # TODO: fix numbering after merging develop into master + E993 = ("The config for 'nlp' should include either a key 'name' to " + "refer to an existing model by name or path, or a key 'lang' " + "to create a new blank model.") E996 = ("Could not parse {file}: {msg}") E997 = ("Tokenizer special cases are not allowed to modify the text. 
" "This would map '{chunk}' to '{orth}' given token attributes " diff --git a/spacy/language.py b/spacy/language.py index 1c6014cec..83f8c9d21 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -4,7 +4,9 @@ import weakref import functools from contextlib import contextmanager from copy import copy, deepcopy -from thinc.api import get_current_ops +from pathlib import Path + +from thinc.api import get_current_ops, Config import srsly import multiprocessing as mp from itertools import chain, cycle @@ -16,7 +18,7 @@ from .lookups import Lookups from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs from .gold import Example from .scorer import Scorer -from .util import link_vectors_to_models, create_default_optimizer +from .util import link_vectors_to_models, create_default_optimizer, registry from .attrs import IS_STOP, LANG from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES @@ -24,7 +26,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop -from .errors import Errors, Warnings, deprecation_warning +from .errors import Errors, Warnings, deprecation_warning, user_warning from . import util from . import about @@ -128,7 +130,7 @@ class Language(object): factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)} def __init__( - self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, **kwargs + self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs ): """Initialise a Language object. @@ -138,6 +140,7 @@ class Language(object): object. Usually a `Tokenizer`. meta (dict): Custom meta data for the Language class. Is written to by models to add model meta data. + config (Config): Configuration data for creating the pipeline components. max_length (int) : Maximum number of characters in a single text. 
The current v2 models may run out memory on extremely long texts, due to large internal @@ -152,6 +155,9 @@ class Language(object): user_factories = util.registry.factories.get_all() self.factories.update(user_factories) self._meta = dict(meta) + self._config = config + if not self._config: + self._config = Config() self._path = None if vocab is True: factory = self.Defaults.create_vocab @@ -170,6 +176,21 @@ class Language(object): self.max_length = max_length self._optimizer = None + from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \ + default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \ + default_tensorizer_config, default_tok2vec_config + + self.defaults = {"tagger": default_tagger_config(), + "parser": default_parser_config(), + "ner": default_ner_config(), + "textcat": default_textcat_config(), + "entity_linker": default_nel_config(), + "morphologizer": default_morphologizer_config(), + "sentrec": default_sentrec_config(), + "tensorizer": default_tensorizer_config(), + "tok2vec": default_tok2vec_config(), + } + @property def path(self): return self._path @@ -203,6 +224,10 @@ class Language(object): def meta(self, value): self._meta = value + @property + def config(self): + return self._config + # Conveniences to access pipeline components # Shouldn't be used anymore! 
@property @@ -293,7 +318,24 @@ class Language(object): else: raise KeyError(Errors.E002.format(name=name)) factory = self.factories[name] - return factory(self, **config) + default_config = self.defaults.get(name, None) + + # transform the model's config to an actual Model + model_cfg = None + if "model" in config: + model_cfg = config["model"] + if not isinstance(model_cfg, dict): + user_warning(Warnings.W099.format(type=type(model_cfg), pipe=name)) + model_cfg = None + del config["model"] + if model_cfg is None and default_config is not None: + user_warning(Warnings.W098) + model_cfg = default_config["model"] + model = None + if model_cfg is not None: + self.config[name] = {"model": model_cfg} + model = registry.make_from_config({"model": model_cfg}, validate=True)["model"] + return factory(self, model, **config) def add_pipe( self, component, name=None, before=None, after=None, first=None, last=None @@ -430,7 +472,10 @@ class Language(object): continue if not hasattr(proc, "__call__"): raise ValueError(Errors.E003.format(component=type(proc), name=name)) - doc = proc(doc, **component_cfg.get(name, {})) + try: + doc = proc(doc, **component_cfg.get(name, {})) + except KeyError: + raise ValueError(Errors.E109.format(name=name)) if doc is None: raise ValueError(Errors.E005.format(name=name)) return doc @@ -578,9 +623,6 @@ class Language(object): ops = get_current_ops() self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) link_vectors_to_models(self.vocab) - if self.vocab.vectors.data.shape[1]: - cfg["pretrained_vectors"] = self.vocab.vectors.name - cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1] if sgd is None: sgd = create_default_optimizer() self._optimizer = sgd @@ -611,8 +653,6 @@ class Language(object): if self.vocab.vectors.data.shape[1] >= 1: self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) link_vectors_to_models(self.vocab) - if self.vocab.vectors.data.shape[1]: - cfg["pretrained_vectors"] = self.vocab.vectors if sgd is 
None: sgd = create_default_optimizer() self._optimizer = sgd @@ -868,6 +908,7 @@ class Language(object): serializers["meta.json"] = lambda p: p.open("w").write( srsly.json_dumps(self.meta) ) + serializers["config.cfg"] = lambda p: self.config.to_disk(p) for name, proc in self.pipeline: if not hasattr(proc, "name"): continue @@ -895,6 +936,8 @@ class Language(object): exclude = disable path = util.ensure_path(path) deserializers = {} + if Path(path / "config.cfg").exists(): + deserializers["config.cfg"] = lambda p: self.config.from_disk(p) deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p)) deserializers["vocab"] = lambda p: self.vocab.from_disk( p @@ -933,6 +976,7 @@ class Language(object): serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) + serializers["config.cfg"] = lambda: self.config.to_bytes() for name, proc in self.pipeline: if name in exclude: continue @@ -955,6 +999,7 @@ class Language(object): deprecation_warning(Warnings.W014) exclude = disable deserializers = {} + deserializers["config.cfg"] = lambda b: self.config.from_bytes(b) deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) deserializers["vocab"] = lambda b: self.vocab.from_bytes( b @@ -981,8 +1026,8 @@ class component(object): and class components and will automatically register components in the Language.factories. If the component is a class and needs access to the nlp object or config parameters, it can expose a from_nlp classmethod - that takes the nlp object and **cfg arguments and returns the initialized - component. + that takes the nlp & model objects and **cfg arguments, and returns the + initialized component. 
""" # NB: This decorator needs to live here, because it needs to write to @@ -1011,9 +1056,9 @@ class component(object): obj.requires = self.requires obj.retokenizes = self.retokenizes - def factory(nlp, **cfg): + def factory(nlp, model, **cfg): if hasattr(obj, "from_nlp"): - return obj.from_nlp(nlp, **cfg) + return obj.from_nlp(nlp, model, **cfg) elif isinstance(obj, type): return obj() return obj diff --git a/spacy/ml/component_models.py b/spacy/ml/component_models.py deleted file mode 100644 index 8c694f950..000000000 --- a/spacy/ml/component_models.py +++ /dev/null @@ -1,227 +0,0 @@ -from spacy import util -from spacy.ml.extract_ngrams import extract_ngrams - -from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE -from ..errors import Errors -from ._character_embed import CharacterEmbed - -from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged -from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors -from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain -from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued -from thinc.api import zero_init - - -def build_text_classifier(arch, config): - if arch == "cnn": - return build_simple_cnn_text_classifier(**config) - elif arch == "bow": - return build_bow_text_classifier(**config) - else: - raise ValueError("Unexpected textcat arch") - - -def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg): - """ - Build a simple CNN text classifier, given a token-to-vector model as inputs. - If exclusive_classes=True, a softmax non-linearity is applied, so that the - outputs sum to 1. If exclusive_classes=False, a logistic non-linearity - is applied instead, so that outputs are in the range [0, 1]. 
- """ - with Model.define_operators({">>": chain}): - if exclusive_classes: - output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO")) - else: - # TODO: experiment with init_w=zero_init - output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic() - model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer - model.set_ref("tok2vec", tok2vec) - model.set_dim("nO", nr_class) - return model - - -def build_bow_text_classifier( - nr_class, exclusive_classes, ngram_size=1, no_output_layer=False, **cfg -): - with Model.define_operators({">>": chain}): - model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nr_class) - model.to_cpu() - if not no_output_layer: - output_layer = ( - Softmax(nO=nr_class) if exclusive_classes else Logistic(nO=nr_class) - ) - output_layer.to_cpu() - model = model >> output_layer - model.set_dim("nO", nr_class) - return model - - -def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): - if "entity_width" not in cfg: - raise ValueError(Errors.E144.format(param="entity_width")) - - conv_depth = cfg.get("conv_depth", 2) - cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) - pretrained_vectors = cfg.get("pretrained_vectors", None) - context_width = cfg.get("entity_width") - - with Model.define_operators({">>": chain, "**": clone}): - nel_tok2vec = Tok2Vec( - width=hidden_width, - embed_size=embed_width, - pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, - subword_features=True, - conv_depth=conv_depth, - bilstm_depth=0, - ) - - model = ( - nel_tok2vec - >> list2ragged() - >> reduce_mean() - >> residual(Maxout(nO=hidden_width, nI=hidden_width, nP=2, dropout=0.0)) - >> Linear(nO=context_width, nI=hidden_width) - ) - model.initialize() - - model.set_ref("tok2vec", nel_tok2vec) - model.set_dim("nO", context_width) - return model - - -def masked_language_model(*args, **kwargs): - raise NotImplementedError - - -def build_tagger_model(nr_class, tok2vec): - token_vector_width = 
tok2vec.get_dim("nO") - # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! - softmax = with_array(Softmax(nO=nr_class, nI=token_vector_width, init_W=zero_init)) - model = chain(tok2vec, softmax) - model.set_ref("tok2vec", tok2vec) - model.set_ref("softmax", softmax) - return model - - -def build_morphologizer_model(class_nums, **cfg): - embed_size = util.env_opt("embed_size", 7000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] - else: - token_vector_width = util.env_opt("token_vector_width", 128) - pretrained_vectors = cfg.get("pretrained_vectors") - char_embed = cfg.get("char_embed", True) - with Model.define_operators({">>": chain, "+": add, "**": clone}): - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - tok2vec = Tok2Vec( - token_vector_width, - embed_size, - char_embed=char_embed, - pretrained_vectors=pretrained_vectors, - ) - softmax = with_array(MultiSoftmax(nOs=class_nums, nI=token_vector_width)) - model = tok2vec >> softmax - model.set_ref("tok2vec", tok2vec) - model.set_ref("softmax", softmax) - return model - - -def Tok2Vec( - width, - embed_size, - pretrained_vectors=None, - window_size=1, - cnn_maxout_pieces=3, - subword_features=True, - char_embed=False, - conv_depth=4, - bilstm_depth=0, -): - if char_embed: - subword_features = False - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0) - if subword_features: - prefix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0 - ) - suffix = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0 - ) - shape = HashEmbed( - nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0 - ) - else: - prefix, suffix, shape = (None, None, None) - if pretrained_vectors is not None: - glove = StaticVectors( - 
vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0 - ) - - if subword_features: - embed = uniqued( - (glove | norm | prefix | suffix | shape) - >> Maxout( - nO=width, nI=width * 5, nP=3, dropout=0.0, normalize=True - ), - column=cols.index(ORTH), - ) - else: - embed = uniqued( - (glove | norm) - >> Maxout( - nO=width, nI=width * 2, nP=3, dropout=0.0, normalize=True - ), - column=cols.index(ORTH), - ) - elif subword_features: - embed = uniqued( - concatenate(norm, prefix, suffix, shape) - >> Maxout(nO=width, nI=width * 4, nP=3, dropout=0.0, normalize=True), - column=cols.index(ORTH), - ) - elif char_embed: - embed = CharacterEmbed(nM=64, nC=8) | FeatureExtractor(cols) >> with_array( - norm - ) - reduce_dimensions = Maxout( - nO=width, - nI=64 * 8 + width, - nP=cnn_maxout_pieces, - dropout=0.0, - normalize=True, - ) - else: - embed = norm - - convolution = residual( - expand_window(window_size=window_size) - >> Maxout( - nO=width, - nI=width * 3, - nP=cnn_maxout_pieces, - dropout=0.0, - normalize=True, - ) - ) - if char_embed: - tok2vec = embed >> with_array( - reduce_dimensions >> convolution ** conv_depth, pad=conv_depth - ) - else: - tok2vec = FeatureExtractor(cols) >> with_array( - embed >> convolution ** conv_depth, pad=conv_depth - ) - - if bilstm_depth >= 1: - tok2vec = tok2vec >> PyTorchLSTM( - nO=width, nI=width, depth=bilstm_depth, bi=True - ) - # Work around thinc API limitations :(. 
TODO: Revise in Thinc 7 - tok2vec.set_dim("nO", width) - tok2vec.set_ref("embed", embed) - return tok2vec diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py new file mode 100644 index 000000000..56696d581 --- /dev/null +++ b/spacy/ml/models/__init__.py @@ -0,0 +1,6 @@ +from .entity_linker import * +from .parser import * +from .tagger import * +from .tensorizer import * +from .textcat import * +from .tok2vec import * diff --git a/spacy/ml/models/defaults/__init__.py b/spacy/ml/models/defaults/__init__.py new file mode 100644 index 000000000..9af4da87d --- /dev/null +++ b/spacy/ml/models/defaults/__init__.py @@ -0,0 +1,93 @@ +from pathlib import Path + +from .... import util + + +def default_nel_config(): + loc = Path(__file__).parent / "entity_linker_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_nel(): + loc = Path(__file__).parent / "entity_linker_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_morphologizer_config(): + loc = Path(__file__).parent / "morphologizer_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_morphologizer(): + loc = Path(__file__).parent / "morphologizer_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_parser_config(): + loc = Path(__file__).parent / "parser_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_parser(): + loc = Path(__file__).parent / "parser_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_ner_config(): + loc = Path(__file__).parent / "ner_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_ner(): + loc = Path(__file__).parent / "ner_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_sentrec_config(): + loc = Path(__file__).parent / "sentrec_defaults.cfg" + return util.load_config(loc, 
create_objects=False) + + +def default_sentrec(): + loc = Path(__file__).parent / "sentrec_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_tagger_config(): + loc = Path(__file__).parent / "tagger_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_tagger(): + loc = Path(__file__).parent / "tagger_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_tensorizer_config(): + loc = Path(__file__).parent / "tensorizer_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_tensorizer(): + loc = Path(__file__).parent / "tensorizer_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_textcat_config(): + loc = Path(__file__).parent / "textcat_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_textcat(): + loc = Path(__file__).parent / "textcat_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] + + +def default_tok2vec_config(): + loc = Path(__file__).parent / "tok2vec_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_tok2vec(): + loc = Path(__file__).parent / "tok2vec_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] diff --git a/spacy/ml/models/defaults/entity_linker_defaults.cfg b/spacy/ml/models/defaults/entity_linker_defaults.cfg new file mode 100644 index 000000000..6a591ec3e --- /dev/null +++ b/spacy/ml/models/defaults/entity_linker_defaults.cfg @@ -0,0 +1,12 @@ +[model] +@architectures = "spacy.EntityLinker.v1" + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 2 +embed_size = 300 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/morphologizer_defaults.cfg b/spacy/ml/models/defaults/morphologizer_defaults.cfg new file mode 100644 index 000000000..80e776c4f --- /dev/null 
+++ b/spacy/ml/models/defaults/morphologizer_defaults.cfg @@ -0,0 +1,14 @@ +[model] +@architectures = "spacy.Tagger.v1" + +[model.tok2vec] +@architectures = "spacy.HashCharEmbedCNN.v1" +pretrained_vectors = null +width = 128 +depth = 4 +embed_size = 7000 +window_size = 1 +maxout_pieces = 3 +subword_features = true +nM = 64 +nC = 8 diff --git a/spacy/ml/models/defaults/ner_defaults.cfg b/spacy/ml/models/defaults/ner_defaults.cfg new file mode 100644 index 000000000..db2c131f5 --- /dev/null +++ b/spacy/ml/models/defaults/ner_defaults.cfg @@ -0,0 +1,15 @@ +[model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 6 +hidden_width = 64 +maxout_pieces = 2 + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/parser_defaults.cfg b/spacy/ml/models/defaults/parser_defaults.cfg new file mode 100644 index 000000000..9cbb6eadb --- /dev/null +++ b/spacy/ml/models/defaults/parser_defaults.cfg @@ -0,0 +1,15 @@ +[model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 8 +hidden_width = 64 +maxout_pieces = 2 + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/sentrec_defaults.cfg b/spacy/ml/models/defaults/sentrec_defaults.cfg new file mode 100644 index 000000000..a039a4533 --- /dev/null +++ b/spacy/ml/models/defaults/sentrec_defaults.cfg @@ -0,0 +1,14 @@ +[model] +@architectures = "spacy.Tagger.v1" + +[model.tok2vec] +@architectures = "spacy.HashCharEmbedCNN.v1" +pretrained_vectors = null +width = 12 +depth = 1 +embed_size = 2000 +window_size = 1 +maxout_pieces = 2 +subword_features = true +nM = 64 +nC = 8 diff --git a/spacy/ml/models/defaults/tagger_defaults.cfg 
b/spacy/ml/models/defaults/tagger_defaults.cfg new file mode 100644 index 000000000..5aea80a32 --- /dev/null +++ b/spacy/ml/models/defaults/tagger_defaults.cfg @@ -0,0 +1,12 @@ +[model] +@architectures = "spacy.Tagger.v1" + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/tensorizer_defaults.cfg b/spacy/ml/models/defaults/tensorizer_defaults.cfg new file mode 100644 index 000000000..81880a109 --- /dev/null +++ b/spacy/ml/models/defaults/tensorizer_defaults.cfg @@ -0,0 +1,4 @@ +[model] +@architectures = "spacy.Tensorizer.v1" +input_size=96 +output_size=300 diff --git a/spacy/ml/models/defaults/textcat_defaults.cfg b/spacy/ml/models/defaults/textcat_defaults.cfg new file mode 100644 index 000000000..cea1bfe54 --- /dev/null +++ b/spacy/ml/models/defaults/textcat_defaults.cfg @@ -0,0 +1,13 @@ +[model] +@architectures = "spacy.TextCatCNN.v1" +exclusive_classes = false + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/tok2vec_defaults.cfg b/spacy/ml/models/defaults/tok2vec_defaults.cfg new file mode 100644 index 000000000..9475d4aab --- /dev/null +++ b/spacy/ml/models/defaults/tok2vec_defaults.cfg @@ -0,0 +1,9 @@ +[model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py new file mode 100644 index 000000000..0c1762026 --- /dev/null +++ b/spacy/ml/models/entity_linker.py @@ -0,0 +1,23 @@ +from pathlib import Path + +from thinc.api import chain, clone, list2ragged, reduce_mean, residual +from thinc.api import Model, Maxout, 
Linear + +from spacy.util import registry + + +@registry.architectures.register("spacy.EntityLinker.v1") +def build_nel_encoder(tok2vec, nO=None): + with Model.define_operators({">>": chain, "**": clone}): + token_width = tok2vec.get_dim("nO") + output_layer = Linear(nO=nO, nI=token_width) + model = ( + tok2vec + >> list2ragged() + >> reduce_mean() + >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) + >> output_layer + ) + model.set_ref("output_layer", output_layer) + model.set_ref("tok2vec", tok2vec) + return model diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py new file mode 100644 index 000000000..1c193df82 --- /dev/null +++ b/spacy/ml/models/multi_task.py @@ -0,0 +1,29 @@ +from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init + + +def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96): + model = chain( + tok2vec, + Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=3, dropout=0.0), + LayerNorm(token_vector_width * 2), + Softmax(nO=n_tags, nI=token_vector_width * 2), + ) + return model + + +def build_cloze_multi_task_model(vocab, tok2vec): + output_size = vocab.vectors.data.shape[1] + output_layer = chain( + Maxout( + nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0 + ), + Linear(nO=output_size, nI=output_size, init_W=zero_init), + ) + model = chain(tok2vec, output_layer) + model = build_masked_language_model(vocab, model) + return model + + +def build_masked_language_model(*args, **kwargs): + # TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828 + raise NotImplementedError diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py new file mode 100644 index 000000000..89f303e2a --- /dev/null +++ b/spacy/ml/models/parser.py @@ -0,0 +1,33 @@ +from pydantic import StrictInt + +from spacy.util import registry +from spacy.ml._layers import PrecomputableAffine +from 
spacy.syntax._parser_model import ParserModel + +from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops + + +@registry.architectures.register("spacy.TransitionBasedParser.v1") +def build_tb_parser_model( + tok2vec: Model, + nr_feature_tokens: StrictInt, + hidden_width: StrictInt, + maxout_pieces: StrictInt, + nO=None, +): + token_vector_width = tok2vec.get_dim("nO") + tok2vec = chain(tok2vec, list2array()) + tok2vec.set_dim("nO", token_vector_width) + + lower = PrecomputableAffine( + nO=hidden_width, + nF=nr_feature_tokens, + nI=tok2vec.get_dim("nO"), + nP=maxout_pieces, + ) + lower.set_dim("nP", maxout_pieces) + with use_ops("numpy"): + # Initialize weights at zero, as it's a classification layer. + upper = Linear(nO=nO, init_W=zero_init) + model = ParserModel(tok2vec, lower, upper) + return model diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py new file mode 100644 index 000000000..92e8be1b2 --- /dev/null +++ b/spacy/ml/models/tagger.py @@ -0,0 +1,16 @@ +from thinc.api import zero_init, with_array, Softmax, chain, Model + +from spacy.util import registry + + +@registry.architectures.register("spacy.Tagger.v1") +def build_tagger_model(tok2vec, nO=None) -> Model: + token_vector_width = tok2vec.get_dim("nO") + # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! + output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init) + softmax = with_array(output_layer) + model = chain(tok2vec, softmax) + model.set_ref("tok2vec", tok2vec) + model.set_ref("softmax", softmax) + model.set_ref("output_layer", output_layer) + return model diff --git a/spacy/ml/models/tensorizer.py b/spacy/ml/models/tensorizer.py new file mode 100644 index 000000000..f66610b64 --- /dev/null +++ b/spacy/ml/models/tensorizer.py @@ -0,0 +1,10 @@ +from thinc.api import Linear, zero_init + +from ... 
import util +from ...util import registry + + +@registry.architectures.register("spacy.Tensorizer.v1") +def build_tensorizer(input_size, output_size): + input_size = util.env_opt("token_vector_width", input_size) + return Linear(output_size, input_size, init_W=zero_init) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py new file mode 100644 index 000000000..d9ac34b99 --- /dev/null +++ b/spacy/ml/models/textcat.py @@ -0,0 +1,42 @@ +from spacy.attrs import ORTH +from spacy.util import registry +from spacy.ml.extract_ngrams import extract_ngrams + +from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax + + +@registry.architectures.register("spacy.TextCatCNN.v1") +def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None): + """ + Build a simple CNN text classifier, given a token-to-vector model as inputs. + If exclusive_classes=True, a softmax non-linearity is applied, so that the + outputs sum to 1. If exclusive_classes=False, a logistic non-linearity + is applied instead, so that outputs are in the range [0, 1]. 
+ """ + with Model.define_operators({">>": chain}): + if exclusive_classes: + output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO")) + model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer + model.set_ref("output_layer", output_layer) + else: + # TODO: experiment with init_w=zero_init + linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO")) + model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() + model.set_ref("output_layer", linear_layer) + model.set_ref("tok2vec", tok2vec) + model.set_dim("nO", nO) + return model + + +@registry.architectures.register("spacy.TextCatBOW.v1") +def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None): + # Note: original defaults were ngram_size=1 and no_output_layer=False + with Model.define_operators({">>": chain}): + model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nO) + model.to_cpu() + if not no_output_layer: + output_layer = Softmax(nO) if exclusive_classes else Logistic(nO) + output_layer.to_cpu() + model = model >> output_layer + model.set_ref("output_layer", output_layer) + return model diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py new file mode 100644 index 000000000..2e0e4c2d4 --- /dev/null +++ b/spacy/ml/models/tok2vec.py @@ -0,0 +1,390 @@ +from thinc.api import chain, clone, concatenate, with_array, uniqued +from thinc.api import Model, noop, with_padded, Maxout, expand_window +from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM +from thinc.api import residual, LayerNorm, FeatureExtractor, Mish + +from ... 
import util +from ...util import registry, make_layer +from ...ml import _character_embed +from ...pipeline.tok2vec import Tok2VecListener +from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE + + +@registry.architectures.register("spacy.Tok2VecTensors.v1") +def tok2vec_tensors_v1(width): + tok2vec = Tok2VecListener("tok2vec", width=width) + return tok2vec + + +@registry.architectures.register("spacy.VocabVectors.v1") +def get_vocab_vectors(name): + nlp = util.load_model(name) + return nlp.vocab.vectors + + +@registry.architectures.register("spacy.Tok2Vec.v1") +def Tok2Vec(config): + doc2feats = make_layer(config["@doc2feats"]) + embed = make_layer(config["@embed"]) + encode = make_layer(config["@encode"]) + field_size = 0 + if encode.has_attr("receptive_field"): + field_size = encode.attrs["receptive_field"] + tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size)) + tok2vec.attrs["cfg"] = config + tok2vec.set_dim("nO", encode.get_dim("nO")) + tok2vec.set_ref("embed", embed) + tok2vec.set_ref("encode", encode) + return tok2vec + + +@registry.architectures.register("spacy.Doc2Feats.v1") +def Doc2Feats(config): + columns = config["columns"] + return FeatureExtractor(columns) + + +@registry.architectures.register("spacy.HashEmbedCNN.v1") +def hash_embed_cnn( + pretrained_vectors, + width, + depth, + embed_size, + maxout_pieces, + window_size, + subword_features, +): + # Does not use character embeddings: set to False by default + return build_Tok2Vec_model( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + conv_depth=depth, + bilstm_depth=0, + maxout_pieces=maxout_pieces, + window_size=window_size, + subword_features=subword_features, + char_embed=False, + nM=0, + nC=0, + ) + + +@registry.architectures.register("spacy.HashCharEmbedCNN.v1") +def hash_charembed_cnn( + pretrained_vectors, + width, + depth, + embed_size, + maxout_pieces, + window_size, + subword_features, + nM=0, + nC=0, +): + # Allows using 
character embeddings by setting nC, nM and char_embed=True + return build_Tok2Vec_model( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + conv_depth=depth, + bilstm_depth=0, + maxout_pieces=maxout_pieces, + window_size=window_size, + subword_features=subword_features, + char_embed=True, + nM=nM, + nC=nC, + ) + + +@registry.architectures.register("spacy.HashEmbedBiLSTM.v1") +def hash_embed_bilstm_v1( + pretrained_vectors, width, depth, embed_size, subword_features +): + # Does not use character embeddings: set to False by default + return build_Tok2Vec_model( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + bilstm_depth=depth, + conv_depth=0, + maxout_pieces=0, + window_size=1, + subword_features=subword_features, + char_embed=False, + nM=0, + nC=0, + ) + + +@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") +def hash_embed_bilstm_v1( + pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0 +): + # Allows using character embeddings by setting nC, nM and char_embed=True + return build_Tok2Vec_model( + width=width, + embed_size=embed_size, + pretrained_vectors=pretrained_vectors, + bilstm_depth=depth, + conv_depth=0, + maxout_pieces=0, + window_size=1, + subword_features=subword_features, + char_embed=True, + nM=nM, + nC=nC, + ) + + +@registry.architectures.register("spacy.MultiHashEmbed.v1") +def MultiHashEmbed(config): + # For backwards compatibility with models before the architecture registry, + # we have to be careful to get exactly the same model structure. One subtle + # trick is that when we define concatenation with the operator, the operator + # is actually binary associative. So when we write (a | b | c), we're actually + # getting concatenate(concatenate(a, b), c). That's why the implementation + # is a bit ugly here. 
+ cols = config["columns"] + width = config["width"] + rows = config["rows"] + + norm = HashEmbed(width, rows, column=cols.index("NORM")) + if config["use_subwords"]: + prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX")) + suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX")) + shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE")) + if config.get("@pretrained_vectors"): + glove = make_layer(config["@pretrained_vectors"]) + mix = make_layer(config["@mix"]) + + with Model.define_operators({">>": chain, "|": concatenate}): + if config["use_subwords"] and config["@pretrained_vectors"]: + mix._layers[0].set_dim("nI", width * 5) + layer = uniqued( + (glove | norm | prefix | suffix | shape) >> mix, + column=cols.index("ORTH"), + ) + elif config["use_subwords"]: + mix._layers[0].set_dim("nI", width * 4) + layer = uniqued( + (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH") + ) + elif config["@pretrained_vectors"]: + mix._layers[0].set_dim("nI", width * 2) + layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH")) + else: + layer = norm + layer.attrs["cfg"] = config + return layer + + +@registry.architectures.register("spacy.CharacterEmbed.v1") +def CharacterEmbed(config): + width = config["width"] + chars = config["chars"] + + chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars) + other_tables = make_layer(config["@embed_features"]) + mix = make_layer(config["@mix"]) + + model = chain(concatenate(chr_embed, other_tables), mix) + model.attrs["cfg"] = config + return model + + +@registry.architectures.register("spacy.MaxoutWindowEncoder.v1") +def MaxoutWindowEncoder(config): + nO = config["width"] + nW = config["window_size"] + nP = config["pieces"] + depth = config["depth"] + + cnn = ( + expand_window(window_size=nW), + Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True), + ) + model = clone(residual(cnn), depth) + model.set_dim("nO", nO) + model.attrs["receptive_field"] = nW * 
depth + return model + + +@registry.architectures.register("spacy.MishWindowEncoder.v1") +def MishWindowEncoder(config): + nO = config["width"] + nW = config["window_size"] + depth = config["depth"] + + cnn = chain( + expand_window(window_size=nW), + Mish(nO=nO, nI=nO * ((nW * 2) + 1)), + LayerNorm(nO), + ) + model = clone(residual(cnn), depth) + model.set_dim("nO", nO) + return model + + +@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") +def TorchBiLSTMEncoder(config): + import torch.nn + + # TODO FIX + from thinc.api import PyTorchRNNWrapper + + width = config["width"] + depth = config["depth"] + if depth == 0: + return noop() + return with_padded( + PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) + ) + + +# TODO: update +_EXAMPLE_CONFIG = { + "@doc2feats": { + "arch": "Doc2Feats", + "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]}, + }, + "@embed": { + "arch": "spacy.MultiHashEmbed.v1", + "config": { + "width": 96, + "rows": 2000, + "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"], + "use_subwords": True, + "@pretrained_vectors": { + "arch": "TransformedStaticVectors", + "config": { + "vectors_name": "en_vectors_web_lg.vectors", + "width": 96, + "column": 0, + }, + }, + "@mix": { + "arch": "LayerNormalizedMaxout", + "config": {"width": 96, "pieces": 3}, + }, + }, + }, + "@encode": { + "arch": "MaxoutWindowEncode", + "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3}, + }, +} + + +def build_Tok2Vec_model( + width, + embed_size, + pretrained_vectors, + window_size, + maxout_pieces, + subword_features, + char_embed, + nM, + nC, + conv_depth, + bilstm_depth, +) -> Model: + if char_embed: + subword_features = False + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): + norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM)) + if subword_features: + prefix = HashEmbed(nO=width, 
nV=embed_size // 2, column=cols.index(PREFIX)) + suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX)) + shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE)) + else: + prefix, suffix, shape = (None, None, None) + if pretrained_vectors is not None: + glove = StaticVectors( + vectors=pretrained_vectors.data, + nO=width, + column=cols.index(ID), + dropout=0.0, + ) + + if subword_features: + columns = 5 + embed = uniqued( + (glove | norm | prefix | suffix | shape) + >> Maxout( + nO=width, + nI=width * columns, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), + column=cols.index(ORTH), + ) + else: + columns = 2 + embed = uniqued( + (glove | norm) + >> Maxout( + nO=width, + nI=width * columns, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), + column=cols.index(ORTH), + ) + elif subword_features: + columns = 4 + embed = uniqued( + concatenate(norm, prefix, suffix, shape) + >> Maxout( + nO=width, + nI=width * columns, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), + column=cols.index(ORTH), + ) + elif char_embed: + embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) | FeatureExtractor( + cols + ) >> with_array(norm) + reduce_dimensions = Maxout( + nO=width, + nI=nM * nC + width, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ) + else: + embed = norm + + convolution = residual( + expand_window(window_size=window_size) + >> Maxout( + nO=width, + nI=width * ((window_size * 2) + 1), + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ) + ) + if char_embed: + tok2vec = embed >> with_array( + reduce_dimensions >> convolution ** conv_depth, pad=conv_depth + ) + else: + tok2vec = FeatureExtractor(cols) >> with_array( + embed >> convolution ** conv_depth, pad=conv_depth + ) + + if bilstm_depth >= 1: + tok2vec = tok2vec >> PyTorchLSTM( + nO=width, nI=width, depth=bilstm_depth, bi=True + ) + tok2vec.set_dim("nO", width) + tok2vec.set_ref("embed", embed) + return tok2vec diff --git 
a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py index 5e51bc47a..e69de29bb 100644 --- a/spacy/ml/tok2vec.py +++ b/spacy/ml/tok2vec.py @@ -1,178 +0,0 @@ -from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop -from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors -from thinc.api import residual, LayerNorm, FeatureExtractor - -from ..ml import _character_embed -from ..util import make_layer, registry - - -@registry.architectures.register("spacy.Tok2Vec.v1") -def Tok2Vec(config): - doc2feats = make_layer(config["@doc2feats"]) - embed = make_layer(config["@embed"]) - encode = make_layer(config["@encode"]) - field_size = 0 - if encode.has_attr("receptive_field"): - field_size = encode.attrs["receptive_field"] - tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size)) - tok2vec.attrs["cfg"] = config - tok2vec.set_dim("nO", encode.get_dim("nO")) - tok2vec.set_ref("embed", embed) - tok2vec.set_ref("encode", encode) - return tok2vec - - -@registry.architectures.register("spacy.Doc2Feats.v1") -def Doc2Feats(config): - columns = config["columns"] - return FeatureExtractor(columns) - - -@registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(config): - # For backwards compatibility with models before the architecture registry, - # we have to be careful to get exactly the same model structure. One subtle - # trick is that when we define concatenation with the operator, the operator - # is actually binary associative. So when we write (a | b | c), we're actually - # getting concatenate(concatenate(a, b), c). That's why the implementation - # is a bit ugly here. 
- cols = config["columns"] - width = config["width"] - rows = config["rows"] - - norm = HashEmbed(width, rows, column=cols.index("NORM"), dropout=0.0) - if config["use_subwords"]: - prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"), dropout=0.0) - suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"), dropout=0.0) - shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"), dropout=0.0) - if config.get("@pretrained_vectors"): - glove = make_layer(config["@pretrained_vectors"]) - mix = make_layer(config["@mix"]) - - with Model.define_operators({">>": chain, "|": concatenate}): - if config["use_subwords"] and config["@pretrained_vectors"]: - mix._layers[0].set_dim("nI", width * 5) - layer = uniqued( - (glove | norm | prefix | suffix | shape) >> mix, - column=cols.index("ORTH"), - ) - elif config["use_subwords"]: - mix._layers[0].set_dim("nI", width * 4) - layer = uniqued( - (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH") - ) - elif config["@pretrained_vectors"]: - mix._layers[0].set_dim("nI", width * 2) - layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),) - else: - layer = norm - layer.attrs["cfg"] = config - return layer - - -@registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(config): - width = config["width"] - chars = config["chars"] - - chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars) - other_tables = make_layer(config["@embed_features"]) - mix = make_layer(config["@mix"]) - - model = chain(concatenate(chr_embed, other_tables), mix) - model.attrs["cfg"] = config - return model - - -@registry.architectures.register("spacy.MaxoutWindowEncoder.v1") -def MaxoutWindowEncoder(config): - nO = config["width"] - nW = config["window_size"] - nP = config["pieces"] - depth = config["depth"] - cnn = ( - expand_window(window_size=nW), - Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True), - ) - model = clone(residual(cnn), depth) - 
model.set_dim("nO", nO) - model.attrs["receptive_field"] = nW * depth - return model - - -@registry.architectures.register("spacy.MishWindowEncoder.v1") -def MishWindowEncoder(config): - from thinc.api import Mish - - nO = config["width"] - nW = config["window_size"] - depth = config["depth"] - cnn = chain( - expand_window(window_size=nW), - Mish(nO=nO, nI=nO * ((nW * 2) + 1)), - LayerNorm(nO), - ) - model = clone(residual(cnn), depth) - model.set_dim("nO", nO) - return model - - -@registry.architectures.register("spacy.PretrainedVectors.v1") -def PretrainedVectors(config): - # TODO: actual vectors instead of name - return StaticVectors( - vectors=config["vectors_name"], - nO=config["width"], - column=config["column"], - dropout=0.0, - ) - - -@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") -def TorchBiLSTMEncoder(config): - import torch.nn - - # TODO: FIX - from thinc.api import PyTorchRNNWrapper - - width = config["width"] - depth = config["depth"] - if depth == 0: - return noop() - return with_padded( - PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) - ) - - -# TODO: update -_EXAMPLE_CONFIG = { - "@doc2feats": { - "arch": "Doc2Feats", - "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]}, - }, - "@embed": { - "arch": "spacy.MultiHashEmbed.v1", - "config": { - "width": 96, - "rows": 2000, - "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"], - "use_subwords": True, - "@pretrained_vectors": { - "arch": "TransformedStaticVectors", - "config": { - "vectors_name": "en_vectors_web_lg.vectors", - "width": 96, - "column": 0, - }, - }, - "@mix": { - "arch": "LayerNormalizedMaxout", - "config": {"width": 96, "pieces": 3}, - }, - }, - }, - "@encode": { - "arch": "MaxoutWindowEncode", - "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3}, - }, -} diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index e211acb44..06c568ac9 100644 --- 
a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -66,7 +66,7 @@ class EntityRuler(object): self.add_patterns(patterns) @classmethod - def from_nlp(cls, nlp, **cfg): + def from_nlp(cls, nlp, model=None, **cfg): return cls(nlp, **cfg) def __len__(self): diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index d48b04bd1..351323ae9 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -76,11 +76,9 @@ class SimilarityHook(Pipe): yield self(doc) def predict(self, doc1, doc2): - self.require_model() return self.model.predict([(doc1, doc2)]) def update(self, doc1_doc2, golds, sgd=None, drop=0.0): - self.require_model() sims, bp_sims = self.model.begin_update(doc1_doc2) def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 999132b35..b6a6045d1 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -15,25 +15,15 @@ from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology -from ..ml.component_models import build_morphologizer_model - @component("morphologizer", assigns=["token.morph", "token.pos"]) class Morphologizer(Pipe): - @classmethod - def Model(cls, **cfg): - if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): - raise ValueError(TempErrors.T008) - class_map = Morphology.create_class_map() - return build_morphologizer_model(class_map.field_sizes, **cfg) - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self.cfg = dict(sorted(cfg.items())) - self.cfg.setdefault('cnn_maxout_pieces', 2) - self._class_map = self.vocab.morphology.create_class_map() + self._class_map = self.vocab.morphology.create_class_map() # Morphology.create_class_map() ? 
@property def labels(self): @@ -58,6 +48,14 @@ class Morphologizer(Pipe): self.set_annotations(docs, features, tensors=tokvecs) yield from docs + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, + **kwargs): + self.set_output(len(self.labels)) + self.model.initialize() + if sgd is None: + sgd = self.create_optimizer() + return sgd + def predict(self, docs): if not any(len(doc) for doc in docs): # Handle case where there are no tokens in any docs. @@ -65,8 +63,8 @@ class Morphologizer(Pipe): guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO"))) return guesses, tokvecs - tokvecs = self.model.tok2vec(docs) - scores = self.model.softmax(tokvecs) + tokvecs = self.model.get_ref("tok2vec")(docs) + scores = self.model.get_ref("softmax")(tokvecs) return scores, tokvecs def set_annotations(self, docs, batch_scores, tensors=None): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index ad75d2e78..b9bf1ccd6 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,8 +3,7 @@ import numpy import srsly import random -from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array -from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module +from thinc.api import CosineDistance, to_categorical, get_array_module from thinc.api import set_dropout_rate from ..tokens.doc cimport Doc @@ -22,11 +21,6 @@ from ..attrs import POS, ID from ..util import link_vectors_to_models, create_default_optimizer from ..parts_of_speech import X from ..kb import KnowledgeBase -from ..ml.component_models import Tok2Vec, build_tagger_model -from ..ml.component_models import build_text_classifier -from ..ml.component_models import build_simple_cnn_text_classifier -from ..ml.component_models import build_bow_text_classifier, build_nel_encoder -from ..ml.component_models import masked_language_model from ..errors import Errors, 
TempErrors, user_warning, Warnings from .. import util @@ -47,13 +41,8 @@ class Pipe(object): name = None @classmethod - def Model(cls, *shape, **kwargs): - """Initialize a model for the pipe.""" - raise NotImplementedError - - @classmethod - def from_nlp(cls, nlp, **cfg): - return cls(nlp.vocab, **cfg) + def from_nlp(cls, nlp, model, **cfg): + return cls(nlp.vocab, model, **cfg) def _get_doc(self, example): """ Use this method if the `example` can be both a Doc or an Example """ @@ -61,7 +50,7 @@ class Pipe(object): return example return example.doc - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): """Create a new pipe instance.""" raise NotImplementedError @@ -72,7 +61,6 @@ class Pipe(object): Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - self.require_model() doc = self._get_doc(example) predictions = self.predict([doc]) if isinstance(predictions, tuple) and len(predictions) == 2: @@ -85,11 +73,6 @@ class Pipe(object): return example return doc - def require_model(self): - """Raise an error if the component's model is not initialized.""" - if getattr(self, "model", None) in (None, True, False): - raise ValueError(Errors.E109.format(name=self.name)) - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): """Apply the pipe to a stream of documents. @@ -116,7 +99,6 @@ class Pipe(object): """Apply the pipeline's model to a batch of docs, without modifying them. """ - self.require_model() raise NotImplementedError def set_annotations(self, docs, scores, tensors=None): @@ -158,22 +140,23 @@ class Pipe(object): ): """Initialize the pipe for training, using data exampes if available. 
If no model has been initialized yet, the model is added.""" - if self.model is True: - self.model = self.Model(**self.cfg) + self.model.initialize() if hasattr(self, "vocab"): link_vectors_to_models(self.vocab) - self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd + def set_output(self, nO): + self.model.set_dim("nO", nO) + if self.model.has_ref("output_layer"): + self.model.get_ref("output_layer").set_dim("nO", nO) + def get_gradients(self): """Get non-zero gradients of the model's parameters, as a dictionary keyed by the parameter ID. The values are (weights, gradients) tuples. """ gradients = {} - if self.model in (None, True, False): - return gradients queue = [self.model] seen = set() for node in queue: @@ -199,8 +182,7 @@ class Pipe(object): """ serialize = {} serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - if self.model not in (True, False, None): - serialize["model"] = self.model.to_bytes + serialize["model"] = self.model.to_bytes if hasattr(self, "vocab"): serialize["vocab"] = self.vocab.to_bytes exclude = util.get_serialization_exclude(serialize, exclude, kwargs) @@ -210,20 +192,15 @@ class Pipe(object): """Load the pipe from a bytestring.""" def load_model(b): - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors - if self.model is True: - self.model = self.Model(**self.cfg) try: self.model.from_bytes(b) except AttributeError: raise ValueError(Errors.E149) deserialize = {} - deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) if hasattr(self, "vocab"): deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) @@ -234,8 +211,7 @@ class 
Pipe(object): serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) - if self.model not in (None, True, False): - serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) + serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) @@ -243,19 +219,14 @@ class Pipe(object): """Load the pipe from disk.""" def load_model(p): - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors - if self.model is True: - self.model = self.Model(**self.cfg) try: self.model.from_bytes(p.open("rb").read()) except AttributeError: raise ValueError(Errors.E149) deserialize = {} - deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) @@ -266,31 +237,13 @@ class Pipe(object): class Tensorizer(Pipe): """Pre-train position-sensitive vectors for tokens.""" - @classmethod - def Model(cls, output_size=300, **cfg): - """Create a new statistical model for the class. - - width (int): Output size of the model. - embed_size (int): Number of vectors in the embedding table. - **cfg: Config parameters. - RETURNS (Model): A `thinc.model.Model` or similar instance. - """ - input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96)) - return Linear(output_size, input_size, init_W=zero_init) - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): """Construct a new statistical model. Weights are not allocated on initialisation. 
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` instance with the `Doc` objects it will process. - model (Model): A `Model` instance or `True` to allocate one later. **cfg: Config parameters. - - EXAMPLE: - >>> from spacy.pipeline import TokenVectorEncoder - >>> tok2vec = TokenVectorEncoder(nlp.vocab) - >>> tok2vec.model = tok2vec.Model(128, 5000) """ self.vocab = vocab self.model = model @@ -337,7 +290,6 @@ class Tensorizer(Pipe): docs (iterable): A sequence of `Doc` objects. RETURNS (object): Vector representations for each token in the docs. """ - self.require_model() inputs = self.model.ops.flatten([doc.tensor for doc in docs]) outputs = self.model(inputs) return self.model.ops.unflatten(outputs, [len(d) for d in docs]) @@ -362,7 +314,6 @@ class Tensorizer(Pipe): sgd (callable): An optimizer. RETURNS (dict): Results from the update. """ - self.require_model() examples = Example.to_example_objects(examples) inputs = [] bp_inputs = [] @@ -405,10 +356,8 @@ class Tensorizer(Pipe): """ if pipeline is not None: for name, model in pipeline: - if getattr(model, "tok2vec", None): - self.input_models.append(model.tok2vec) - if self.model is True: - self.model = self.Model(**self.cfg) + if model.has_ref("tok2vec"): + self.input_models.append(model.get_ref("tok2vec")) self.model.initialize() link_vectors_to_models(self.vocab) if sgd is None: @@ -423,7 +372,7 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger """ - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self._rehearsal_model = None @@ -433,13 +382,6 @@ class Tagger(Pipe): def labels(self): return tuple(self.vocab.morphology.tag_names) - @property - def tok2vec(self): - if self.model in (None, True, False): - return None - else: - return chain(self.model.get_ref("tok2vec"), list2array()) - def __call__(self, example): doc = self._get_doc(example) tags = self.predict([doc]) @@ -465,7 +407,6 @@ class 
Tagger(Pipe): yield from docs def predict(self, docs): - self.require_model() if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. n_labels = len(self.labels) @@ -513,7 +454,6 @@ class Tagger(Pipe): doc.is_tagged = True def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): - self.require_model() examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. @@ -600,52 +540,21 @@ class Tagger(Pipe): vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer, exc=vocab.morphology.exc) - self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") - if self.model is True: - for hp in ["token_vector_width", "conv_depth"]: - if hp in kwargs: - self.cfg[hp] = kwargs[hp] - self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + self.set_output(len(self.labels)) + self.model.initialize() # Get batch of example docs, example outputs to call begin_training(). # This lets the model infer shapes. - n_tags = self.vocab.morphology.n_tags - for node in self.model.walk(): - # TODO: softmax hack ? 
- if node.name == "softmax" and node.has_dim("nO") is None: - node.set_dim("nO", n_tags) link_vectors_to_models(self.vocab) - self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd - @classmethod - def Model(cls, n_tags=None, **cfg): - if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"): - raise ValueError(TempErrors.T008) - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - config = { - "width": cfg.get("token_vector_width", 96), - "embed_size": cfg.get("embed_size", 2000), - "pretrained_vectors": cfg.get("pretrained_vectors", None), - "window_size": cfg.get("window_size", 1), - "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3), - "subword_features": cfg.get("subword_features", True), - "char_embed": cfg.get("char_embed", False), - "conv_depth": cfg.get("conv_depth", 4), - "bilstm_depth": cfg.get("bilstm_depth", 0), - } - tok2vec = Tok2Vec(**config) - return build_tagger_model(n_tags, tok2vec) - def add_label(self, label, values=None): if not isinstance(label, str): raise ValueError(Errors.E187) if label in self.labels: return 0 - if self.model not in (True, False, None): + if self.model.has_dim("nO"): # Here's how the model resizing will work, once the # neuron-to-tag mapping is no longer controlled by # the Morphology class, which sorts the tag names. 
@@ -672,8 +581,7 @@ class Tagger(Pipe): def to_bytes(self, exclude=tuple(), **kwargs): serialize = {} - if self.model not in (None, True, False): - serialize["model"] = self.model.to_bytes + serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) @@ -683,14 +591,6 @@ class Tagger(Pipe): def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def load_model(b): - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors - if self.model is True: - token_vector_width = util.env_opt( - "token_vector_width", - self.cfg.get("token_vector_width", 96)) - self.model = self.Model(**self.cfg) try: self.model.from_bytes(b) except AttributeError: @@ -719,18 +619,13 @@ class Tagger(Pipe): "vocab": lambda p: self.vocab.to_disk(p), "tag_map": lambda p: srsly.write_msgpack(p, tag_map), "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg) + "cfg": lambda p: srsly.write_json(p, self.cfg), } exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), **kwargs): def load_model(p): - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: - self.cfg["pretrained_vectors"] = self.vocab.vectors - if self.model is True: - self.model = self.Model(**self.cfg) with p.open("rb") as file_: try: self.model.from_bytes(file_.read()) @@ -745,8 +640,8 @@ class Tagger(Pipe): exc=self.vocab.morphology.exc) deserialize = { - "cfg": lambda p: self.cfg.update(_load_cfg(p)), "vocab": lambda p: self.vocab.from_disk(p), + "cfg": lambda p: self.cfg.update(_load_cfg(p)), "tag_map": load_tag_map, 
"model": load_model, } @@ -762,16 +657,11 @@ class SentenceRecognizer(Tagger): DOCS: https://spacy.io/api/sentencerecognizer """ - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self._rehearsal_model = None self.cfg = dict(sorted(cfg.items())) - self.cfg.setdefault("cnn_maxout_pieces", 2) - self.cfg.setdefault("subword_features", True) - self.cfg.setdefault("token_vector_width", 12) - self.cfg.setdefault("conv_depth", 1) - self.cfg.setdefault("pretrained_vectors", None) @property def labels(self): @@ -797,7 +687,6 @@ class SentenceRecognizer(Tagger): doc.c[j].sent_start = -1 def update(self, examples, drop=0., sgd=None, losses=None): - self.require_model() examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. @@ -844,20 +733,12 @@ class SentenceRecognizer(Tagger): def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): cdef Vocab vocab = self.vocab - if self.model is True: - for hp in ["token_vector_width", "conv_depth"]: - if hp in kwargs: - self.cfg[hp] = kwargs[hp] - self.model = self.Model(len(self.labels), **self.cfg) + self.set_output(len(self.labels)) + self.model.initialize() if sgd is None: sgd = self.create_optimizer() - self.model.initialize() return sgd - @classmethod - def Model(cls, n_tags, **cfg): - return build_tagger_model(n_tags, **cfg) - def add_label(self, label, values=None): raise NotImplementedError @@ -867,8 +748,7 @@ class SentenceRecognizer(Tagger): def to_bytes(self, exclude=tuple(), **kwargs): serialize = {} - if self.model not in (None, True, False): - serialize["model"] = self.model.to_bytes + serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) @@ -876,8 +756,6 @@ class SentenceRecognizer(Tagger): def 
from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def load_model(b): - if self.model is True: - self.model = self.Model(len(self.labels), **self.cfg) try: self.model.from_bytes(b) except AttributeError: @@ -896,15 +774,13 @@ class SentenceRecognizer(Tagger): serialize = { "vocab": lambda p: self.vocab.to_disk(p), "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg) + "cfg": lambda p: srsly.write_json(p, self.cfg), } exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), **kwargs): def load_model(p): - if self.model is True: - self.model = self.Model(len(self.labels), **self.cfg) with p.open("rb") as file_: try: self.model.from_bytes(file_.read()) @@ -912,8 +788,8 @@ class SentenceRecognizer(Tagger): raise ValueError(Errors.E149) deserialize = { - "cfg": lambda p: self.cfg.update(_load_cfg(p)), "vocab": lambda p: self.vocab.from_disk(p), + "cfg": lambda p: self.cfg.update(_load_cfg(p)), "model": load_model, } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) @@ -927,7 +803,7 @@ class MultitaskObjective(Tagger): side-objective. 
""" - def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): + def __init__(self, vocab, model, target='dep_tag_offset', **cfg): self.vocab = vocab self.model = model if target == "dep": @@ -947,7 +823,8 @@ class MultitaskObjective(Tagger): else: raise ValueError(Errors.E016) self.cfg = dict(cfg) - self.cfg.setdefault("cnn_maxout_pieces", 2) + # TODO: remove - put in config + self.cfg.setdefault("maxout_pieces", 2) @property def labels(self): @@ -969,30 +846,15 @@ class MultitaskObjective(Tagger): label = self.make_label(i, example.token_annotation) if label is not None and label not in self.labels: self.labels[label] = len(self.labels) - if self.model is True: - token_vector_width = util.env_opt("token_vector_width") - self.model = self.Model(len(self.labels), tok2vec=tok2vec) - link_vectors_to_models(self.vocab) self.model.initialize() + link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd - @classmethod - def Model(cls, n_tags, tok2vec=None, **cfg): - token_vector_width = util.env_opt("token_vector_width", 96) - model = chain( - tok2vec, - Maxout(nO=token_vector_width*2, nI=token_vector_width, nP=3, dropout=0.0), - LayerNorm(token_vector_width*2), - Softmax(nO=n_tags, nI=token_vector_width*2) - ) - return model - def predict(self, docs): - self.require_model() - tokvecs = self.model.tok2vec(docs) - scores = self.model.softmax(tokvecs) + tokvecs = self.model.get_ref("tok2vec")(docs) + scores = self.model.get_ref("softmax")(tokvecs) return tokvecs, scores def get_loss(self, examples, scores): @@ -1097,18 +959,7 @@ class MultitaskObjective(Tagger): class ClozeMultitask(Pipe): - @classmethod - def Model(cls, vocab, tok2vec, **cfg): - output_size = vocab.vectors.data.shape[1] - output_layer = chain( - Maxout(nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0), - Linear(nO=output_size, nI=output_size, init_W=zero_init) - ) - model = chain(tok2vec, output_layer) - model = 
masked_language_model(vocab, model) - return model - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self.cfg = cfg @@ -1120,19 +971,16 @@ class ClozeMultitask(Pipe): def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, sgd=None, **kwargs): link_vectors_to_models(self.vocab) - if self.model is True: - self.model = self.Model(self.vocab, tok2vec) - X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.initialize() + X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.begin_training(X) if sgd is None: sgd = self.create_optimizer() return sgd def predict(self, docs): - self.require_model() - tokvecs = self.model.tok2vec(docs) - vectors = self.model.output_layer(tokvecs) + tokvecs = self.model.get_ref("tok2vec")(docs) + vectors = self.model.get_ref("output_layer")(tokvecs) return tokvecs, vectors def get_loss(self, examples, vectors, prediction): @@ -1150,7 +998,6 @@ class ClozeMultitask(Pipe): pass def rehearse(self, examples, drop=0., sgd=None, losses=None): - self.require_model() examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. @@ -1171,62 +1018,11 @@ class TextCategorizer(Pipe): DOCS: https://spacy.io/api/textcategorizer """ - - @classmethod - def Model(cls, nr_class=1, exclusive_classes=None, **cfg): - if nr_class == 1: - exclusive_classes = False - if exclusive_classes is None: - raise ValueError( - "TextCategorizer Model must specify 'exclusive_classes'. " - "This setting determines whether the model will output " - "scores that sum to 1 for each example. If only one class " - "is true for each example, you should set exclusive_classes=True. " - "For 'multi_label' classification, set exclusive_classes=False." 
- ) - if "embed_size" not in cfg: - cfg["embed_size"] = util.env_opt("embed_size", 2000) - if "token_vector_width" not in cfg: - cfg["token_vector_width"] = util.env_opt("token_vector_width", 96) - if cfg.get("architecture") == "bow": - return build_bow_text_classifier(nr_class, exclusive_classes, **cfg) - else: - if "tok2vec" in cfg: - tok2vec = cfg["tok2vec"] - else: - config = { - "width": cfg.get("token_vector_width", 96), - "embed_size": cfg.get("embed_size", 2000), - "pretrained_vectors": cfg.get("pretrained_vectors", None), - "window_size": cfg.get("window_size", 1), - "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3), - "subword_features": cfg.get("subword_features", True), - "char_embed": cfg.get("char_embed", False), - "conv_depth": cfg.get("conv_depth", 4), - "bilstm_depth": cfg.get("bilstm_depth", 0), - } - tok2vec = Tok2Vec(**config) - return build_simple_cnn_text_classifier( - tok2vec, - nr_class, - exclusive_classes, - **cfg - ) - - @property - def tok2vec(self): - if self.model in (None, True, False): - return None - else: - return self.model.tok2vec - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self._rehearsal_model = None self.cfg = dict(cfg) - if "exclusive_classes" not in cfg: - self.cfg["exclusive_classes"] = True @property def labels(self): @@ -1255,7 +1051,6 @@ class TextCategorizer(Pipe): yield from docs def predict(self, docs): - self.require_model() tensors = [doc.tensor for doc in docs] if not any(len(doc) for doc in docs): @@ -1274,7 +1069,6 @@ class TextCategorizer(Pipe): doc.cats[label] = float(scores[i, j]) def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): - self.require_model() examples = Example.to_example_objects(examples) if not any(len(ex.doc) if ex.doc else 0 for ex in examples): # Handle cases where there are no tokens in any docs. 
@@ -1311,7 +1105,7 @@ class TextCategorizer(Pipe): losses.setdefault(self.name, 0.0) losses[self.name] += (gradient**2).sum() - def get_loss(self, examples, scores): + def _examples_to_truth(self, examples): golds = [ex.gold for ex in examples] truths = numpy.zeros((len(golds), len(self.labels)), dtype="f") not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f") @@ -1322,6 +1116,10 @@ class TextCategorizer(Pipe): else: not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) + return truths, not_missing + + def get_loss(self, examples, scores): + truths, not_missing = self._examples_to_truth(examples) not_missing = self.model.ops.asarray(not_missing) d_scores = (scores-truths) / scores.shape[0] d_scores *= not_missing @@ -1333,7 +1131,7 @@ class TextCategorizer(Pipe): raise ValueError(Errors.E187) if label in self.labels: return 0 - if self.model not in (None, True, False): + if self.model.has_dim("nO"): # This functionality was available previously, but was broken. # The problem is that we resize the last layer, but the last layer # is actually just an ensemble. We're not resizing the child layers @@ -1348,19 +1146,18 @@ class TextCategorizer(Pipe): return 1 def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): - for example in get_examples(): + # TODO: begin_training is not guaranteed to see all data / labels ? 
+ examples = list(get_examples()) + for example in examples: for cat in example.doc_annotation.cats: self.add_label(cat) - if self.model is True: - self.cfg.update(kwargs) - self.require_labels() - self.model = self.Model(len(self.labels), **self.cfg) - link_vectors_to_models(self.vocab) + self.require_labels() + docs = [Doc(Vocab(), words=["hello"])] + truths, _ = self._examples_to_truth(examples) + self.set_output(len(self.labels)) + self.model.initialize(X=docs, Y=truths) if sgd is None: sgd = self.create_optimizer() - # TODO: use get_examples instead - docs = [Doc(Vocab(), words=["hello"])] - self.model.initialize(X=docs) return sgd @@ -1393,7 +1190,7 @@ cdef class DependencyParser(Parser): def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: - tok2vec = self.model.tok2vec + tok2vec = self.model.get_ref("tok2vec") labeller.begin_training(get_examples, pipeline=pipeline, tok2vec=tok2vec, sgd=sgd) @@ -1423,7 +1220,6 @@ cdef class EntityRecognizer(Parser): assigns = ["doc.ents", "token.ent_iob", "token.ent_type"] requires = [] TransitionSystem = BiluoPushDown - nr_feature = 6 def add_multitask_objective(self, target): if target == "cloze": @@ -1435,7 +1231,7 @@ cdef class EntityRecognizer(Parser): def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: - tok2vec = self.model.tok2vec + tok2vec = self.model.get_ref("tok2vec") labeller.begin_training(get_examples, pipeline=pipeline, tok2vec=tok2vec) @@ -1464,18 +1260,9 @@ class EntityLinker(Pipe): """ NIL = "NIL" # string used to refer to a non-existing link - @classmethod - def Model(cls, **cfg): - embed_width = cfg.get("embed_width", 300) - hidden_width = cfg.get("hidden_width", 128) - type_to_int = cfg.get("type_to_int", dict()) - - model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg) - return model - - def __init__(self, vocab, **cfg): + def 
__init__(self, vocab, model, **cfg): self.vocab = vocab - self.model = True + self.model = model self.kb = None self.cfg = dict(cfg) self.distance = CosineDistance(normalize=False) @@ -1483,11 +1270,6 @@ class EntityLinker(Pipe): def set_kb(self, kb): self.kb = kb - def require_model(self): - # Raise an error if the component's model is not initialized. - if getattr(self, "model", None) in (None, True, False): - raise ValueError(Errors.E109.format(name=self.name)) - def require_kb(self): # Raise an error if the knowledge base is not initialized. if getattr(self, "kb", None) in (None, True, False): @@ -1495,16 +1277,14 @@ class EntityLinker(Pipe): def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): self.require_kb() - self.cfg["entity_width"] = self.kb.entity_vector_length - if self.model is True: - self.model = self.Model(**self.cfg) + nO = self.kb.entity_vector_length + self.set_output(nO) self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None): - self.require_model() self.require_kb() if losses is not None: losses.setdefault(self.name, 0.0) @@ -1614,7 +1394,6 @@ class EntityLinker(Pipe): def predict(self, docs): """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ - self.require_model() self.require_kb() entity_count = 0 @@ -1714,15 +1493,12 @@ class EntityLinker(Pipe): serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) - if self.model not in (None, True, False): - serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) + serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), 
**kwargs): def load_model(p): - if self.model is True: - self.model = self.Model(**self.cfg) try: self.model.from_bytes(p.open("rb").read()) except AttributeError: @@ -1734,8 +1510,8 @@ class EntityLinker(Pipe): self.set_kb(kb) deserialize = {} - deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["kb"] = load_kb deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) @@ -1782,7 +1558,7 @@ class Sentencizer(Pipe): self.punct_chars = set(self.default_punct_chars) @classmethod - def from_nlp(cls, nlp, **cfg): + def from_nlp(cls, nlp, model=None, **cfg): return cls(**cfg) def __call__(self, example): @@ -1915,8 +1691,8 @@ class Sentencizer(Pipe): # Cython classes can't be decorated, so we need to add the factories here -Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg) -Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg) +Language.factories["parser"] = lambda nlp, model, **cfg: DependencyParser.from_nlp(nlp, model, **cfg) +Language.factories["ner"] = lambda nlp, model, **cfg: EntityRecognizer.from_nlp(nlp, model, **cfg) __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 8290468cf..a49f94ca3 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -5,32 +5,21 @@ from ..gold import Example from ..tokens import Doc from ..vocab import Vocab from ..language import component -from ..util import link_vectors_to_models, minibatch, registry, eg2doc +from ..util import link_vectors_to_models, minibatch, eg2doc @component("tok2vec", assigns=["doc.tensor"]) class Tok2Vec(Pipe): - @classmethod - def from_nlp(cls, nlp, **cfg): - return 
cls(nlp.vocab, **cfg) @classmethod - def Model(cls, architecture, **cfg): - """Create a new statistical model for the class. + def from_nlp(cls, nlp, model, **cfg): + return cls(nlp.vocab, model, **cfg) - architecture (str): The registered model architecture to use. - **cfg: Config parameters. - RETURNS (Model): A `thinc.model.Model` or similar instance. - """ - model = registry.architectures.get(architecture) - return model(**cfg) - - def __init__(self, vocab, model=True, **cfg): + def __init__(self, vocab, model, **cfg): """Construct a new statistical model. Weights are not allocated on initialisation. vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` instance with the `Doc` objects it will process. - model (Model): A `Model` instance or `True` to allocate one later. **cfg: Config parameters. """ self.vocab = vocab @@ -143,8 +132,6 @@ class Tok2Vec(Pipe): get_examples (function): Function returning example training data. pipeline (list): The pipeline the model is part of. """ - if self.model is True: - self.model = self.Model(**self.cfg) # TODO: use examples instead ? 
docs = [Doc(Vocab(), words=["hello"])] self.model.initialize(X=docs) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 442233f19..7ff9517a5 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -221,7 +221,10 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no class ParserModel(Model): def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None): - Model.__init__(self, name="parser_model", forward=forward) + # don't define nO for this object, because we can't dynamically change it + Model.__init__(self, name="parser_model", forward=forward, dims={"nI": None}) + if tok2vec.has_dim("nI"): + self.set_dim("nI", tok2vec.get_dim("nI")) self._layers = [tok2vec, lower_model] if upper_model is not None: self._layers.append(upper_model) @@ -229,6 +232,7 @@ class ParserModel(Model): if unseen_classes: for class_ in unseen_classes: self.unseen_classes.add(class_) + self.set_ref("tok2vec", tok2vec) def predict(self, docs): step_model = ParserStepModel(docs, self._layers, @@ -238,25 +242,32 @@ class ParserModel(Model): def resize_output(self, new_nO): if len(self._layers) == 2: return - if new_nO == self.upper.get_dim("nO"): + if self.upper.has_dim("nO") and (new_nO == self.upper.get_dim("nO")): return smaller = self.upper - nI = smaller.get_dim("nI") + nI = None + if smaller.has_dim("nI"): + nI = smaller.get_dim("nI") with use_ops('numpy'): - larger = Linear(new_nO, nI) - larger_W = larger.ops.alloc2f(new_nO, nI) - larger_b = larger.ops.alloc1f(new_nO) - smaller_W = smaller.get_param("W") - smaller_b = smaller.get_param("b") - # Weights are stored in (nr_out, nr_in) format, so we're basically - # just adding rows here. 
- larger_W[:smaller.get_dim("nO")] = smaller_W - larger_b[:smaller.get_dim("nO")] = smaller_b - larger.set_param("W", larger_W) - larger.set_param("b", larger_b) + larger = Linear(nO=new_nO, nI=nI) + larger._init = smaller._init + # it could be that the model is not initialized yet, then skip this bit + if nI: + larger_W = larger.ops.alloc2f(new_nO, nI) + larger_b = larger.ops.alloc1f(new_nO) + smaller_W = smaller.get_param("W") + smaller_b = smaller.get_param("b") + # Weights are stored in (nr_out, nr_in) format, so we're basically + # just adding rows here. + if smaller.has_dim("nO"): + larger_W[:smaller.get_dim("nO")] = smaller_W + larger_b[:smaller.get_dim("nO")] = smaller_b + for i in range(smaller.get_dim("nO"), new_nO): + self.unseen_classes.add(i) + + larger.set_param("W", larger_W) + larger.set_param("b", larger_b) self._layers[-1] = larger - for i in range(smaller.get_dim("nO"), new_nO): - self.unseen_classes.add(i) def initialize(self, X=None, Y=None): self.tok2vec.initialize() @@ -412,7 +423,7 @@ cdef class precompute_hiddens: we can do all our hard maths up front, packed into large multiplications, and do the hard-to-program parsing on the CPU. 
""" - cdef readonly int nF, nO, nP # TODO: make these more like the dimensions in thinc + cdef readonly int nF, nO, nP cdef bint _is_synchronized cdef public object ops cdef np.ndarray _features @@ -458,6 +469,16 @@ cdef class precompute_hiddens: self._is_synchronized = True return self._cached.data + def has_dim(self, name): + if name == "nF": + return self.nF if self.nF is not None else True + elif name == "nP": + return self.nP if self.nP is not None else True + elif name == "nO": + return self.nO if self.nO is not None else True + else: + return False + def get_dim(self, name): if name == "nF": return self.nF @@ -468,6 +489,16 @@ cdef class precompute_hiddens: else: raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") + def set_dim(self, name, value): + if name == "nF": + self.nF = value + elif name == "nP": + self.nP = value + elif name == "nO": + self.nO = value + else: + raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") + def __call__(self, X, bint is_train): if is_train: return self.begin_update(X) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index cf57e1cf6..9381fab6b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -27,11 +27,11 @@ from ._parser_model cimport predict_states, arg_max_if_valid from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ._parser_model cimport get_c_weights, get_c_sizes from ._parser_model import ParserModel -from ..util import link_vectors_to_models, create_default_optimizer +from ..util import link_vectors_to_models, create_default_optimizer, registry from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, TempErrors +from ..errors import Errors, user_warning, Warnings from .. import util from .stateclass cimport StateClass from ._state cimport StateC @@ -41,114 +41,42 @@ from . import _beam_utils from . 
import nonproj -from ..ml._layers import PrecomputableAffine -from ..ml.component_models import Tok2Vec - - cdef class Parser: """ Base class of the DependencyParser and EntityRecognizer. """ - @classmethod - def Model(cls, nr_class, **cfg): - depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) - subword_features = util.env_opt('subword_features', - cfg.get('subword_features', True)) - conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4)) - conv_window = util.env_opt('conv_window', cfg.get('conv_window', 1)) - t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3)) - bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0)) - self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0)) - nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature) - if depth not in (0, 1): - raise ValueError(TempErrors.T004.format(value=depth)) - parser_maxout_pieces = util.env_opt('parser_maxout_pieces', - cfg.get('maxout_pieces', 2)) - token_vector_width = util.env_opt('token_vector_width', - cfg.get('token_vector_width', 96)) - hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64)) - if depth == 0: - hidden_width = nr_class - parser_maxout_pieces = 1 - embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000)) - pretrained_vectors = cfg.get('pretrained_vectors', None) - tok2vec = Tok2Vec(width=token_vector_width, - embed_size=embed_size, - conv_depth=conv_depth, - window_size=conv_window, - cnn_maxout_pieces=t2v_pieces, - subword_features=subword_features, - pretrained_vectors=pretrained_vectors, - bilstm_depth=bilstm_depth) - tok2vec = chain(tok2vec, list2array()) - tok2vec.set_dim("nO", token_vector_width) - lower = PrecomputableAffine(hidden_width, - nF=nr_feature_tokens, nI=token_vector_width, - nP=parser_maxout_pieces) - lower.set_dim("nP", parser_maxout_pieces) - if depth == 1: - with use_ops('numpy'): - upper = Linear(nr_class, hidden_width, 
init_W=zero_init) - else: - upper = None - - cfg = { - 'nr_class': nr_class, - 'nr_feature_tokens': nr_feature_tokens, - 'hidden_depth': depth, - 'token_vector_width': token_vector_width, - 'hidden_width': hidden_width, - 'maxout_pieces': parser_maxout_pieces, - 'pretrained_vectors': pretrained_vectors, - 'bilstm_depth': bilstm_depth, - 'self_attn_depth': self_attn_depth, - 'conv_depth': conv_depth, - 'window_size': conv_window, - 'embed_size': embed_size, - 'cnn_maxout_pieces': t2v_pieces - } - model = ParserModel(tok2vec, lower, upper) - model.initialize() - return model, cfg - name = 'base_parser' - def __init__(self, Vocab vocab, moves=True, model=True, **cfg): + + def __init__(self, Vocab vocab, model, **cfg): """Create a Parser. vocab (Vocab): The vocabulary object. Must be shared with documents to be processed. The value is set to the `.vocab` attribute. - moves (TransitionSystem): Defines how the parse-state is created, - updated and evaluated. The value is set to the .moves attribute - unless True (default), in which case a new instance is created with - `Parser.Moves()`. - model (object): Defines how the parse-state is created, updated and - evaluated. The value is set to the .model attribute. If set to True - (default), a new instance will be created with `Parser.Model()` - in parser.begin_training(), parser.from_disk() or parser.from_bytes(). - **cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute + **cfg: Configuration parameters. Set to the `.cfg` attribute. + If it doesn't include a value for 'moves', a new instance is + created with `self.TransitionSystem()`. This defines how the + parse-state is created, updated and evaluated. 
""" self.vocab = vocab - if moves is True: - self.moves = self.TransitionSystem(self.vocab.strings) - else: - self.moves = moves - if 'beam_width' not in cfg: - cfg['beam_width'] = util.env_opt('beam_width', 1) - if 'beam_density' not in cfg: - cfg['beam_density'] = util.env_opt('beam_density', 0.0) - if 'beam_update_prob' not in cfg: - cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0) - cfg.setdefault('cnn_maxout_pieces', 3) - cfg.setdefault("nr_feature_tokens", self.nr_feature) - self.cfg = cfg + moves = cfg.get("moves", None) + if moves is None: + # defined by EntityRecognizer as a BiluoPushDown + moves = self.TransitionSystem(self.vocab.strings) + self.moves = moves + cfg.setdefault('min_action_freq', 30) + cfg.setdefault('learn_tokens', False) + cfg.setdefault('beam_width', 1) + cfg.setdefault('beam_update_prob', 1.0) # or 0.5 (both defaults were previously used) self.model = model + self.set_output(self.moves.n_moves) + self.cfg = cfg self._multitasks = [] self._rehearsal_model = None @classmethod - def from_nlp(cls, nlp, **cfg): - return cls(nlp.vocab, **cfg) + def from_nlp(cls, nlp, model, **cfg): + return cls(nlp.vocab, model, **cfg) def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) @@ -163,8 +91,6 @@ cdef class Parser: names.append(name) return names - nr_feature = 8 - @property def labels(self): class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)] @@ -173,7 +99,7 @@ cdef class Parser: @property def tok2vec(self): '''Return the embedding and convolutional layer of the model.''' - return None if self.model in (None, True, False) else self.model.tok2vec + return self.model.tok2vec @property def postprocesses(self): @@ -190,10 +116,7 @@ cdef class Parser: self._resize() def _resize(self): - if "nr_class" in self.cfg: - self.cfg["nr_class"] = self.moves.n_moves - if self.model not in (True, False, None): - self.model.resize_output(self.moves.n_moves) + 
self.model.resize_output(self.moves.n_moves) if self._rehearsal_model not in (True, False, None): self._rehearsal_model.resize_output(self.moves.n_moves) @@ -227,7 +150,7 @@ cdef class Parser: doc (Doc): The document to be processed. """ if beam_width is None: - beam_width = self.cfg.get('beam_width', 1) + beam_width = self.cfg['beam_width'] beam_density = self.cfg.get('beam_density', 0.) states = self.predict([doc], beam_width=beam_width, beam_density=beam_density) @@ -243,7 +166,7 @@ cdef class Parser: YIELDS (Doc): Documents, in order. """ if beam_width is None: - beam_width = self.cfg.get('beam_width', 1) + beam_width = self.cfg['beam_width'] beam_density = self.cfg.get('beam_density', 0.) cdef Doc doc for batch in util.minibatch(docs, size=batch_size): @@ -264,13 +187,7 @@ cdef class Parser: else: yield from batch_in_order - def require_model(self): - """Raise an error if the component's model is not initialized.""" - if getattr(self, 'model', None) in (None, True, False): - raise ValueError(Errors.E109.format(name=self.name)) - def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.): - self.require_model() if isinstance(docs, Doc): docs = [docs] if not any(len(doc) for doc in docs): @@ -313,11 +230,11 @@ cdef class Parser: # if labels are missing. We therefore have to check whether we need to # expand our model output. 
self._resize() + cdef int nr_feature = self.model.lower.get_dim("nF") model = self.model.predict(docs) - token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), + token_ids = numpy.zeros((len(docs) * beam_width, nr_feature), dtype='i', order='C') cdef int* c_ids - cdef int nr_feature = self.cfg["nr_feature_tokens"] cdef int n_states model = self.model.predict(docs) todo = [beam for beam in beams if not beam.is_done] @@ -430,7 +347,6 @@ cdef class Parser: return [b for b in beams if not b.is_done] def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): - self.require_model() examples = Example.to_example_objects(examples) if losses is None: @@ -440,9 +356,9 @@ cdef class Parser: multitask.update(examples, drop=drop, sgd=sgd) # The probability we use beam update, instead of falling back to # a greedy update - beam_update_prob = self.cfg.get('beam_update_prob', 0.5) - if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob: - return self.update_beam(examples, self.cfg.get('beam_width', 1), + beam_update_prob = self.cfg['beam_update_prob'] + if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob: + return self.update_beam(examples, self.cfg['beam_width'], drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations, beam_density=self.cfg.get('beam_density', 0.001)) @@ -533,7 +449,7 @@ cdef class Parser: set_dropout_rate(self.model, drop) model, backprop_tok2vec = self.model.begin_update(docs) states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, + self.moves, self.model.lower.get_dim("nF"), 10000, states, golds, model.state2vec, model.vec2scores, width, losses=losses, beam_density=beam_density) for i, d_scores in enumerate(states_d_scores): @@ -562,8 +478,6 @@ cdef class Parser: keyed by the parameter ID. The values are (weights, gradients) tuples. 
""" gradients = {} - if self.model in (None, True, False): - return gradients queue = [self.model] seen = set() for node in queue: @@ -647,45 +561,40 @@ cdef class Parser: def create_optimizer(self): return create_default_optimizer() - def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg): - if 'model' in cfg: - self.model = cfg['model'] + def set_output(self, nO): + if self.model.upper.has_dim("nO") is None: + self.model.upper.set_dim("nO", nO) + + def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): + self.cfg.update(kwargs) if not hasattr(get_examples, '__call__'): gold_tuples = get_examples get_examples = lambda: gold_tuples - cfg.setdefault('min_action_freq', 30) actions = self.moves.get_actions(gold_parses=get_examples(), - min_freq=cfg.get('min_action_freq', 30), - learn_tokens=self.cfg.get("learn_tokens", False)) + min_freq=self.cfg['min_action_freq'], + learn_tokens=self.cfg["learn_tokens"]) for action, labels in self.moves.labels.items(): actions.setdefault(action, {}) for label, freq in labels.items(): if label not in actions[action]: actions[action][label] = freq self.moves.initialize_actions(actions) - cfg.setdefault('token_vector_width', 96) - if self.model is True: - self.model, cfg = self.Model(self.moves.n_moves, **cfg) - if sgd is None: - sgd = self.create_optimizer() - doc_sample = [] - gold_sample = [] - for example in islice(get_examples(), 1000): - parses = example.get_gold_parses(merge=False, vocab=self.vocab) - for doc, gold in parses: - doc_sample.append(doc) - gold_sample.append(gold) - self.model.initialize(doc_sample, gold_sample) - if pipeline is not None: - self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg) - link_vectors_to_models(self.vocab) - else: - if sgd is None: - sgd = self.create_optimizer() - if self.model.upper.has_dim("nO") is None: - self.model.upper.set_dim("nO", self.moves.n_moves) - self.model.initialize() - self.cfg.update(cfg) + # make sure we resize so we 
have an appropriate upper layer + self._resize() + if sgd is None: + sgd = self.create_optimizer() + doc_sample = [] + gold_sample = [] + for example in islice(get_examples(), 1000): + parses = example.get_gold_parses(merge=False, vocab=self.vocab) + for doc, gold in parses: + doc_sample.append(doc) + gold_sample.append(gold) + + self.model.initialize(doc_sample, gold_sample) + if pipeline is not None: + self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) + link_vectors_to_models(self.vocab) return sgd def _get_doc(self, example): @@ -709,28 +618,24 @@ cdef class Parser: 'vocab': lambda p: self.vocab.from_disk(p), 'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]), 'cfg': lambda p: self.cfg.update(srsly.read_json(p)), - 'model': lambda p: None + 'model': lambda p: None, } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) util.from_disk(path, deserializers, exclude) if 'model' not in exclude: path = util.ensure_path(path) - if self.model is True: - self.model, cfg = self.Model(**self.cfg) - else: - cfg = {} with (path / 'model').open('rb') as file_: bytes_data = file_.read() try: + self._resize() self.model.from_bytes(bytes_data) except AttributeError: raise ValueError(Errors.E149) - self.cfg.update(cfg) return self def to_bytes(self, exclude=tuple(), **kwargs): serializers = { - "model": lambda: (self.model.to_bytes() if self.model is not True else True), + "model": lambda: (self.model.to_bytes()), "vocab": lambda: self.vocab.to_bytes(), "moves": lambda: self.moves.to_bytes(exclude=["strings"]), "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) @@ -743,22 +648,14 @@ cdef class Parser: "vocab": lambda b: self.vocab.from_bytes(b), "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: None + "model": lambda b: None, } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = 
util.from_bytes(bytes_data, deserializers, exclude) if 'model' not in exclude: - # TODO: Remove this once we don't have to handle previous models - if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: - self.cfg['pretrained_vectors'] = self.vocab.vectors - if self.model is True: - self.model, cfg = self.Model(**self.cfg) - else: - cfg = {} if 'model' in msg: try: self.model.from_bytes(msg['model']) except AttributeError: raise ValueError(Errors.E149) - self.cfg.update(cfg) return self diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 766dcb739..3a466b24c 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -3,12 +3,13 @@ from spacy.tokens import Span import pytest from ..util import get_doc +from ...ml.models.defaults import default_ner def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - ner = EntityRecognizer(en_vocab) + ner = EntityRecognizer(en_vocab, default_ner()) ner.begin_training([]) ner(doc) assert len(list(doc.ents)) == 0 @@ -24,7 +25,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - ner = EntityRecognizer(en_vocab) + ner = EntityRecognizer(en_vocab, default_ner()) ner.begin_training([]) ner(doc) assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index fe847a6ae..5af772ddc 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -3,6 +3,8 @@ from thinc.api import Adam, NumpyOps from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab + +from spacy.ml.models.defaults import default_parser, default_ner from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.util import 
fix_random_seed @@ -15,7 +17,7 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = DependencyParser(vocab) + parser = DependencyParser(vocab, default_parser()) return parser @@ -55,27 +57,31 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - ner1 = EntityRecognizer(Vocab()) + ner1 = EntityRecognizer(Vocab(), default_ner()) ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") ner1.begin_training([]) - ner2 = EntityRecognizer(Vocab()).from_bytes(ner1.to_bytes()) + ner2 = EntityRecognizer(Vocab(), default_ner()) + + # the second model needs to be resized before we can call from_bytes + ner2.model.resize_output(ner1.moves.n_moves) + ner2.from_bytes(ner1.to_bytes()) assert ner1.moves.n_moves == ner2.moves.n_moves for i in range(ner1.moves.n_moves): assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i) @pytest.mark.parametrize( - "pipe_cls,n_moves", [(DependencyParser, 5), (EntityRecognizer, 4)] + "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())] ) -def test_add_label_get_label(pipe_cls, n_moves): +def test_add_label_get_label(pipe_cls, n_moves, model): """Test that added labels are returned correctly. This test was added to test for a bug in DependencyParser.labels that'd cause it to fail when splitting the move names. 
""" labels = ["A", "B", "C"] - pipe = pipe_cls(Vocab()) + pipe = pipe_cls(Vocab(), model) for label in labels: pipe.add_label(label) assert len(pipe.move_names) == len(labels) * n_moves diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index dd593f7d3..2426805d2 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,5 +1,7 @@ import pytest from spacy.vocab import Vocab + +from spacy.ml.models.defaults import default_parser from spacy.pipeline import DependencyParser from spacy.tokens import Doc from spacy.gold import GoldParse @@ -136,7 +138,7 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) - parser = DependencyParser(doc.vocab) + parser = DependencyParser(doc.vocab, default_parser()) parser.moves.add_action(0, "") parser.moves.add_action(1, "") parser.moves.add_action(1, "") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 9a4d21a8d..3fde75eb5 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,10 +1,15 @@ import pytest + +from spacy import util from spacy.lang.en import English +from spacy.ml.models.defaults import default_ner from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown from spacy.gold import GoldParse + +from spacy.tests.util import make_tempdir from spacy.tokens import Doc TRAIN_DATA = [ @@ -134,7 +139,7 @@ def test_accept_blocked_token(): # 1. test normal behaviour nlp1 = English() doc1 = nlp1("I live in New York") - ner1 = EntityRecognizer(doc1.vocab) + ner1 = EntityRecognizer(doc1.vocab, default_ner()) assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] @@ -152,7 +157,7 @@ def test_accept_blocked_token(): # 2. 
test blocking behaviour nlp2 = English() doc2 = nlp2("I live in New York") - ner2 = EntityRecognizer(doc2.vocab) + ner2 = EntityRecognizer(doc2.vocab, default_ner()) # set "New York" to a blocked entity doc2.ents = [(0, 3, 5)] @@ -188,7 +193,7 @@ def test_overwrite_token(): assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] # Check that a new ner can overwrite O - ner2 = EntityRecognizer(doc.vocab) + ner2 = EntityRecognizer(doc.vocab, default_ner()) ner2.moves.add_action(5, "") ner2.add_label("GPE") state = ner2.moves.init_batch([doc])[0] @@ -199,6 +204,17 @@ def test_overwrite_token(): assert ner2.moves.is_valid(state, "L-GPE") +def test_empty_ner(): + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("MY_LABEL") + nlp.add_pipe(ner) + nlp.begin_training() + doc = nlp("John is watching the news about Croatia's elections") + # if this goes wrong, the initialization of the parser's upper layer is probably broken + assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + + def test_ruler_before_ner(): """ Test that an NER works after an entity_ruler: the second can add annotations """ nlp = English() @@ -214,7 +230,6 @@ def test_ruler_before_ner(): untrained_ner.add_label("MY_LABEL") nlp.add_pipe(untrained_ner) nlp.begin_training() - doc = nlp("This is Antti Korhonen speaking in Finland") expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_types = ["THING", "", "", "", "", "", ""] @@ -261,28 +276,7 @@ def test_block_ner(): assert [token.ent_type_ for token in doc] == expected_types -def test_change_number_features(): - # Test the default number features - nlp = English() - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - ner.add_label("PERSON") - nlp.begin_training() - assert ner.model.lower.get_dim("nF") == ner.nr_feature - # Test we can change it - nlp = English() - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - ner.add_label("PERSON") - nlp.begin_training( - component_cfg={"ner": 
{"nr_feature_tokens": 3, "token_vector_width": 128}} - ) - assert ner.model.lower.get_dim("nF") == 3 - # Test the model runs - nlp("hello world") - - -def test_overfitting(): +def test_overfitting_IO(): # Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly nlp = English() ner = nlp.create_pipe("ner") @@ -301,11 +295,20 @@ def test_overfitting(): test_text = "I like London." doc = nlp(test_text) ents = doc.ents - assert len(ents) == 1 assert ents[0].text == "London" assert ents[0].label_ == "LOC" + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + ents2 = doc2.ents + assert len(ents2) == 1 + assert ents2[0].text == "London" + assert ents2[0].label_ == "LOC" + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 2470982d3..984af4d6b 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,8 +1,9 @@ import pytest -from spacy.ml.component_models import Tok2Vec +from spacy.ml.models.defaults import default_parser, default_tok2vec from spacy.vocab import Vocab from spacy.syntax.arc_eager import ArcEager from spacy.syntax.nn_parser import Parser +from spacy.syntax._parser_model import ParserModel from spacy.tokens.doc import Doc from spacy.gold import GoldParse @@ -20,19 +21,22 @@ def arc_eager(vocab): @pytest.fixture def tok2vec(): - tok2vec = Tok2Vec(8, 100) + tok2vec = default_tok2vec() tok2vec.initialize() return tok2vec @pytest.fixture def parser(vocab, arc_eager): - return Parser(vocab, moves=arc_eager, model=None) + return Parser(vocab, model=default_parser(), moves=arc_eager) @pytest.fixture -def model(arc_eager, tok2vec): - return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.get_dim("nO"))[0] +def model(arc_eager, tok2vec, 
vocab): + model = default_parser() + model.resize_output(arc_eager.n_moves) + model.initialize() + return model @pytest.fixture @@ -46,11 +50,11 @@ def gold(doc): def test_can_init_nn_parser(parser): - assert parser.model is None + assert isinstance(parser.model, ParserModel) -def test_build_model(parser): - parser.model = Parser.Model(parser.moves.n_moves, hist_size=0)[0] +def test_build_model(parser, vocab): + parser.model = Parser(vocab, model=default_parser(), moves=parser.moves).model assert parser.model is not None diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index 24997e47c..619e0cc0b 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -2,6 +2,7 @@ import pytest import numpy from spacy.vocab import Vocab from spacy.language import Language +from spacy.ml.models.defaults import default_parser from spacy.pipeline import DependencyParser from spacy.syntax.arc_eager import ArcEager from spacy.tokens import Doc @@ -93,7 +94,7 @@ def test_beam_advance_too_few_scores(beam, scores): def test_beam_parse(): nlp = Language() - nlp.add_pipe(DependencyParser(nlp.vocab), name="parser") + nlp.add_pipe(DependencyParser(nlp.vocab, default_parser()), name="parser") nlp.parser.add_label("nsubj") nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) doc = nlp.make_doc("Australia is a country") diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 1d3f522c9..6e13d3044 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,7 +1,8 @@ import pytest from spacy.lang.en import English -from ..util import get_doc, apply_transition_sequence +from ..util import get_doc, apply_transition_sequence, make_tempdir +from ... 
import util TRAIN_DATA = [ ( @@ -182,7 +183,7 @@ def test_parser_set_sent_starts(en_vocab): assert token.head in sent -def test_overfitting(): +def test_overfitting_IO(): # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly nlp = English() parser = nlp.create_pipe("parser") @@ -200,7 +201,15 @@ def test_overfitting(): # test the trained model test_text = "I like securities." doc = nlp(test_text) - assert doc[0].dep_ is "nsubj" assert doc[2].dep_ is "dobj" assert doc[3].dep_ is "punct" + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + assert doc2[0].dep_ is "nsubj" + assert doc2[2].dep_ is "dobj" + assert doc2[3].dep_ is "punct" diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index c6c1240a8..af777aa6b 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -3,6 +3,8 @@ from thinc.api import Adam from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab + +from spacy.ml.models.defaults import default_parser from spacy.tokens import Doc from spacy.pipeline import DependencyParser @@ -14,7 +16,7 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = DependencyParser(vocab) + parser = DependencyParser(vocab, default_parser()) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 # parser.add_label('right') diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index 5c246538c..cda39f6ee 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -111,7 +111,8 @@ def test_component_factories_from_nlp(): nlp.add_pipe(pipe) assert nlp("hello world") # The first argument here is the class itself, so we're accepting any here - mock.assert_called_once_with(ANY, nlp, foo="bar") + # 
The model will be initialized to None by the factory + mock.assert_called_once_with(ANY, nlp, None, foo="bar") def test_analysis_validate_attrs_valid(): diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 366cd4f1a..a90207a78 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,5 +1,9 @@ import pytest + +from spacy import util +from spacy.lang.en import English from spacy.language import Language +from spacy.tests.util import make_tempdir def test_label_types(): @@ -18,9 +22,9 @@ TRAIN_DATA = [ ] -def test_overfitting(): +def test_overfitting_IO(): # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly - nlp = Language() + nlp = English() tagger = nlp.create_pipe("tagger") for tag, values in TAG_MAP.items(): tagger.add_label(tag, values) @@ -35,8 +39,17 @@ def test_overfitting(): # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - assert doc[0].tag_ is "N" assert doc[1].tag_ is "V" assert doc[2].tag_ is "J" assert doc[3].tag_ is "N" + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + assert doc2[0].tag_ is "N" + assert doc2[1].tag_ is "V" + assert doc2[2].tag_ is "J" + assert doc2[3].tag_ is "N" diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 558d09e40..1b5ca9a4c 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,8 +1,12 @@ import pytest import random import numpy.random + +from spacy import util +from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer +from spacy.tests.util import make_tempdir from spacy.tokens import Doc from spacy.gold import GoldParse @@ -74,9 +78,9 @@ def test_label_types(): nlp.get_pipe("textcat").add_label(9) -def 
test_overfitting(): +def test_overfitting_IO(): # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly - nlp = Language() + nlp = English() textcat = nlp.create_pipe("textcat") for _, annotations in TRAIN_DATA: for label, value in annotations.get("cats").items(): @@ -87,11 +91,21 @@ def test_overfitting(): for i in range(50): losses = {} nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) - assert losses["textcat"] < 0.00001 + assert losses["textcat"] < 0.01 # test the trained model test_text = "I am happy." doc = nlp(test_text) cats = doc.cats + # note that by default, exclusive_classes = false so we need a bigger error margin assert cats["POSITIVE"] > 0.9 - assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001) + assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1) + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + cats2 = doc2.cats + assert cats2["POSITIVE"] > 0.9 + assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1) diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 2bfdbd7c3..ff8c7c2fe 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import is_stop from spacy.vectors import Vectors from spacy.vocab import Vocab from spacy.language import Language +from spacy.ml.models.defaults import default_ner, default_tagger from spacy.tokens import Doc, Span, Token from spacy.pipeline import Tagger, EntityRecognizer from spacy.attrs import HEAD, DEP @@ -123,7 +124,7 @@ def test_issue1727(): correctly after vectors are added.""" data = numpy.ones((3, 300), dtype="f") vectors = Vectors(data=data, keys=["I", "am", "Matt"]) - tagger = Tagger(Vocab()) + tagger = Tagger(Vocab(), 
default_tagger()) tagger.add_label("PRP") with pytest.warns(UserWarning): tagger.begin_training() @@ -131,7 +132,7 @@ def test_issue1727(): tagger.vocab.vectors = vectors with make_tempdir() as path: tagger.to_disk(path) - tagger = Tagger(Vocab()).from_disk(path) + tagger = Tagger(Vocab(), default_tagger()).from_disk(path) assert tagger.cfg.get("pretrained_dims", 0) == 0 @@ -236,6 +237,7 @@ def test_issue1889(word): assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) +@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3") def test_issue1915(): cfg = {"hidden_depth": 2} # should error out nlp = Language() @@ -268,7 +270,7 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): - ner = EntityRecognizer(Vocab()) + ner = EntityRecognizer(Vocab(), default_ner()) example = Example(doc=None) example.set_token_annotation( ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label] diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 2c25b6d73..1786677e0 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -32,6 +32,9 @@ def test_issue2179(): nlp.begin_training() nlp2 = Italian() nlp2.add_pipe(nlp2.create_pipe("ner")) + + assert len(nlp2.get_pipe("ner").labels) == 0 + nlp2.get_pipe("ner").model.resize_output(nlp.get_pipe("ner").moves.n_moves) nlp2.from_bytes(nlp.to_bytes()) assert "extra_labels" not in nlp2.get_pipe("ner").cfg assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index cc893e472..df23efa4f 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -1,6 +1,7 @@ import pytest from spacy.lang.en import English from spacy.lang.de import German +from 
spacy.ml.models.defaults import default_ner from spacy.pipeline import EntityRuler, EntityRecognizer from spacy.matcher import Matcher, PhraseMatcher from spacy.tokens import Doc @@ -103,6 +104,7 @@ def test_issue3209(): assert ner.move_names == move_names nlp2 = English() nlp2.add_pipe(nlp2.create_pipe("ner")) + nlp2.get_pipe("ner").model.resize_output(ner.moves.n_moves) nlp2.from_bytes(nlp.to_bytes()) assert nlp2.get_pipe("ner").move_names == move_names @@ -193,7 +195,7 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - ner = EntityRecognizer(doc.vocab) + ner = EntityRecognizer(doc.vocab, default_ner()) # Add the OUT action. I wouldn't have thought this would be necessary... ner.moves.add_action(5, "") ner.add_label("GPE") diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py index 54ce10924..9752f70df 100644 --- a/spacy/tests/regression/test_issue3830.py +++ b/spacy/tests/regression/test_issue3830.py @@ -1,10 +1,12 @@ from spacy.pipeline.pipes import DependencyParser from spacy.vocab import Vocab +from spacy.ml.models.defaults import default_parser + def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" - parser = DependencyParser(Vocab()) + parser = DependencyParser(Vocab(), default_parser()) parser.add_label("nsubj") assert "subtok" not in parser.labels parser.begin_training(lambda: []) @@ -13,7 +15,7 @@ def test_issue3830_no_subtok(): def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" - parser = DependencyParser(Vocab(), learn_tokens=True) + parser = DependencyParser(Vocab(), default_parser(), learn_tokens=True) parser.add_label("nsubj") assert "subtok" not in parser.labels parser.begin_training(lambda: []) diff --git a/spacy/tests/regression/test_issue4042.py 
b/spacy/tests/regression/test_issue4042.py index 6644a8eda..75a1c23b7 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -3,6 +3,7 @@ from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.lang.en import English from spacy.tokens import Span from spacy.util import ensure_path +from spacy.ml.models.defaults import default_ner from ..util import make_tempdir @@ -73,6 +74,6 @@ def test_issue4042_bug2(): output_dir.mkdir() ner1.to_disk(output_dir) - ner2 = EntityRecognizer(vocab) + ner2 = EntityRecognizer(vocab, default_ner()) ner2.from_disk(output_dir) assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index a3f6f69df..30688601f 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -1,5 +1,6 @@ from collections import defaultdict +from spacy.ml.models.defaults import default_ner from spacy.pipeline import EntityRecognizer from spacy.lang.en import English @@ -11,7 +12,7 @@ def test_issue4313(): beam_width = 16 beam_density = 0.0001 nlp = English() - ner = EntityRecognizer(nlp.vocab) + ner = EntityRecognizer(nlp.vocab, default_ner()) ner.add_label("SOME_LABEL") ner.begin_training([]) nlp.add_pipe(ner) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py new file mode 100644 index 000000000..c34d01547 --- /dev/null +++ b/spacy/tests/serialize/test_serialize_config.py @@ -0,0 +1,126 @@ +from thinc.api import Config + +import spacy +from spacy import util +from spacy.lang.en import English +from spacy.util import registry + +from ..util import make_tempdir +from ...ml.models import build_Tok2Vec_model, build_tb_parser_model + +nlp_config_string = """ +[nlp] +lang = "en" + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 342 
+depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.tagger.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} +""" + + +parser_config_string = """ +[model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 99 +hidden_width = 66 +maxout_pieces = 2 + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 333 +depth = 4 +embed_size = 5555 +window_size = 1 +maxout_pieces = 7 +subword_features = false +""" + + +@registry.architectures.register("my_test_parser") +def my_parser(): + tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3, + maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8, + conv_depth=2, bilstm_depth=0) + parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5) + return parser + + +def test_serialize_nlp(): + """ Create a custom nlp pipeline from config and ensure it serializes it correctly """ + nlp_config = Config().from_str(nlp_config_string) + nlp = util.load_model_from_config(nlp_config["nlp"]) + nlp.begin_training() + assert "tok2vec" in nlp.pipe_names + assert "tagger" in nlp.pipe_names + assert "parser" not in nlp.pipe_names + assert nlp.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342 + + with make_tempdir() as d: + nlp.to_disk(d) + nlp2 = spacy.load(d) + assert "tok2vec" in nlp2.pipe_names + assert "tagger" in nlp2.pipe_names + assert "parser" not in nlp2.pipe_names + assert nlp2.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342 + + +def test_serialize_custom_nlp(): + """ Create a custom nlp pipeline and ensure it serializes it correctly""" + nlp = English() + parser_cfg = dict() + parser_cfg["model"] = {'@architectures': 
"my_test_parser"} + parser = nlp.create_pipe("parser", parser_cfg) + nlp.add_pipe(parser) + nlp.begin_training() + + with make_tempdir() as d: + nlp.to_disk(d) + nlp2 = spacy.load(d) + model = nlp2.get_pipe("parser").model + tok2vec = model.get_ref("tok2vec") + upper = model.upper + + # check that we have the correct settings, not the default ones + assert tok2vec.get_dim("nO") == 321 + assert upper.get_dim("nI") == 65 + + +def test_serialize_parser(): + """ Create a non-default parser config to check nlp serializes it correctly """ + nlp = English() + model_config = Config().from_str(parser_config_string) + parser = nlp.create_pipe("parser", config=model_config) + parser.add_label("nsubj") + nlp.add_pipe(parser) + nlp.begin_training() + + with make_tempdir() as d: + nlp.to_disk(d) + nlp2 = spacy.load(d) + model = nlp2.get_pipe("parser").model + tok2vec = model.get_ref("tok2vec") + upper = model.upper + + # check that we have the correct settings, not the default ones + assert upper.get_dim("nI") == 66 + assert tok2vec.get_dim("nO") == 333 diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index 4089a0d07..0e3b7c59f 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -1,5 +1,6 @@ import pytest import re + from spacy.language import Language from spacy.tokenizer import Tokenizer @@ -56,7 +57,7 @@ def test_serialize_language_exclude(meta_data): nlp = Language(meta=meta_data) assert nlp.meta["name"] == name new_nlp = Language().from_bytes(nlp.to_bytes()) - assert nlp.meta["name"] == name + assert new_nlp.meta["name"] == name new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"]) assert not new_nlp.meta["name"] == name new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"])) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 0ad9bc4d4..fe14fba10 100644 --- 
a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,6 +1,7 @@ import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer +from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec from ..util import make_tempdir @@ -10,58 +11,58 @@ test_parsers = [DependencyParser, EntityRecognizer] @pytest.fixture def parser(en_vocab): - parser = DependencyParser(en_vocab) + parser = DependencyParser(en_vocab, default_parser()) parser.add_label("nsubj") - parser.model, cfg = parser.Model(parser.moves.n_moves) - parser.cfg.update(cfg) return parser @pytest.fixture def blank_parser(en_vocab): - parser = DependencyParser(en_vocab) + parser = DependencyParser(en_vocab, default_parser()) return parser @pytest.fixture def taggers(en_vocab): - tagger1 = Tagger(en_vocab) - tagger2 = Tagger(en_vocab) - tagger1.model = tagger1.Model(8) - tagger2.model = tagger1.model - return (tagger1, tagger2) + model = default_tagger() + tagger1 = Tagger(en_vocab, model) + tagger2 = Tagger(en_vocab, model) + return tagger1, tagger2 @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): - parser = Parser(en_vocab) - parser.model, _ = parser.Model(10) - new_parser = Parser(en_vocab) - new_parser.model, _ = new_parser.Model(10) + parser = Parser(en_vocab, default_parser()) + new_parser = Parser(en_vocab, default_parser()) new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"])) - assert new_parser.to_bytes(exclude=["vocab"]) == parser.to_bytes(exclude=["vocab"]) + bytes_2 = new_parser.to_bytes(exclude=["vocab"]) + bytes_3 = parser.to_bytes(exclude=["vocab"]) + assert len(bytes_2) == len(bytes_3) + assert bytes_2 == bytes_3 @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_disk(en_vocab, 
Parser): - parser = Parser(en_vocab) - parser.model, _ = parser.Model(0) + parser = Parser(en_vocab, default_parser()) with make_tempdir() as d: file_path = d / "parser" parser.to_disk(file_path) - parser_d = Parser(en_vocab) - parser_d.model, _ = parser_d.Model(0) + parser_d = Parser(en_vocab, default_parser()) parser_d = parser_d.from_disk(file_path) parser_bytes = parser.to_bytes(exclude=["model", "vocab"]) parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"]) + assert len(parser_bytes) == len(parser_d_bytes) assert parser_bytes == parser_d_bytes def test_to_from_bytes(parser, blank_parser): assert parser.model is not True - assert blank_parser.model is True + assert blank_parser.model is not True assert blank_parser.moves.n_moves != parser.moves.n_moves bytes_data = parser.to_bytes(exclude=["vocab"]) + + # the blank parser needs to be resized before we can call from_bytes + blank_parser.model.resize_output(parser.moves.n_moves) blank_parser.from_bytes(bytes_data) assert blank_parser.model is not True assert blank_parser.moves.n_moves == parser.moves.n_moves @@ -75,8 +76,10 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers): tagger1_b = tagger1.to_bytes() tagger1 = tagger1.from_bytes(tagger1_b) assert tagger1.to_bytes() == tagger1_b - new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b) - assert new_tagger1.to_bytes() == tagger1_b + new_tagger1 = Tagger(en_vocab, default_tagger()).from_bytes(tagger1_b) + new_tagger1_b = new_tagger1.to_bytes() + assert len(new_tagger1_b) == len(tagger1_b) + assert new_tagger1_b == tagger1_b def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): @@ -86,26 +89,24 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): file_path2 = d / "tagger2" tagger1.to_disk(file_path1) tagger2.to_disk(file_path2) - tagger1_d = Tagger(en_vocab).from_disk(file_path1) - tagger2_d = Tagger(en_vocab).from_disk(file_path2) + tagger1_d = Tagger(en_vocab, default_tagger()).from_disk(file_path1) + tagger2_d = 
Tagger(en_vocab, default_tagger()).from_disk(file_path2) assert tagger1_d.to_bytes() == tagger2_d.to_bytes() def test_serialize_tensorizer_roundtrip_bytes(en_vocab): - tensorizer = Tensorizer(en_vocab) - tensorizer.model = tensorizer.Model() + tensorizer = Tensorizer(en_vocab, default_tensorizer()) tensorizer_b = tensorizer.to_bytes(exclude=["vocab"]) - new_tensorizer = Tensorizer(en_vocab).from_bytes(tensorizer_b) + new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b) assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b def test_serialize_tensorizer_roundtrip_disk(en_vocab): - tensorizer = Tensorizer(en_vocab) - tensorizer.model = tensorizer.Model() + tensorizer = Tensorizer(en_vocab, default_tensorizer()) with make_tempdir() as d: file_path = d / "tensorizer" tensorizer.to_disk(file_path) - tensorizer_d = Tensorizer(en_vocab).from_disk(file_path) + tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path) assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes( exclude=["vocab"] ) @@ -113,19 +114,17 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab): def test_serialize_textcat_empty(en_vocab): # See issue #1105 - textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"]) + textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"]) textcat.to_bytes(exclude=["vocab"]) @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_pipe_exclude(en_vocab, Parser): def get_new_parser(): - new_parser = Parser(en_vocab) - new_parser.model, _ = new_parser.Model(0) + new_parser = Parser(en_vocab, default_parser()) return new_parser - parser = Parser(en_vocab) - parser.model, _ = parser.Model(0) + parser = Parser(en_vocab, default_parser()) parser.cfg["foo"] = "bar" new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"])) assert "foo" in new_parser.cfg @@ -144,7 +143,7 @@ def 
test_serialize_pipe_exclude(en_vocab, Parser): def test_serialize_sentencerecognizer(en_vocab): - sr = SentenceRecognizer(en_vocab) + sr = SentenceRecognizer(en_vocab, default_sentrec()) sr_b = sr.to_bytes() - sr_d = SentenceRecognizer(en_vocab).from_bytes(sr_b) + sr_d = SentenceRecognizer(en_vocab, default_sentrec()).from_bytes(sr_b) assert sr.to_bytes() == sr_d.to_bytes() diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 2d10d79d4..310103d10 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -1,6 +1,6 @@ import pytest -from spacy.ml.component_models import Tok2Vec +from spacy.ml.models.tok2vec import build_Tok2Vec_model from spacy.vocab import Vocab from spacy.tokens import Doc @@ -25,7 +25,8 @@ def test_empty_doc(): embed_size = 2000 vocab = Vocab() doc = Doc(vocab, words=[]) - tok2vec = Tok2Vec(width, embed_size) + # TODO: fix tok2vec arguments + tok2vec = build_Tok2Vec_model(width, embed_size) vectors, backprop = tok2vec.begin_update([doc]) assert len(vectors) == 1 assert vectors[0].shape == (0, width) @@ -36,7 +37,19 @@ def test_empty_doc(): ) def test_tok2vec_batch_sizes(batch_size, width, embed_size): batch = get_batch(batch_size) - tok2vec = Tok2Vec(width, embed_size) + tok2vec = build_Tok2Vec_model( + width, + embed_size, + pretrained_vectors=None, + conv_depth=4, + bilstm_depth=0, + window_size=1, + maxout_pieces=3, + subword_features=True, + char_embed=False, + nM=64, + nC=8, + ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) assert len(vectors) == len(batch) @@ -44,19 +57,24 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): assert doc_vec.shape == (len(doc), width) +# fmt: off @pytest.mark.parametrize( "tok2vec_config", [ - {"width": 8, "embed_size": 100, "char_embed": False}, - {"width": 8, "embed_size": 100, "char_embed": True}, - {"width": 8, "embed_size": 100, "conv_depth": 6}, - {"width": 8, "embed_size": 100, "conv_depth": 6}, - {"width": 8, "embed_size": 
100, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, ], ) +# fmt: on def test_tok2vec_configs(tok2vec_config): docs = get_batch(3) - tok2vec = Tok2Vec(**tok2vec_config) + tok2vec = build_Tok2Vec_model(**tok2vec_config) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) diff --git a/spacy/util.py b/spacy/util.py index 465b9645e..286a6574c 
100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -6,8 +6,7 @@ from pathlib import Path import random from typing import List import thinc -import thinc.config -from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu +from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config import functools import itertools import numpy.random @@ -146,6 +145,10 @@ def load_model_from_path(model_path, meta=False, **overrides): pipeline from meta.json and then calls from_disk() with path.""" if not meta: meta = get_model_meta(model_path) + nlp_config = get_model_config(model_path) + if nlp_config.get("nlp", None): + return load_model_from_config(nlp_config["nlp"]) + # Support language factories registered via entry points (e.g. custom # language subclass) while keeping top-level language identifier "lang" lang = meta.get("lang_factory", meta["lang"]) @@ -162,11 +165,30 @@ def load_model_from_path(model_path, meta=False, **overrides): if name not in disable: config = meta.get("pipeline_args", {}).get(name, {}) factory = factories.get(name, name) + if nlp_config.get(name, None): + model_config = nlp_config[name]["model"] + config["model"] = model_config component = nlp.create_pipe(factory, config=config) nlp.add_pipe(component, name=name) return nlp.from_disk(model_path, exclude=disable) +def load_model_from_config(nlp_config): + if "name" in nlp_config: + nlp = load_model(**nlp_config) + elif "lang" in nlp_config: + lang_class = get_lang_class(nlp_config["lang"]) + nlp = lang_class() + else: + raise ValueError(Errors.E993) + if "pipeline" in nlp_config: + for name, component_cfg in nlp_config["pipeline"].items(): + factory = component_cfg.pop("factory") + component = nlp.create_pipe(factory, config=component_cfg) + nlp.add_pipe(component, name=name) + return nlp + + def load_model_from_init_py(init_file, **overrides): """Helper function to use in the `load()` method of a model package's __init__.py. 
@@ -184,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides): return load_model_from_path(data_path, meta, **overrides) -def load_from_config(path, create_objects=False): +def load_config(path, create_objects=False): """Load a Thinc-formatted config file, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. @@ -212,7 +234,7 @@ def get_model_meta(path): raise IOError(Errors.E052.format(path=model_path)) meta_path = model_path / "meta.json" if not meta_path.is_file(): - raise IOError(Errors.E053.format(path=meta_path)) + raise IOError(Errors.E053.format(path=meta_path, name="meta.json")) meta = srsly.read_json(meta_path) for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: @@ -220,6 +242,23 @@ def get_model_meta(path): return meta +def get_model_config(path): + """Get the model's config from a directory path. + + path (unicode or Path): Path to model directory. + RETURNS (Config): The model's config data. + """ + model_path = ensure_path(path) + if not model_path.exists(): + raise IOError(Errors.E052.format(path=model_path)) + config_path = model_path / "config.cfg" + # model directories are allowed not to have config files ? + if not config_path.is_file(): + return Config({}) + # raise IOError(Errors.E053.format(path=config_path, name="config.cfg")) + return Config().from_disk(config_path) + + def is_package(name): """Check if string maps to a package installed via pip. 
From 5da3ad682a0109dad6d2e6e1a45f4a833fefc929 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 28 Feb 2020 11:57:41 +0100 Subject: [PATCH 069/187] Tidy up and auto-format --- spacy/cli/pretrain.py | 2 +- spacy/cli/train.py | 14 +++-- spacy/cli/train_from_config.py | 4 +- spacy/language.py | 51 ++++++++++++------- spacy/ml/models/__init__.py | 12 ++--- spacy/ml/models/entity_linker.py | 4 +- spacy/ml/models/parser.py | 9 ++-- spacy/ml/models/tagger.py | 2 +- spacy/ml/models/textcat.py | 13 +++-- spacy/ml/models/tok2vec.py | 2 +- spacy/pipeline/tok2vec.py | 1 - spacy/tests/parser/test_add_label.py | 3 +- spacy/tests/parser/test_ner.py | 3 +- spacy/tests/regression/test_issue1501-2000.py | 2 +- .../tests/serialize/test_serialize_config.py | 22 ++++++-- .../serialize/test_serialize_pipeline.py | 7 ++- spacy/tests/test_tok2vec.py | 4 +- 17 files changed, 96 insertions(+), 59 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 95d549254..b2e3229ee 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -120,7 +120,7 @@ def pretrain( window_size=1, char_embed=False, nM=64, - nC=8 + nC=8, ), ) # Load in pretrained weights diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 5667bb905..1ca678b85 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -9,7 +9,7 @@ from wasabi import msg import contextlib import random -from ..util import create_default_optimizer, registry +from ..util import create_default_optimizer from ..util import use_gpu as set_gpu from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import GoldCorpus @@ -161,7 +161,10 @@ def train( raise ValueError(f"Component {pipe} currently not supported.") pipe_cfg = util.load_config(config_loc, create_objects=False) if vectors: - pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors} + pretrained_config = { + "@architectures": "spacy.VocabVectors.v1", + "name": vectors, + } pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = 
pretrained_config if pipe == "parser": @@ -202,7 +205,7 @@ def train( msg.text(f"Starting with blank model '{lang}'") lang_cls = util.get_lang_class(lang) nlp = lang_cls() - + if vectors: msg.text(f"Loading vectors from model '{vectors}'") @@ -222,7 +225,10 @@ def train( raise ValueError(f"Component {pipe} currently not supported.") pipe_cfg = util.load_config(config_loc, create_objects=False) if vectors: - pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors} + pretrained_config = { + "@architectures": "spacy.VocabVectors.v1", + "name": vectors, + } pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config if pipe == "parser": diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 0dba8a962..5b09909c7 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,10 +1,8 @@ from typing import Optional, Dict, List, Union, Sequence -from pydantic import BaseModel, FilePath, StrictInt - +from pydantic import BaseModel, FilePath import plac import tqdm from pathlib import Path - from wasabi import msg import thinc import thinc.schedules diff --git a/spacy/language.py b/spacy/language.py index 83f8c9d21..af9f2c157 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -130,7 +130,13 @@ class Language(object): factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)} def __init__( - self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs + self, + vocab=True, + make_doc=True, + max_length=10 ** 6, + meta={}, + config=None, + **kwargs, ): """Initialise a Language object. 
@@ -176,20 +182,29 @@ class Language(object): self.max_length = max_length self._optimizer = None - from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \ - default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \ - default_tensorizer_config, default_tok2vec_config + from .ml.models.defaults import ( + default_tagger_config, + default_parser_config, + default_ner_config, + default_textcat_config, + default_nel_config, + default_morphologizer_config, + default_sentrec_config, + default_tensorizer_config, + default_tok2vec_config, + ) - self.defaults = {"tagger": default_tagger_config(), - "parser": default_parser_config(), - "ner": default_ner_config(), - "textcat": default_textcat_config(), - "entity_linker": default_nel_config(), - "morphologizer": default_morphologizer_config(), - "sentrec": default_sentrec_config(), - "tensorizer": default_tensorizer_config(), - "tok2vec": default_tok2vec_config(), - } + self.defaults = { + "tagger": default_tagger_config(), + "parser": default_parser_config(), + "ner": default_ner_config(), + "textcat": default_textcat_config(), + "entity_linker": default_nel_config(), + "morphologizer": default_morphologizer_config(), + "sentrec": default_sentrec_config(), + "tensorizer": default_tensorizer_config(), + "tok2vec": default_tok2vec_config(), + } @property def path(self): @@ -329,12 +344,14 @@ class Language(object): model_cfg = None del config["model"] if model_cfg is None and default_config is not None: - user_warning(Warnings.W098) + user_warning(Warnings.W098.format(name=name)) model_cfg = default_config["model"] model = None if model_cfg is not None: - self.config[name] = {"model": model_cfg} - model = registry.make_from_config({"model": model_cfg}, validate=True)["model"] + self.config[name] = {"model": model_cfg} + model = registry.make_from_config({"model": model_cfg}, validate=True)[ + "model" + ] return factory(self, model, **config) 
def add_pipe( diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index 56696d581..d44c7cb2e 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,6 +1,6 @@ -from .entity_linker import * -from .parser import * -from .tagger import * -from .tensorizer import * -from .textcat import * -from .tok2vec import * +from .entity_linker import * # noqa +from .parser import * # noqa +from .tagger import * # noqa +from .tensorizer import * # noqa +from .textcat import * # noqa +from .tok2vec import * # noqa diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 0c1762026..9cbaba984 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,9 +1,7 @@ -from pathlib import Path - from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear -from spacy.util import registry +from ...util import registry @registry.architectures.register("spacy.EntityLinker.v1") diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 89f303e2a..d2de10a0e 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -1,11 +1,10 @@ from pydantic import StrictInt - -from spacy.util import registry -from spacy.ml._layers import PrecomputableAffine -from spacy.syntax._parser_model import ParserModel - from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops +from ...util import registry +from .._layers import PrecomputableAffine +from ...syntax._parser_model import ParserModel + @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 92e8be1b2..baca325bd 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -1,6 +1,6 @@ from thinc.api import zero_init, with_array, Softmax, chain, Model -from spacy.util import registry +from ...util import registry 
@registry.architectures.register("spacy.Tagger.v1") diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index d9ac34b99..49679c8cd 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,8 +1,9 @@ -from spacy.attrs import ORTH -from spacy.util import registry -from spacy.ml.extract_ngrams import extract_ngrams +from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic +from thinc.api import SparseLinear, Softmax -from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax +from ...attrs import ORTH +from ...util import registry +from ..extract_ngrams import extract_ngrams @registry.architectures.register("spacy.TextCatCNN.v1") @@ -21,7 +22,9 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None): else: # TODO: experiment with init_w=zero_init linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO")) - model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() + model = ( + tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() + ) model.set_ref("output_layer", linear_layer) model.set_ref("tok2vec", tok2vec) model.set_dim("nO", nO) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 2e0e4c2d4..0d33d010d 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -119,7 +119,7 @@ def hash_embed_bilstm_v1( @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") -def hash_embed_bilstm_v1( +def hash_char_embed_bilstm_v1( pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0 ): # Allows using character embeddings by setting nC, nM and char_embed=True diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index a49f94ca3..2fee6881a 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -10,7 +10,6 @@ from ..util import link_vectors_to_models, minibatch, eg2doc @component("tok2vec", assigns=["doc.tensor"]) class 
Tok2Vec(Pipe): - @classmethod def from_nlp(cls, nlp, model, **cfg): return cls(nlp.vocab, model, **cfg) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 5af772ddc..fb43458ae 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -73,7 +73,8 @@ def test_add_label_deserializes_correctly(): @pytest.mark.parametrize( - "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())] + "pipe_cls,n_moves,model", + [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())], ) def test_add_label_get_label(pipe_cls, n_moves, model): """Test that added labels are returned correctly. This test was added to diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 3fde75eb5..2fd87ead3 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -212,7 +212,8 @@ def test_empty_ner(): nlp.begin_training() doc = nlp("John is watching the news about Croatia's elections") # if this goes wrong, the initialization of the parser's upper layer is probably broken - assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"] + assert [token.ent_iob_ for token in doc] == result def test_ruler_before_ner(): diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index ff8c7c2fe..5f5f0c9eb 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -237,7 +237,7 @@ def test_issue1889(word): assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) -@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3") +@pytest.mark.skip(reason="obsolete with the config refactor of v.3") def test_issue1915(): cfg = {"hidden_depth": 2} # should error out nlp = Language() diff 
--git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index c34d01547..298cddc74 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -58,10 +58,22 @@ subword_features = false @registry.architectures.register("my_test_parser") def my_parser(): - tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3, - maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8, - conv_depth=2, bilstm_depth=0) - parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5) + tok2vec = build_Tok2Vec_model( + width=321, + embed_size=5432, + pretrained_vectors=None, + window_size=3, + maxout_pieces=4, + subword_features=True, + char_embed=True, + nM=64, + nC=8, + conv_depth=2, + bilstm_depth=0, + ) + parser = build_tb_parser_model( + tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 + ) return parser @@ -88,7 +100,7 @@ def test_serialize_custom_nlp(): """ Create a custom nlp pipeline and ensure it serializes it correctly""" nlp = English() parser_cfg = dict() - parser_cfg["model"] = {'@architectures': "my_test_parser"} + parser_cfg["model"] = {"@architectures": "my_test_parser"} parser = nlp.create_pipe("parser", parser_cfg) nlp.add_pipe(parser) nlp.begin_training() diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index fe14fba10..b1070a9e7 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,7 +1,8 @@ import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer -from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec +from spacy.ml.models.defaults import default_parser, 
default_tensorizer, default_tagger +from spacy.ml.models.defaults import default_textcat, default_sentrec from ..util import make_tempdir @@ -114,7 +115,9 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab): def test_serialize_textcat_empty(en_vocab): # See issue #1105 - textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"]) + textcat = TextCategorizer( + en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"] + ) textcat.to_bytes(exclude=["vocab"]) diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 310103d10..e1ad1f0fc 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -67,8 +67,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, 
"embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, ], ) # fmt: on From 37691e6d5deb38fd1788fe0a4761f1bcd66200c5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 28 Feb 2020 12:20:23 +0100 Subject: [PATCH 070/187] Simplify warnings --- spacy/__init__.py | 4 +- spacy/analysis.py | 5 ++- spacy/cli/init_model.py | 5 ++- spacy/displacy/__init__.py | 10 +++-- spacy/errors.py | 66 --------------------------------- spacy/gold.pyx | 5 ++- spacy/kb.pyx | 16 ++++---- spacy/language.py | 17 +++++---- spacy/lexeme.pyx | 5 ++- spacy/matcher/matcher.pyx | 5 ++- spacy/matcher/phrasematcher.pyx | 13 ++++--- spacy/morphology.pyx | 7 ++-- spacy/pipeline/pipes.pyx | 5 ++- spacy/syntax/nn_parser.pyx | 3 +- spacy/tests/doc/test_doc_api.py | 3 +- spacy/tests/doc/test_span.py | 3 +- spacy/tokenizer.pyx | 7 ++-- spacy/tokens/doc.pyx | 12 +++--- spacy/tokens/span.pyx | 10 ++--- spacy/tokens/token.pyx | 7 ++-- spacy/util.py | 7 ++-- 21 files changed, 82 insertions(+), 133 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 2c063ce24..e4e1f6c8e 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -11,7 +11,7 @@ from . import pipeline from .cli.info import info as cli_info from .glossary import explain from .about import __version__ -from .errors import Errors, Warnings, deprecation_warning +from .errors import Errors, Warnings from . 
import util from .util import registry from .language import component @@ -27,7 +27,7 @@ config = registry def load(name, **overrides): depr_path = overrides.get("path") if depr_path not in (True, False, None): - deprecation_warning(Warnings.W001.format(path=depr_path)) + warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning) return util.load_model(name, **overrides) diff --git a/spacy/analysis.py b/spacy/analysis.py index ed6d6b18e..c2600048f 100644 --- a/spacy/analysis.py +++ b/spacy/analysis.py @@ -1,7 +1,8 @@ from wasabi import Printer +import warnings from .tokens import Doc, Token, Span -from .errors import Errors, Warnings, user_warning +from .errors import Errors, Warnings def analyze_pipes(pipeline, name, pipe, index, warn=True): @@ -30,7 +31,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True): if not fulfilled: problems.append(annot) if warn: - user_warning(Warnings.W025.format(name=name, attr=annot)) + warnings.warn(Warnings.W025.format(name=name, attr=annot)) return problems diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index babef106c..4b4949179 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -9,9 +9,10 @@ import gzip import zipfile import srsly from wasabi import msg +import warnings from ..vectors import Vectors -from ..errors import Errors, Warnings, user_warning +from ..errors import Errors, Warnings from ..util import ensure_path, get_lang_class try: @@ -227,7 +228,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): def read_clusters(clusters_loc): clusters = {} if ftfy is None: - user_warning(Warnings.W004) + warnings.warn(Warnings.W004) with clusters_loc.open() as f: for line in tqdm(f): try: diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index e4a8ad666..36b34e5a0 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -4,9 +4,11 @@ spaCy's built in visualization suite for dependencies and named entities. 
DOCS: https://spacy.io/api/top-level#displacy USAGE: https://spacy.io/usage/visualizers """ +import warnings + from .render import DependencyRenderer, EntityRenderer from ..tokens import Doc, Span -from ..errors import Errors, Warnings, user_warning +from ..errors import Errors, Warnings from ..util import is_in_jupyter @@ -85,7 +87,7 @@ def serve( from wsgiref import simple_server if is_in_jupyter(): - user_warning(Warnings.W011) + warnings.warn(Warnings.W011) render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server(host, port, app) @@ -115,7 +117,7 @@ def parse_deps(orig_doc, options={}): """ doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"])) if not doc.is_parsed: - user_warning(Warnings.W005) + warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): with doc.retokenize() as retokenizer: for np in list(doc.noun_chunks): @@ -173,7 +175,7 @@ def parse_ents(doc, options={}): for ent in doc.ents ] if not ents: - user_warning(Warnings.W006) + warnings.warn(Warnings.W006) title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None settings = get_doc_settings(doc) return {"text": doc.text, "ents": ents, "title": title, "settings": settings} diff --git a/spacy/errors.py b/spacy/errors.py index 6afbfc3c6..33603eb1f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,8 +1,3 @@ -import os -import warnings -import inspect - - def add_codes(err_cls): """Add error codes to string messages via class attribute names.""" @@ -583,64 +578,3 @@ class MatchPatternError(ValueError): class AlignmentError(ValueError): pass - - -class ModelsWarning(UserWarning): - pass - - -WARNINGS = { - "user": UserWarning, - "deprecation": DeprecationWarning, - "models": ModelsWarning, -} - - -def _get_warn_types(arg): - if arg == "": # don't show any warnings - return [] - if not arg or arg == "all": # show all available warnings - return WARNINGS.keys() - return 
[w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS] - - -def _get_warn_excl(arg): - if not arg: - return [] - return [w_id.strip() for w_id in arg.split(",")] - - -SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER") -SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES")) -SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE")) - - -def user_warning(message): - _warn(message, "user") - - -def deprecation_warning(message): - _warn(message, "deprecation") - - -def models_warning(message): - _warn(message, "models") - - -def _warn(message, warn_type="user"): - """ - message (unicode): The message to display. - category (Warning): The Warning to show. - """ - if message.startswith("["): - w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string - else: - w_id = None - ignore_warning = w_id and w_id in SPACY_WARNING_IGNORE - if warn_type in SPACY_WARNING_TYPES and not ignore_warning: - category = WARNINGS[warn_type] - stack = inspect.stack()[-1] - with warnings.catch_warnings(): - if SPACY_WARNING_FILTER: - warnings.simplefilter(SPACY_WARNING_FILTER, category) - warnings.warn_explicit(message, category, stack[1], stack[2]) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index eca801176..37d092395 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -7,10 +7,11 @@ import shutil import itertools from pathlib import Path import srsly +import warnings from .syntax import nonproj from .tokens import Doc, Span -from .errors import Errors, AlignmentError, user_warning, Warnings +from .errors import Errors, AlignmentError, Warnings from . 
import util @@ -550,7 +551,7 @@ def _json_iterate(loc): py_raw = file_.read() cdef long file_length = len(py_raw) if file_length > 2 ** 30: - user_warning(Warnings.W027.format(size=file_length)) + warnings.warn(Warnings.W027.format(size=file_length)) raw = py_raw cdef int square_depth = 0 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 64fbb1e29..797702d23 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,16 +1,18 @@ # cython: infer_types=True # cython: profile=True -from pathlib import Path from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from cpython.exc cimport PyErr_SetFromErrno from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t -from os import path from libcpp.vector cimport vector +from pathlib import Path +import warnings +from os import path + from .typedefs cimport hash_t -from .errors import Errors, Warnings, user_warning +from .errors import Errors, Warnings cdef class Candidate: @@ -110,7 +112,7 @@ cdef class KnowledgeBase: # Return if this entity was added before if entity_hash in self._entry_index: - user_warning(Warnings.W018.format(entity=entity)) + warnings.warn(Warnings.W018.format(entity=entity)) return # Raise an error if the provided entity vector is not of the correct length @@ -142,7 +144,7 @@ cdef class KnowledgeBase: # only process this entity if its unique ID hadn't been added before entity_hash = self.vocab.strings.add(entity_list[i]) if entity_hash in self._entry_index: - user_warning(Warnings.W018.format(entity=entity_list[i])) + warnings.warn(Warnings.W018.format(entity=entity_list[i])) else: entity_vector = vector_list[i] @@ -190,7 +192,7 @@ cdef class KnowledgeBase: # Check whether this alias was added before if alias_hash in self._alias_index: - user_warning(Warnings.W017.format(alias=alias)) + warnings.warn(Warnings.W017.format(alias=alias)) return cdef vector[int64_t] entry_indices @@ -247,7 +249,7 @@ cdef class KnowledgeBase: if is_present: if not 
ignore_warnings: - user_warning(Warnings.W024.format(entity=entity, alias=alias)) + warnings.warn(Warnings.W024.format(entity=entity, alias=alias)) else: entry_indices.push_back(int(entry_index)) alias_entry.entry_indices = entry_indices diff --git a/spacy/language.py b/spacy/language.py index af9f2c157..9f5f9d86a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -5,6 +5,7 @@ import functools from contextlib import contextmanager from copy import copy, deepcopy from pathlib import Path +import warnings from thinc.api import get_current_ops, Config import srsly @@ -26,7 +27,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop -from .errors import Errors, Warnings, deprecation_warning, user_warning +from .errors import Errors, Warnings from . import util from . import about @@ -340,11 +341,11 @@ class Language(object): if "model" in config: model_cfg = config["model"] if not isinstance(model_cfg, dict): - user_warning(Warnings.W099.format(type=type(model_cfg), pipe=name)) + warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name)) model_cfg = None del config["model"] if model_cfg is None and default_config is not None: - user_warning(Warnings.W098.format(name=name)) + warnings.warn(Warnings.W098.format(name=name)) model_cfg = default_config["model"] model = None if model_cfg is not None: @@ -779,7 +780,7 @@ class Language(object): # raw_texts will be used later to stop iterator. 
texts, raw_texts = itertools.tee(texts) if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) if n_process == -1: n_process = mp.cpu_count() if as_tuples: @@ -915,7 +916,7 @@ class Language(object): DOCS: https://spacy.io/api/language#to_disk """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable path = util.ensure_path(path) serializers = {} @@ -949,7 +950,7 @@ class Language(object): DOCS: https://spacy.io/api/language#from_disk """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable path = util.ensure_path(path) deserializers = {} @@ -987,7 +988,7 @@ class Language(object): DOCS: https://spacy.io/api/language#to_bytes """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable serializers = {} serializers["vocab"] = lambda: self.vocab.to_bytes() @@ -1013,7 +1014,7 @@ class Language(object): DOCS: https://spacy.io/api/language#from_bytes """ if disable is not None: - deprecation_warning(Warnings.W014) + warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable deserializers = {} deserializers["config.cfg"] = lambda b: self.config.from_bytes(b) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 5910ebfe1..20e175f03 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -7,6 +7,7 @@ np.import_array() import numpy from thinc.api import get_array_module +import warnings from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE @@ -15,7 +16,7 @@ from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT from .attrs cimport IS_CURRENCY, IS_OOV, PROB from .attrs import intify_attrs -from .errors import Errors, Warnings, user_warning +from .errors import Errors, Warnings memset(&EMPTY_LEXEME, 0, 
sizeof(LexemeC)) @@ -124,7 +125,7 @@ cdef class Lexeme: if self.c.orth == other[0].orth: return 1.0 if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj="Lexeme")) + warnings.warn(Warnings.W008.format(obj="Lexeme")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 4258fdb6a..735bc5a44 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64 import re import srsly +import warnings from ..typedefs cimport attr_t from ..structs cimport TokenC @@ -16,7 +17,7 @@ from ..tokens.token cimport Token from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA from ..schemas import validate_token_pattern -from ..errors import Errors, MatchPatternError, Warnings, deprecation_warning +from ..errors import Errors, MatchPatternError, Warnings from ..strings import get_string_id from ..attrs import IDS @@ -188,7 +189,7 @@ cdef class Matcher: YIELDS (Doc): Documents, in order. 
""" if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) if as_tuples: for doc, context in docs: diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 961a318f6..b17a53e3a 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -1,16 +1,17 @@ # cython: infer_types=True # cython: profile=True from libc.stdint cimport uintptr_t - from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter +import warnings + from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA from ..structs cimport TokenC from ..tokens.token cimport Token from ..typedefs cimport attr_t from ..schemas import TokenPattern -from ..errors import Errors, Warnings, deprecation_warning, user_warning +from ..errors import Errors, Warnings cdef class PhraseMatcher: @@ -37,7 +38,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#init """ if max_length != 0: - deprecation_warning(Warnings.W010) + warnings.warn(Warnings.W010, DeprecationWarning) self.vocab = vocab self._callbacks = {} self._docs = {} @@ -193,7 +194,7 @@ cdef class PhraseMatcher: if self._validate and (doc.is_tagged or doc.is_parsed) \ and self.attr not in (DEP, POS, TAG, LEMMA): string_attr = self.vocab.strings[self.attr] - user_warning(Warnings.W012.format(key=key, attr=string_attr)) + warnings.warn(Warnings.W012.format(key=key, attr=string_attr)) keyword = self._convert_to_array(doc) else: keyword = doc @@ -202,7 +203,7 @@ cdef class PhraseMatcher: current_node = self.c_map for token in keyword: if token == self._terminal_hash: - user_warning(Warnings.W021) + warnings.warn(Warnings.W021) break result = map_get(current_node, token) if not result: @@ -304,7 +305,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#pipe """ if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) if as_tuples: for doc, context in stream: matches = 
self(doc) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 3003d118f..89870b121 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -3,6 +3,7 @@ from libc.string cimport memset import srsly from collections import Counter import numpy +import warnings from .strings import get_string_id from . import symbols @@ -11,7 +12,7 @@ from .attrs import LEMMA, intify_attrs from .parts_of_speech cimport SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme -from .errors import Errors, Warnings, user_warning +from .errors import Errors, Warnings from .util import ensure_path @@ -39,7 +40,7 @@ def _normalize_props(props): elif isinstance(key, (int, str)) and isinstance(value, (int, str)): out[key] = value else: - user_warning(Warnings.W028.format(feature={key: value})) + warnings.warn(Warnings.W028.format(feature={key: value})) return out @@ -109,7 +110,7 @@ cdef class Morphology: return tag_ptr.key features = self.feats_to_dict(features) if not isinstance(features, dict): - user_warning(Warnings.W028.format(feature=features)) + warnings.warn(Warnings.W028.format(feature=features)) features = {} features = _normalize_props(features) string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index b9bf1ccd6..3b74d2960 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -5,6 +5,7 @@ import srsly import random from thinc.api import CosineDistance, to_categorical, get_array_module from thinc.api import set_dropout_rate +import warnings from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser @@ -21,7 +22,7 @@ from ..attrs import POS, ID from ..util import link_vectors_to_models, create_default_optimizer from ..parts_of_speech import X from ..kb import KnowledgeBase -from ..errors import Errors, TempErrors, user_warning, Warnings +from ..errors import Errors, TempErrors, 
Warnings from .. import util @@ -525,7 +526,7 @@ class Tagger(Pipe): **kwargs): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): - user_warning(Warnings.W022) + warnings.warn(Warnings.W022) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = {} for example in get_examples(): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 9381fab6b..312ae9d61 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -19,6 +19,7 @@ from itertools import islice import srsly import numpy.random import numpy +import warnings from ..gold import Example from ..typedefs cimport weight_t, class_t, hash_t @@ -31,7 +32,7 @@ from ..util import link_vectors_to_models, create_default_optimizer, registry from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, user_warning, Warnings +from ..errors import Errors, Warnings from .. 
import util from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 4323bb736..87a8f4585 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -2,7 +2,6 @@ import pytest import numpy from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from spacy.errors import ModelsWarning from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP from ..util import get_doc @@ -213,7 +212,7 @@ def test_doc_api_similarity_match(): assert doc.similarity(doc[0]) == 1.0 assert doc.similarity(doc.vocab["a"]) == 1.0 doc2 = Doc(doc.vocab, words=["a", "b", "c"]) - with pytest.warns(ModelsWarning): + with pytest.warns(UserWarning): assert doc.similarity(doc2[:1]) == 1.0 assert doc.similarity(doc2) == 0.0 diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index d7b91d476..43c699d21 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -2,7 +2,6 @@ import pytest from spacy.attrs import ORTH, LENGTH from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from spacy.errors import ModelsWarning from spacy.util import filter_spans from ..util import get_doc @@ -121,7 +120,7 @@ def test_span_similarity_match(): doc = Doc(Vocab(), words=["a", "b", "a", "b"]) span1 = doc[:2] span2 = doc[2:] - with pytest.warns(ModelsWarning): + with pytest.warns(UserWarning): assert span1.similarity(span2) == 1.0 assert span1.similarity(doc) == 0.0 assert span1[:1].similarity(doc.vocab["a"]) == 1.0 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 25d9f239d..f31c8a0e5 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -11,13 +11,14 @@ from preshed.maps cimport PreshMap cimport cython import re +import warnings from .tokens.doc cimport Doc from .strings cimport hash_string from .attrs import intify_attrs from .symbols import ORTH -from .errors import Errors, Warnings, deprecation_warning +from 
.errors import Errors, Warnings from . import util from .attrs import intify_attrs from .lexeme cimport EMPTY_LEXEME @@ -128,7 +129,7 @@ cdef class Tokenizer: return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): - deprecation_warning(Warnings.W002) + warnings.warn(Warnings.W002, DeprecationWarning) return Doc(self.vocab, words=strings) def __call__(self, unicode string): @@ -216,7 +217,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#pipe """ if n_threads != -1: - deprecation_warning(Warnings.W016) + warnings.warn(Warnings.W016, DeprecationWarning) for text in texts: yield self(text) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 54d92f8b1..14c6d0bbb 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -13,6 +13,7 @@ import struct import srsly from thinc.api import get_array_module from thinc.util import copy_array +import warnings from .span cimport Span from .token cimport Token @@ -26,7 +27,6 @@ from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..attrs import intify_attrs, IDS from ..util import normalize_slice from ..compat import copy_reg, pickle -from ..errors import deprecation_warning, models_warning, user_warning from ..errors import Errors, Warnings from .. import util from .underscore import Underscore, get_ext_args @@ -388,9 +388,9 @@ cdef class Doc: else: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj="Doc")) + warnings.warn(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj="Doc")) + warnings.warn(Warnings.W008.format(obj="Doc")) return 0.0 vector = self.vector xp = get_array_module(vector) @@ -1024,10 +1024,10 @@ cdef class Doc: indices did not fall at token boundaries. 
""" cdef unicode tag, lemma, ent_type - deprecation_warning(Warnings.W013.format(obj="Doc")) + warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning) # TODO: ENT_KB_ID ? if len(args) == 3: - deprecation_warning(Warnings.W003) + warnings.warn(Warnings.W003, DeprecationWarning) tag, lemma, ent_type = args attributes[TAG] = tag attributes[LEMMA] = lemma @@ -1167,7 +1167,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: while not heads_within_sents: heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) if loop_count > 10: - user_warning(Warnings.W026) + warnings.warn(Warnings.W026) loop_count += 1 # Set sentence starts for i in range(length): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index d6b50b5f4..b6ff763b0 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -6,6 +6,7 @@ import numpy import numpy.linalg from thinc.api import get_array_module from collections import defaultdict +import warnings from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix from .token cimport TokenC @@ -18,8 +19,7 @@ from ..lexeme cimport Lexeme from ..symbols cimport dep from ..util import normalize_slice -from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning -from ..errors import deprecation_warning +from ..errors import Errors, TempErrors, Warnings from .underscore import Underscore, get_ext_args @@ -287,7 +287,7 @@ cdef class Span: attributes are inherited from the syntactic root token of the span. RETURNS (Token): The newly merged token. 
""" - deprecation_warning(Warnings.W013.format(obj="Span")) + warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning) return self.doc.merge(self.start_char, self.end_char, *args, **attributes) @@ -326,9 +326,9 @@ cdef class Span: else: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj="Span")) + warnings.warn(Warnings.W007.format(obj="Span")) if self.vector_norm == 0.0 or other.vector_norm == 0.0: - user_warning(Warnings.W008.format(obj="Span")) + warnings.warn(Warnings.W008.format(obj="Span")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 379da6c77..023581d1f 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -8,6 +8,7 @@ np.import_array() import numpy from thinc.api import get_array_module +import warnings from ..typedefs cimport hash_t from ..lexeme cimport Lexeme @@ -20,7 +21,7 @@ from ..symbols cimport conj from .. import parts_of_speech from .. 
import util -from ..errors import Errors, Warnings, user_warning, models_warning +from ..errors import Errors, Warnings from .underscore import Underscore, get_ext_args from .morphanalysis cimport MorphAnalysis @@ -205,9 +206,9 @@ cdef class Token: if self.c.lex.orth == other.orth: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj="Token")) + warnings.warn(Warnings.W007.format(obj="Token")) if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj="Token")) + warnings.warn(Warnings.W008.format(obj="Token")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/util.py b/spacy/util.py index 286a6574c..216158e52 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,6 +13,7 @@ import numpy.random import srsly import catalogue import sys +import warnings try: @@ -22,7 +23,7 @@ except ImportError: from .symbols import ORTH from .compat import cupy, CudaStream -from .errors import Errors, Warnings, deprecation_warning, user_warning +from .errors import Errors, Warnings _PRINT_ENV = False @@ -731,7 +732,7 @@ def get_serialization_exclude(serializers, exclude, kwargs): options = [name.split(".")[0] for name in serializers] for key, value in kwargs.items(): if key in ("vocab",) and value is False: - deprecation_warning(Warnings.W015.format(arg=key)) + warnings.warn(Warnings.W015.format(arg=key), DeprecationWarning) exclude.append(key) elif key.split(".")[0] in options: raise ValueError(Errors.E128.format(arg=key)) @@ -776,7 +777,7 @@ def link_vectors_to_models(vocab): if vectors.name is None: vectors.name = VECTORS_KEY if vectors.data.size != 0: - user_warning(Warnings.W020.format(shape=vectors.data.shape)) + warnings.warn(Warnings.W020.format(shape=vectors.data.shape)) for word in vocab: if word.orth in vectors.key2row: word.rank = vectors.key2row[word.orth] From 7efaa76168103b4c6e13d1852d805e34418666cf Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 28 Feb 2020 
12:23:31 +0100 Subject: [PATCH 071/187] Update errors.py --- spacy/errors.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 33603eb1f..947898b31 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -86,9 +86,8 @@ class Warnings(object): "lemmatization rules or data. This means that the trained model " "may not be able to lemmatize correctly. If this is intentional " "or the language you're using doesn't have lemmatization data, " - "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. " - "If this is surprising, make sure you have the spacy-lookups-data " - "package installed.") + "you can ignore this warning. If this is surprising, make sure you " + "have the spacy-lookups-data package installed.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " From 648f61d07710f53b2972d1925f59d23a0f9247e4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 2 Mar 2020 11:48:10 +0100 Subject: [PATCH 072/187] Tidy up compiler flags and imports (#5071) --- spacy/gold.pxd | 3 ++- spacy/kb.pxd | 4 ++-- spacy/kb.pyx | 9 +++++---- spacy/lexeme.pxd | 5 ++--- spacy/matcher/dependencymatcher.pyx | 8 ++++---- spacy/matcher/matcher.pyx | 3 +-- spacy/matcher/phrasematcher.pxd | 1 - spacy/matcher/phrasematcher.pyx | 4 +--- spacy/morphology.pxd | 6 +++--- spacy/morphology.pyx | 10 ++++++---- spacy/pipeline/morphologizer.pyx | 16 ++++++++-------- spacy/pipeline/pipes.pyx | 3 +-- spacy/strings.pxd | 1 - spacy/strings.pyx | 4 +++- spacy/structs.pxd | 6 ++---- spacy/syntax/_beam_utils.pyx | 13 ++++++------- spacy/syntax/_parser_model.pyx | 20 ++++++++++---------- spacy/syntax/_state.pxd | 2 -- spacy/syntax/arc_eager.pxd | 6 +----- spacy/syntax/arc_eager.pyx | 21 +++++++++++---------- spacy/syntax/ner.pyx | 2 ++ spacy/syntax/nn_parser.pyx | 21 ++++++++++----------- spacy/syntax/nonproj.pyx | 6 +++--- 
spacy/syntax/transition_system.pxd | 1 - spacy/syntax/transition_system.pyx | 4 +++- spacy/tokenizer.pxd | 1 - spacy/tokenizer.pyx | 7 +++---- spacy/tokens/_retokenize.pyx | 4 +--- spacy/tokens/doc.pyx | 6 ++---- spacy/tokens/span.pyx | 1 + spacy/tokens/token.pxd | 1 + spacy/tokens/token.pyx | 2 +- spacy/vocab.pxd | 1 - 33 files changed, 95 insertions(+), 107 deletions(-) diff --git a/spacy/gold.pxd b/spacy/gold.pxd index aea691130..c5ab6ebbe 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -1,9 +1,10 @@ from cymem.cymem cimport Pool -from .tokens import Doc from .typedefs cimport attr_t from .syntax.transition_system cimport Transition +from .tokens import Doc + cdef struct GoldParseC: int* tags diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 518ce0f4e..53038b5db 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -1,15 +1,15 @@ """Knowledge-base for entity or concept linking.""" from cymem.cymem cimport Pool from preshed.maps cimport PreshMap - from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t from libc.stdio cimport FILE from .vocab cimport Vocab from .typedefs cimport hash_t - from .structs cimport KBEntryC, AliasC + + ctypedef vector[KBEntryC] entry_vec ctypedef vector[AliasC] alias_vec ctypedef vector[float] float_vec diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 64fbb1e29..4d6b47c55 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,15 +1,16 @@ -# cython: infer_types=True -# cython: profile=True -from pathlib import Path +# cython: infer_types=True, profile=True from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from cpython.exc cimport PyErr_SetFromErrno from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t -from os import path from libcpp.vector cimport vector +from pathlib import Path +from os import path + from .typedefs cimport hash_t + from .errors import Errors, Warnings, user_warning diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 048f8016e..e73f1e700 
100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,13 +1,12 @@ +from numpy cimport ndarray + from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t from .attrs cimport attr_id_t from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG - from .structs cimport LexemeC, SerializedLexemeC from .strings cimport StringStore from .vocab cimport Vocab -from numpy cimport ndarray - cdef LexemeC EMPTY_LEXEME diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index f94c66cb0..ff707a71c 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -1,7 +1,9 @@ -# cython: infer_types=True -# cython: profile=True +# cython: infer_types=True, profile=True from cymem.cymem cimport Pool from preshed.maps cimport PreshMap +from libcpp cimport bool + +import numpy from .matcher cimport Matcher from ..vocab cimport Vocab @@ -10,8 +12,6 @@ from ..tokens.doc cimport Doc from .matcher import unpickle_matcher from ..errors import Errors -from libcpp cimport bool -import numpy DELIMITER = "||" INDEX_HEAD = 1 diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 4258fdb6a..9dcf0ded9 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1,5 +1,4 @@ -# cython: infer_types=True -# cython: profile=True +# cython: infer_types=True, cython: profile=True from libcpp.vector cimport vector from libc.stdint cimport int32_t from cymem.cymem cimport Pool diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd index a8e5e5085..3b42f3fab 100644 --- a/spacy/matcher/phrasematcher.pxd +++ b/spacy/matcher/phrasematcher.pxd @@ -1,5 +1,4 @@ from libcpp.vector cimport vector - from cymem.cymem cimport Pool from preshed.maps cimport key_t, MapStruct diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 961a318f6..297b05fbc 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ 
-1,7 +1,5 @@ -# cython: infer_types=True -# cython: profile=True +# cython: infer_types=True, profile=True from libc.stdint cimport uintptr_t - from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 1e8c255b8..c57e3a1db 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -8,14 +8,14 @@ from .structs cimport TokenC, MorphAnalysisC from .strings cimport StringStore from .typedefs cimport hash_t, attr_t, flags_t from .parts_of_speech cimport univ_pos_t - from . cimport symbols + cdef class Morphology: cdef readonly Pool mem cdef readonly StringStore strings cdef PreshMap tags # Keyed by hash, value is pointer to tag - + cdef public object lemmatizer cdef readonly object tag_map cdef readonly object tag_names @@ -26,7 +26,7 @@ cdef class Morphology: cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except * cdef int insert(self, MorphAnalysisC tag) except -1 - + cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 3003d118f..47df5800e 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,18 +1,20 @@ # cython: infer_types from libc.string cimport memset + import srsly from collections import Counter import numpy -from .strings import get_string_id -from . import symbols from .attrs cimport POS, IS_SPACE -from .attrs import LEMMA, intify_attrs from .parts_of_speech cimport SPACE -from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme + +from .strings import get_string_id +from .attrs import LEMMA, intify_attrs +from .parts_of_speech import IDS as POS_IDS from .errors import Errors, Warnings, user_warning from .util import ensure_path +from . 
import symbols def _normalize_props(props): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index b6a6045d1..be9b166bf 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,20 +1,20 @@ -from collections import defaultdict - -import numpy cimport numpy as np +import numpy +from collections import defaultdict from thinc.api import chain, list2array, to_categorical, get_array_module from thinc.util import copy_array -from .. import util -from .pipes import Pipe -from ..language import component -from ..util import link_vectors_to_models, create_default_optimizer -from ..errors import Errors, TempErrors from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology +from .. import util +from ..language import component +from ..util import link_vectors_to_models, create_default_optimizer +from ..errors import Errors, TempErrors +from .pipes import Pipe + @component("morphologizer", assigns=["token.morph", "token.pos"]) class Morphologizer(Pipe): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index b9bf1ccd6..b0cb8585f 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1,5 +1,4 @@ -# cython: infer_types=True -# cython: profile=True +# cython: infer_types=True, profile=True import numpy import srsly import random diff --git a/spacy/strings.pxd b/spacy/strings.pxd index e436fb33b..ba2476ec7 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -1,7 +1,6 @@ from libc.stdint cimport int64_t from libcpp.vector cimport vector from libcpp.set cimport set - from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from murmurhash.mrmr cimport hash64 diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 0605de96c..a30f11729 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -4,11 +4,13 @@ from libc.string cimport memcpy from libcpp.set cimport set from libc.stdint cimport uint32_t from murmurhash.mrmr cimport hash64, 
hash32 + import srsly +from .typedefs cimport hash_t + from .symbols import IDS as SYMBOLS_BY_STR from .symbols import NAMES as SYMBOLS_BY_INT -from .typedefs cimport hash_t from .errors import Errors from . import util diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 259fd657d..f140a4220 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -1,11 +1,9 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t - -from .typedefs cimport flags_t, attr_t, hash_t -from .parts_of_speech cimport univ_pos_t - from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t +from .typedefs cimport flags_t, attr_t, hash_t +from .parts_of_speech cimport univ_pos_t cdef struct LexemeC: diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 32cf9193a..03702e54e 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -1,18 +1,19 @@ -# cython: infer_types=True -# cython: profile=True +# cython: infer_types=True, profile=True cimport numpy as np -import numpy from cpython.ref cimport PyObject, Py_XDECREF from thinc.extra.search cimport Beam -from thinc.extra.search import MaxViolation from thinc.extra.search cimport MaxViolation +from thinc.extra.search import MaxViolation +import numpy + from ..typedefs cimport hash_t, class_t from .transition_system cimport TransitionSystem, Transition from ..gold cimport GoldParse -from ..errors import Errors from .stateclass cimport StateC, StateClass +from ..errors import Errors + # These are passed as callbacks to thinc.search.Beam cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: @@ -326,5 +327,3 @@ def cleanup_beam(Beam beam): seen.add(addr) else: raise ValueError(Errors.E023.format(addr=addr, i=i)) - - diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 7ff9517a5..e36a2a28b 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -1,9 +1,5 @@ -# cython: 
infer_types=True -# cython: cdivision=True -# cython: boundscheck=False -import numpy +# cython: infer_types=True, cdivision=True, boundscheck=False cimport cython.parallel -import numpy.random cimport numpy as np from libc.math cimport exp from libcpp.vector cimport vector @@ -11,21 +7,25 @@ from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc from cymem.cymem cimport Pool from thinc.extra.search cimport Beam -from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops from thinc.backends.linalg cimport Vec, VecVec cimport blis.cy +import numpy +import numpy.random +from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops + from ..typedefs cimport weight_t, class_t, hash_t -from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, TempErrors -from .. import util from .stateclass cimport StateClass from .transition_system cimport Transition + +from ..compat import copy_array +from ..errors import Errors, TempErrors +from ..util import link_vectors_to_models, create_default_optimizer +from .. import util from . import _beam_utils from . 
import nonproj -from ..util import link_vectors_to_models, create_default_optimizer cdef WeightsC get_c_weights(model) except *: diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 141d796a4..fef4f0c92 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -1,9 +1,7 @@ from libc.string cimport memcpy, memset, memmove from libc.stdlib cimport malloc, calloc, free from libc.stdint cimport uint32_t, uint64_t - from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno - from murmurhash.mrmr cimport hash64 from ..vocab cimport EMPTY_LEXEME diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 9e9593eee..14d706548 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -1,10 +1,7 @@ from cymem.cymem cimport Pool -from ..typedefs cimport weight_t - from .stateclass cimport StateClass -from ..typedefs cimport attr_t - +from ..typedefs cimport weight_t, attr_t from .transition_system cimport TransitionSystem, Transition from ..gold cimport GoldParseC @@ -15,4 +12,3 @@ cdef class ArcEager(TransitionSystem): cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil - diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 5ec169428..19be95f3f 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -1,23 +1,24 @@ -# cython: profile=True -# cython: cdivision=True -# cython: infer_types=True +# cython: profile=True, cdivision=True, infer_types=True from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool -from collections import defaultdict, Counter from thinc.extra.search cimport Beam + +from collections import defaultdict, Counter import json -from .nonproj import is_nonproj_tree from ..typedefs cimport hash_t, attr_t from ..strings cimport hash_string -from .stateclass cimport StateClass -from ._state cimport StateC -from . 
import nonproj -from .transition_system cimport move_cost_func_t, label_cost_func_t from ..gold cimport GoldParse, GoldParseC from ..structs cimport TokenC -from ..errors import Errors from ..tokens.doc cimport Doc, set_children_from_heads +from .stateclass cimport StateClass +from ._state cimport StateC +from .transition_system cimport move_cost_func_t, label_cost_func_t + +from ..errors import Errors +from .nonproj import is_nonproj_tree +from . import nonproj + # Calculate cost as gold/not gold. We don't use scalar value anyway. cdef int BINARY_COSTS = 1 diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 50b916fe2..ff74be601 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -1,4 +1,5 @@ from thinc.extra.search cimport Beam + from collections import Counter from ..typedefs cimport weight_t @@ -9,6 +10,7 @@ from .transition_system cimport do_func_t from ..gold cimport GoldParseC, GoldParse from ..lexeme cimport Lexeme from ..attrs cimport IS_SPACE + from ..errors import Errors diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 9381fab6b..cf5414628 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -1,6 +1,4 @@ -# cython: infer_types=True -# cython: cdivision=True -# cython: boundscheck=False +# cython: infer_types=True, cdivision=True, boundscheck=False cimport cython.parallel cimport numpy as np from cpython.ref cimport PyObject, Py_XDECREF @@ -20,23 +18,24 @@ import srsly import numpy.random import numpy -from ..gold import Example +from ..tokens.doc cimport Doc +from ..gold cimport GoldParse from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ._parser_model cimport get_c_weights, get_c_sizes -from ._parser_model import ParserModel -from ..util import link_vectors_to_models, 
create_default_optimizer, registry -from ..compat import copy_array -from ..tokens.doc cimport Doc -from ..gold cimport GoldParse -from ..errors import Errors, user_warning, Warnings -from .. import util from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition from . cimport _beam_utils + +from ..gold import Example +from ..util import link_vectors_to_models, create_default_optimizer, registry +from ..compat import copy_array +from ..errors import Errors, user_warning, Warnings +from .. import util +from ._parser_model import ParserModel from . import _beam_utils from . import nonproj diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 27516ffd9..1edb2e65c 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,13 +1,13 @@ -# cython: profile=True -# cython: infer_types=True +# cython: profile=True, infer_types=True """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. 
""" from copy import copy -from ..gold import Example from ..tokens.doc cimport Doc, set_children_from_heads + +from ..gold import Example from ..errors import Errors diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index bd706a997..5fd3b5c5f 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -5,7 +5,6 @@ from ..structs cimport TokenC from ..gold cimport GoldParse from ..gold cimport GoldParseC from ..strings cimport StringStore - from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 6ab83436e..78017c84a 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,16 +1,18 @@ # cython: infer_types=True from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool -from ..typedefs cimport weight_t from thinc.extra.search cimport Beam + from collections import Counter import srsly +from ..typedefs cimport weight_t from . cimport _beam_utils from ..tokens.doc cimport Doc from ..structs cimport TokenC from .stateclass cimport StateClass from ..typedefs cimport attr_t + from ..errors import Errors from .. 
import util diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index ba22f7782..e82833701 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -1,5 +1,4 @@ from libcpp.vector cimport vector - from preshed.maps cimport PreshMap from cymem.cymem cimport Pool diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 25d9f239d..20557366e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,5 +1,4 @@ -# cython: embedsignature=True -# cython: profile=True +# cython: embedsignature=True, profile=True from __future__ import unicode_literals from cython.operator cimport dereference as deref @@ -14,13 +13,13 @@ import re from .tokens.doc cimport Doc from .strings cimport hash_string +from .lexeme cimport EMPTY_LEXEME + from .attrs import intify_attrs from .symbols import ORTH - from .errors import Errors, Warnings, deprecation_warning from . import util from .attrs import intify_attrs -from .lexeme cimport EMPTY_LEXEME from .symbols import ORTH diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 337c154a2..8df38965d 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -1,6 +1,4 @@ -# cython: infer_types=True -# cython: bounds_check=False -# cython: profile=True +# cython: infer_types=True, bounds_check=False, profile=True from libc.string cimport memcpy, memset from libc.stdlib cimport malloc, free from cymem.cymem cimport Pool diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 54d92f8b1..6206a4810 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,12 +1,10 @@ -# cython: infer_types=True -# cython: bounds_check=False -# cython: profile=True +# cython: infer_types=True, bounds_check=False, profile=True cimport cython cimport numpy as np from libc.string cimport memcpy, memset from libc.math cimport sqrt -from collections import Counter +from collections import Counter import numpy import numpy.linalg import struct diff --git a/spacy/tokens/span.pyx 
b/spacy/tokens/span.pyx index d6b50b5f4..bca69461f 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,4 +1,5 @@ from __future__ import unicode_literals + cimport numpy as np from libc.math cimport sqrt diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index 0d25974f3..45c906a82 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -6,6 +6,7 @@ from ..typedefs cimport attr_t, flags_t from ..parts_of_speech cimport univ_pos_t from .doc cimport Doc from ..lexeme cimport Lexeme + from ..errors import Errors diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 379da6c77..a450a9154 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -17,12 +17,12 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP from ..symbols cimport conj +from .morphanalysis cimport MorphAnalysis from .. import parts_of_speech from .. 
import util from ..errors import Errors, Warnings, user_warning, models_warning from .underscore import Underscore, get_ext_args -from .morphanalysis cimport MorphAnalysis cdef class Token: diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index d989d6c40..a95ffb11a 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -1,5 +1,4 @@ from libcpp.vector cimport vector - from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 From 6ac9fc06192c0cdb3ef06f3dcd8f5bee4e39e6b1 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 6 Mar 2020 14:42:23 +0100 Subject: [PATCH 073/187] Unit test for NEL functionality (#5114) * empty begin_training for sentencizer * overfitting unit test for entity linker * fixed NEL IO by storing the entity_vector_length in the cfg --- spacy/pipeline/pipes.pyx | 6 ++ spacy/tests/pipeline/test_entity_linker.py | 72 ++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 0b1bd8ccf..4ee470606 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1490,6 +1490,7 @@ class EntityLinker(Pipe): def to_disk(self, path, exclude=tuple(), **kwargs): serialize = {} + self.cfg["entity_width"] = self.kb.entity_vector_length serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) @@ -1561,6 +1562,11 @@ class Sentencizer(Pipe): def from_nlp(cls, nlp, model=None, **cfg): return cls(**cfg) + def begin_training( + self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs + ): + pass + def __call__(self, example): """Apply the sentencizer to a Doc and set Token.is_sent_start. 
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 9ff5f8194..cdd8451fd 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,8 +1,11 @@ import pytest from spacy.kb import KnowledgeBase + +from spacy import util from spacy.lang.en import English from spacy.pipeline import EntityRuler +from spacy.tests.util import make_tempdir from spacy.tokens import Span @@ -245,3 +248,72 @@ def test_preserving_links_ents_2(nlp): assert len(list(doc.ents)) == 1 assert list(doc.ents)[0].label_ == "LOC" assert list(doc.ents)[0].kb_id_ == "Q1" + + +# fmt: off +TRAIN_DATA = [ + ("Russ Cochran captured his first major title with his son as caddie.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}), + ("Russ Cochran his reprints include EC Comics.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}), + ("Russ Cochran has been publishing comic art.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}), + ("Russ Cochran was a member of University of Kentucky's golf team.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}), +] +GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] +# fmt: on + + +def test_overfitting_IO(): + # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly + nlp = English() + nlp.add_pipe(nlp.create_pipe('sentencizer')) + + # Add a custom component to recognize "Russ Cochran" as an entity for the example training data + ruler = EntityRuler(nlp) + patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + # Convert the texts to docs to make sure we have doc.ents set for the training examples + TRAIN_DOCS = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + annotation_clean = annotation + TRAIN_DOCS.append((doc, annotation_clean)) + + # create artificial KB - assign same prior 
weight to the two russ cochran's + # Q2146908 (Russ Cochran): American golfer + # Q7381115 (Russ Cochran): publisher + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5]) + + # Create the Entity Linker component and add it to the pipeline + entity_linker = nlp.create_pipe("entity_linker") + entity_linker.set_kb(mykb) + nlp.add_pipe(entity_linker, last=True) + + # train the NEL pipe + optimizer = nlp.begin_training() + for i in range(50): + losses = {} + nlp.update(TRAIN_DOCS, sgd=optimizer, losses=losses) + assert losses["entity_linker"] < 0.001 + + # test the trained model + predictions = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + for ent in doc.ents: + predictions.append(ent.kb_id_) + assert predictions == GOLD_entities + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + predictions = [] + for text, annotation in TRAIN_DATA: + doc2 = nlp2(text) + for ent in doc2.ents: + predictions.append(ent.kb_id_) + assert predictions == GOLD_entities From c95ce96c448bd3d3e2a167bd7e7eaee1611c11b0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 6 Mar 2020 14:45:02 +0100 Subject: [PATCH 074/187] Update sentence recognizer (#5109) * Update sentence recognizer * rename `sentrec` to `senter` * use `spacy.HashEmbedCNN.v1` by default * update to follow `Tagger` modifications * remove component methods that can be inherited from `Tagger` * add simple initialization and overfitting pipeline tests * Update serialization test for senter --- spacy/cli/train.py | 16 +++--- spacy/language.py | 8 +-- spacy/ml/models/defaults/__init__.py | 8 +-- ...ntrec_defaults.cfg => senter_defaults.cfg} | 4 +- 
spacy/pipeline/pipes.pyx | 28 ++-------- spacy/tests/pipeline/test_senter.py | 52 +++++++++++++++++++ .../serialize/test_serialize_pipeline.py | 6 +-- 7 files changed, 77 insertions(+), 45 deletions(-) rename spacy/ml/models/defaults/{sentrec_defaults.cfg => senter_defaults.cfg} (75%) create mode 100644 spacy/tests/pipeline/test_senter.py diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 1ca678b85..7eb9bbd3c 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -157,6 +157,8 @@ def train( config_loc = default_dir / "ner_defaults.cfg" elif pipe == "textcat": config_loc = default_dir / "textcat_defaults.cfg" + elif pipe == "senter": + config_loc = default_dir / "senter_defaults.cfg" else: raise ValueError(f"Component {pipe} currently not supported.") pipe_cfg = util.load_config(config_loc, create_objects=False) @@ -221,6 +223,8 @@ def train( config_loc = default_dir / "ner_defaults.cfg" elif pipe == "textcat": config_loc = default_dir / "textcat_defaults.cfg" + elif pipe == "senter": + config_loc = default_dir / "senter_defaults.cfg" else: raise ValueError(f"Component {pipe} currently not supported.") pipe_cfg = util.load_config(config_loc, create_objects=False) @@ -559,7 +563,7 @@ def _score_for_model(meta): mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3) if "textcat" in pipes: mean_acc.append(acc["textcat_score"]) - if "sentrec" in pipes: + if "senter" in pipes: mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3) return sum(mean_acc) / len(mean_acc) @@ -638,7 +642,7 @@ def _get_metrics(component): return ("tags_acc",) elif component == "ner": return ("ents_f", "ents_p", "ents_r", "ents_per_type") - elif component == "sentrec": + elif component == "senter": return ("sent_f", "sent_p", "sent_r") elif component == "textcat": return ("textcat_score",) @@ -665,9 +669,9 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths): elif pipe == "textcat": row_head.extend(["Textcat Loss", "Textcat"]) 
output_stats.extend(["textcat_loss", "textcat_score"]) - elif pipe == "sentrec": - row_head.extend(["Sentrec Loss", "Sent P", "Sent R", "Sent F"]) - output_stats.extend(["sentrec_loss", "sent_p", "sent_r", "sent_f"]) + elif pipe == "senter": + row_head.extend(["Senter Loss", "Sent P", "Sent R", "Sent F"]) + output_stats.extend(["senter_loss", "sent_p", "sent_r", "sent_f"]) row_head.extend(["Token %", "CPU WPS"]) output_stats.extend(["token_acc", "cpu_wps"]) @@ -693,7 +697,7 @@ def _get_progress( scores["ner_loss"] = losses.get("ner", 0.0) scores["tag_loss"] = losses.get("tagger", 0.0) scores["textcat_loss"] = losses.get("textcat", 0.0) - scores["sentrec_loss"] = losses.get("sentrec", 0.0) + scores["senter_loss"] = losses.get("senter", 0.0) scores["cpu_wps"] = cpu_wps scores["gpu_wps"] = gpu_wps or 0.0 scores.update(dev_scores) diff --git a/spacy/language.py b/spacy/language.py index 9f5f9d86a..d0077b9d2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -190,7 +190,7 @@ class Language(object): default_textcat_config, default_nel_config, default_morphologizer_config, - default_sentrec_config, + default_senter_config, default_tensorizer_config, default_tok2vec_config, ) @@ -202,7 +202,7 @@ class Language(object): "textcat": default_textcat_config(), "entity_linker": default_nel_config(), "morphologizer": default_morphologizer_config(), - "sentrec": default_sentrec_config(), + "senter": default_senter_config(), "tensorizer": default_tensorizer_config(), "tok2vec": default_tok2vec_config(), } @@ -267,8 +267,8 @@ class Language(object): return self.get_pipe("entity_linker") @property - def sentrec(self): - return self.get_pipe("sentrec") + def senter(self): + return self.get_pipe("senter") @property def matcher(self): diff --git a/spacy/ml/models/defaults/__init__.py b/spacy/ml/models/defaults/__init__.py index 9af4da87d..d5490fd16 100644 --- a/spacy/ml/models/defaults/__init__.py +++ b/spacy/ml/models/defaults/__init__.py @@ -43,13 +43,13 @@ def default_ner(): 
return util.load_config(loc, create_objects=True)["model"] -def default_sentrec_config(): - loc = Path(__file__).parent / "sentrec_defaults.cfg" +def default_senter_config(): + loc = Path(__file__).parent / "senter_defaults.cfg" return util.load_config(loc, create_objects=False) -def default_sentrec(): - loc = Path(__file__).parent / "sentrec_defaults.cfg" +def default_senter(): + loc = Path(__file__).parent / "senter_defaults.cfg" return util.load_config(loc, create_objects=True)["model"] diff --git a/spacy/ml/models/defaults/sentrec_defaults.cfg b/spacy/ml/models/defaults/senter_defaults.cfg similarity index 75% rename from spacy/ml/models/defaults/sentrec_defaults.cfg rename to spacy/ml/models/defaults/senter_defaults.cfg index a039a4533..ffa2c6ce2 100644 --- a/spacy/ml/models/defaults/sentrec_defaults.cfg +++ b/spacy/ml/models/defaults/senter_defaults.cfg @@ -2,7 +2,7 @@ @architectures = "spacy.Tagger.v1" [model.tok2vec] -@architectures = "spacy.HashCharEmbedCNN.v1" +@architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = null width = 12 depth = 1 @@ -10,5 +10,3 @@ embed_size = 2000 window_size = 1 maxout_pieces = 2 subword_features = true -nM = 64 -nC = 8 diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 4ee470606..51340ee00 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -650,7 +650,7 @@ class Tagger(Pipe): return self -@component("sentrec", assigns=["token.is_sent_start"]) +@component("senter", assigns=["token.is_sent_start"]) class SentenceRecognizer(Tagger): """Pipeline component for sentence segmentation. 
@@ -670,7 +670,7 @@ class SentenceRecognizer(Tagger): # are 0 return tuple(["I", "S"]) - def set_annotations(self, docs, batch_tag_ids, **_): + def set_annotations(self, docs, batch_tag_ids): if isinstance(docs, Doc): docs = [docs] cdef Doc doc @@ -686,24 +686,6 @@ class SentenceRecognizer(Tagger): else: doc.c[j].sent_start = -1 - def update(self, examples, drop=0., sgd=None, losses=None): - examples = Example.to_example_objects(examples) - if losses is not None and self.name not in losses: - losses[self.name] = 0. - - if not any(len(ex.doc) if ex.doc else 0 for ex in examples): - # Handle cases where there are no tokens in any docs. - return - set_dropout_rate(self.model, drop) - tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) - loss, d_tag_scores = self.get_loss(examples, tag_scores) - bp_tag_scores(d_tag_scores) - if sgd is not None: - self.model.finish_update(sgd) - - if losses is not None: - losses[self.name] += loss - def get_loss(self, examples, scores): scores = self.model.ops.flatten(scores) tag_index = range(len(self.labels)) @@ -732,9 +714,9 @@ class SentenceRecognizer(Tagger): def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): - cdef Vocab vocab = self.vocab self.set_output(len(self.labels)) self.model.initialize() + link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd @@ -742,10 +724,6 @@ class SentenceRecognizer(Tagger): def add_label(self, label, values=None): raise NotImplementedError - def use_params(self, params): - with self.model.use_params(params): - yield - def to_bytes(self, exclude=tuple(), **kwargs): serialize = {} serialize["model"] = self.model.to_bytes diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py new file mode 100644 index 000000000..7a929a6a2 --- /dev/null +++ b/spacy/tests/pipeline/test_senter.py @@ -0,0 +1,52 @@ +import pytest + +from spacy import util +from spacy.lang.en import English 
+from spacy.language import Language +from spacy.tests.util import make_tempdir + + +def test_label_types(): + nlp = Language() + nlp.add_pipe(nlp.create_pipe("senter")) + with pytest.raises(NotImplementedError): + nlp.get_pipe("senter").add_label("A") + +SENT_STARTS = [0] * 14 +SENT_STARTS[0] = 1 +SENT_STARTS[5] = 1 +SENT_STARTS[9] = 1 + +TRAIN_DATA = [ + ("I like green eggs. Eat blue ham. I like purple eggs.", {"sent_starts": SENT_STARTS}), + ("She likes purple eggs. They hate ham. You like yellow eggs.", {"sent_starts": SENT_STARTS}), +] + + +def test_overfitting_IO(): + # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly + nlp = English() + senter = nlp.create_pipe("senter") + nlp.add_pipe(senter) + optimizer = nlp.begin_training() + + for i in range(200): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + assert losses["senter"] < 0.0001 + + # test the trained model + test_text = "I like eggs. There is ham. She likes ham." + doc = nlp(test_text) + gold_sent_starts = [0] * 12 + gold_sent_starts[0] = 1 + gold_sent_starts[4] = 1 + gold_sent_starts[8] = 1 + assert gold_sent_starts == [int(t.is_sent_start) for t in doc] + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + assert gold_sent_starts == [int(t.is_sent_start) for t in doc2] diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index b1070a9e7..a3381cb2f 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -2,7 +2,7 @@ import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger -from spacy.ml.models.defaults 
import default_textcat, default_sentrec +from spacy.ml.models.defaults import default_textcat, default_senter from ..util import make_tempdir @@ -146,7 +146,7 @@ def test_serialize_pipe_exclude(en_vocab, Parser): def test_serialize_sentencerecognizer(en_vocab): - sr = SentenceRecognizer(en_vocab, default_sentrec()) + sr = SentenceRecognizer(en_vocab, default_senter()) sr_b = sr.to_bytes() - sr_d = SentenceRecognizer(en_vocab, default_sentrec()).from_bytes(sr_b) + sr_d = SentenceRecognizer(en_vocab, default_senter()).from_bytes(sr_b) assert sr.to_bytes() == sr_d.to_bytes() From 5847be6022e615cdea55ca5a7856d203254e7ddf Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 8 Mar 2020 13:23:18 +0100 Subject: [PATCH 075/187] Tok2Vec: extract-embed-encode (#5102) * avoid changing original config * fix elif structure, batch with just int crashes otherwise * tok2vec example with doc2feats, encode and embed architectures * further clean up MultiHashEmbed * further generalize Tok2Vec to work with extract-embed-encode parts * avoid initializing the charembed layer with Docs (for now ?) 
* small fixes for bilstm config (still does not run) * rename to core layer * move new configs * walk model to set nI instead of using core ref * fix senter overfitting test to be more similar to the training data (avoid flakey behaviour) --- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 2 +- .../tok2vec-ner/charembed_tok2vec.cfg | 65 ++++++ .../tok2vec-ner/multihashembed_tok2vec.cfg | 65 ++++++ spacy/language.py | 9 +- spacy/ml/_character_embed.py | 2 +- spacy/ml/models/tok2vec.py | 199 +++++++----------- spacy/ml/tok2vec.py | 0 spacy/pipeline/tok2vec.py | 7 +- spacy/tests/pipeline/test_senter.py | 12 +- spacy/util.py | 7 +- 10 files changed, 227 insertions(+), 141 deletions(-) create mode 100644 examples/experiments/tok2vec-ner/charembed_tok2vec.cfg create mode 100644 examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg delete mode 100644 spacy/ml/tok2vec.py diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 4f1a915c5..b6b4e82b6 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -62,4 +62,4 @@ width = 96 depth = 4 embed_size = 2000 subword_features = true -char_embed = false +maxout_pieces = 3 diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg new file mode 100644 index 000000000..b8219ad10 --- /dev/null +++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg @@ -0,0 +1,65 @@ +[training] +use_gpu = -1 +limit = 0 +dropout = 0.2 +patience = 10000 +eval_frequency = 200 +scores = ["ents_f"] +score_weights = {"ents_f": 1} +orth_variant_level = 0.0 +gold_preproc = true +max_length = 0 +batch_size = 25 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = null + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tok2vec.model] +@architectures 
= "spacy.Tok2Vec.v1" + +[nlp.pipeline.tok2vec.model.extract] +@architectures = "spacy.CharacterEmbed.v1" +width = 96 +nM = 64 +nC = 8 +rows = 2000 +columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] + +[nlp.pipeline.tok2vec.model.extract.features] +@architectures = "spacy.Doc2Feats.v1" +columns = ${nlp.pipeline.tok2vec.model.extract:columns} + +[nlp.pipeline.tok2vec.model.embed] +@architectures = "spacy.LayerNormalizedMaxout.v1" +width = ${nlp.pipeline.tok2vec.model.extract:width} +maxout_pieces = 4 + +[nlp.pipeline.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${nlp.pipeline.tok2vec.model.extract:width} +window_size = 1 +maxout_pieces = 2 +depth = 2 + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 6 +hidden_width = 64 +maxout_pieces = 2 + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model.extract:width} diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg new file mode 100644 index 000000000..4678a7d6b --- /dev/null +++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -0,0 +1,65 @@ +[training] +use_gpu = -1 +limit = 0 +dropout = 0.2 +patience = 10000 +eval_frequency = 200 +scores = ["ents_f"] +score_weights = {"ents_f": 1} +orth_variant_level = 0.0 +gold_preproc = true +max_length = 0 +batch_size = 25 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = null + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[nlp.pipeline.tok2vec.model.extract] +@architectures = "spacy.Doc2Feats.v1" +columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] + +[nlp.pipeline.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +columns = 
${nlp.pipeline.tok2vec.model.extract:columns} +width = 96 +rows = 2000 +use_subwords = true +pretrained_vectors = null + +[nlp.pipeline.tok2vec.model.embed.mix] +@architectures = "spacy.LayerNormalizedMaxout.v1" +width = ${nlp.pipeline.tok2vec.model.embed:width} +maxout_pieces = 3 + +[nlp.pipeline.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${nlp.pipeline.tok2vec.model.embed:width} +window_size = 1 +maxout_pieces = 3 +depth = 2 + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 6 +hidden_width = 64 +maxout_pieces = 2 + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model.embed:width} diff --git a/spacy/language.py b/spacy/language.py index d0077b9d2..20e29c829 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -337,13 +337,14 @@ class Language(object): default_config = self.defaults.get(name, None) # transform the model's config to an actual Model + factory_cfg = dict(config) model_cfg = None - if "model" in config: - model_cfg = config["model"] + if "model" in factory_cfg: + model_cfg = factory_cfg["model"] if not isinstance(model_cfg, dict): warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name)) model_cfg = None - del config["model"] + del factory_cfg["model"] if model_cfg is None and default_config is not None: warnings.warn(Warnings.W098.format(name=name)) model_cfg = default_config["model"] @@ -353,7 +354,7 @@ class Language(object): model = registry.make_from_config({"model": model_cfg}, validate=True)[ "model" ] - return factory(self, model, **config) + return factory(self, model, **factory_cfg) def add_pipe( self, component, name=None, before=None, after=None, first=None, last=None diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index b366f67c6..f4890144a 100644 --- a/spacy/ml/_character_embed.py +++ 
b/spacy/ml/_character_embed.py @@ -21,7 +21,7 @@ def init(model, X=None, Y=None): def forward(model, docs, is_train): - if not docs: + if docs is None: return [] ids = [] output = [] diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 0d33d010d..d1a98c080 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -4,7 +4,7 @@ from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM from thinc.api import residual, LayerNorm, FeatureExtractor, Mish from ... import util -from ...util import registry, make_layer +from ...util import registry from ...ml import _character_embed from ...pipeline.tok2vec import Tok2VecListener from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE @@ -23,15 +23,14 @@ def get_vocab_vectors(name): @registry.architectures.register("spacy.Tok2Vec.v1") -def Tok2Vec(config): - doc2feats = make_layer(config["@doc2feats"]) - embed = make_layer(config["@embed"]) - encode = make_layer(config["@encode"]) +def Tok2Vec(extract, embed, encode): field_size = 0 - if encode.has_attr("receptive_field"): + if encode.attrs.get("receptive_field", None): field_size = encode.attrs["receptive_field"] - tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size)) - tok2vec.attrs["cfg"] = config + with Model.define_operators({">>": chain, "|": concatenate}): + if extract.has_dim("nO"): + _set_dims(embed, "nI", extract.get_dim("nO")) + tok2vec = extract >> with_array(embed >> encode, pad=field_size) tok2vec.set_dim("nO", encode.get_dim("nO")) tok2vec.set_ref("embed", embed) tok2vec.set_ref("encode", encode) @@ -39,8 +38,7 @@ def Tok2Vec(config): @registry.architectures.register("spacy.Doc2Feats.v1") -def Doc2Feats(config): - columns = config["columns"] +def Doc2Feats(columns): return FeatureExtractor(columns) @@ -79,8 +77,8 @@ def hash_charembed_cnn( maxout_pieces, window_size, subword_features, - nM=0, - nC=0, + nM, + nC, ): # Allows using character embeddings by setting nC, nM and char_embed=True 
return build_Tok2Vec_model( @@ -100,7 +98,7 @@ def hash_charembed_cnn( @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features + pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -109,7 +107,7 @@ def hash_embed_bilstm_v1( pretrained_vectors=pretrained_vectors, bilstm_depth=depth, conv_depth=0, - maxout_pieces=0, + maxout_pieces=maxout_pieces, window_size=1, subword_features=subword_features, char_embed=False, @@ -120,7 +118,7 @@ def hash_embed_bilstm_v1( @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") def hash_char_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0 + pretrained_vectors, width, depth, embed_size, subword_features, nM, nC, maxout_pieces ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -129,7 +127,7 @@ def hash_char_embed_bilstm_v1( pretrained_vectors=pretrained_vectors, bilstm_depth=depth, conv_depth=0, - maxout_pieces=0, + maxout_pieces=maxout_pieces, window_size=1, subword_features=subword_features, char_embed=True, @@ -138,104 +136,99 @@ def hash_char_embed_bilstm_v1( ) -@registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(config): - # For backwards compatibility with models before the architecture registry, - # we have to be careful to get exactly the same model structure. One subtle - # trick is that when we define concatenation with the operator, the operator - # is actually binary associative. So when we write (a | b | c), we're actually - # getting concatenate(concatenate(a, b), c). That's why the implementation - # is a bit ugly here. 
- cols = config["columns"] - width = config["width"] - rows = config["rows"] +@registry.architectures.register("spacy.LayerNormalizedMaxout.v1") +def LayerNormalizedMaxout(width, maxout_pieces): + return Maxout( + nO=width, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ) - norm = HashEmbed(width, rows, column=cols.index("NORM")) - if config["use_subwords"]: - prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX")) - suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX")) - shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE")) - if config.get("@pretrained_vectors"): - glove = make_layer(config["@pretrained_vectors"]) - mix = make_layer(config["@mix"]) + +@registry.architectures.register("spacy.MultiHashEmbed.v1") +def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) + if use_subwords: + prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX")) + suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX")) + shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE")) + + if pretrained_vectors: + glove = StaticVectors( + vectors=pretrained_vectors.data, + nO=width, + column=columns.index(ID), + dropout=0.0, + ) with Model.define_operators({">>": chain, "|": concatenate}): - if config["use_subwords"] and config["@pretrained_vectors"]: - mix._layers[0].set_dim("nI", width * 5) - layer = uniqued( - (glove | norm | prefix | suffix | shape) >> mix, - column=cols.index("ORTH"), - ) - elif config["use_subwords"]: - mix._layers[0].set_dim("nI", width * 4) - layer = uniqued( - (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH") - ) - elif config["@pretrained_vectors"]: - mix._layers[0].set_dim("nI", width * 2) - layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH")) + if not use_subwords and not pretrained_vectors: + embed_layer = norm else: - layer = norm - 
layer.attrs["cfg"] = config - return layer + if use_subwords and pretrained_vectors: + nr_columns = 5 + concat_columns = glove | norm | prefix | suffix | shape + elif use_subwords: + nr_columns = 4 + concat_columns = norm | prefix | suffix | shape + else: + nr_columns = 2 + concat_columns = glove | norm + _set_dims(mix, "nI", width * nr_columns) + embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH")) + + return embed_layer + + +def _set_dims(model, name, value): + # Loop through the model to set a specific dimension if its unset on any layer. + for node in model.walk(): + if node.has_dim(name) is None: + node.set_dim(name, value) @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(config): - width = config["width"] - chars = config["chars"] - - chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars) - other_tables = make_layer(config["@embed_features"]) - mix = make_layer(config["@mix"]) - - model = chain(concatenate(chr_embed, other_tables), mix) - model.attrs["cfg"] = config - return model +def CharacterEmbed(columns, width, rows, nM, nC, features): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) + chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) + with Model.define_operators({">>": chain, "|": concatenate}): + embed_layer = chr_embed | features >> with_array(norm) + embed_layer.set_dim("nO", nM * nC + width) + return embed_layer @registry.architectures.register("spacy.MaxoutWindowEncoder.v1") -def MaxoutWindowEncoder(config): - nO = config["width"] - nW = config["window_size"] - nP = config["pieces"] - depth = config["depth"] - - cnn = ( - expand_window(window_size=nW), - Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True), +def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): + cnn = chain( + expand_window(window_size=window_size), + Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True), ) model 
= clone(residual(cnn), depth) - model.set_dim("nO", nO) - model.attrs["receptive_field"] = nW * depth + model.set_dim("nO", width) + model.attrs["receptive_field"] = window_size * depth return model @registry.architectures.register("spacy.MishWindowEncoder.v1") -def MishWindowEncoder(config): - nO = config["width"] - nW = config["window_size"] - depth = config["depth"] - +def MishWindowEncoder(width, window_size, depth): cnn = chain( - expand_window(window_size=nW), - Mish(nO=nO, nI=nO * ((nW * 2) + 1)), - LayerNorm(nO), + expand_window(window_size=window_size), + Mish(nO=width, nI=width * ((window_size * 2) + 1)), + LayerNorm(width), ) model = clone(residual(cnn), depth) - model.set_dim("nO", nO) + model.set_dim("nO", width) return model @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") -def TorchBiLSTMEncoder(config): +def TorchBiLSTMEncoder(width, depth): import torch.nn # TODO FIX from thinc.api import PyTorchRNNWrapper - width = config["width"] - depth = config["depth"] if depth == 0: return noop() return with_padded( @@ -243,40 +236,6 @@ def TorchBiLSTMEncoder(config): ) -# TODO: update -_EXAMPLE_CONFIG = { - "@doc2feats": { - "arch": "Doc2Feats", - "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]}, - }, - "@embed": { - "arch": "spacy.MultiHashEmbed.v1", - "config": { - "width": 96, - "rows": 2000, - "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"], - "use_subwords": True, - "@pretrained_vectors": { - "arch": "TransformedStaticVectors", - "config": { - "vectors_name": "en_vectors_web_lg.vectors", - "width": 96, - "column": 0, - }, - }, - "@mix": { - "arch": "LayerNormalizedMaxout", - "config": {"width": 96, "pieces": 3}, - }, - }, - }, - "@encode": { - "arch": "MaxoutWindowEncode", - "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3}, - }, -} - - def build_Tok2Vec_model( width, embed_size, diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py deleted file mode 100644 index 
e69de29bb..000000000 diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 2fee6881a..4623f99b0 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -131,9 +131,10 @@ class Tok2Vec(Pipe): get_examples (function): Function returning example training data. pipeline (list): The pipeline the model is part of. """ - # TODO: use examples instead ? - docs = [Doc(Vocab(), words=["hello"])] - self.model.initialize(X=docs) + # TODO: charembed does not play nicely with dim inference yet + # docs = [Doc(Vocab(), words=["hello"])] + # self.model.initialize(X=docs) + self.model.initialize() link_vectors_to_models(self.vocab) diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 7a929a6a2..411768e5f 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -36,17 +36,17 @@ def test_overfitting_IO(): assert losses["senter"] < 0.0001 # test the trained model - test_text = "I like eggs. There is ham. She likes ham." + test_text = "I like purple eggs. They eat ham. You like yellow eggs." 
doc = nlp(test_text) - gold_sent_starts = [0] * 12 + gold_sent_starts = [0] * 14 gold_sent_starts[0] = 1 - gold_sent_starts[4] = 1 - gold_sent_starts[8] = 1 - assert gold_sent_starts == [int(t.is_sent_start) for t in doc] + gold_sent_starts[5] = 1 + gold_sent_starts[9] = 1 + assert [int(t.is_sent_start) for t in doc] == gold_sent_starts # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - assert gold_sent_starts == [int(t.is_sent_start) for t in doc2] + assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts diff --git a/spacy/util.py b/spacy/util.py index 216158e52..37649c5e6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -79,11 +79,6 @@ def set_lang_class(name, cls): registry.languages.register(name, func=cls) -def make_layer(arch_config): - arch_func = registry.architectures.get(arch_config["arch"]) - return arch_func(arch_config["config"]) - - def ensure_path(path): """Ensure string is converted to a Path. 
@@ -563,7 +558,7 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len): """Create minibatches of a given number of words.""" if isinstance(size, int): size_ = itertools.repeat(size) - if isinstance(size, List): + elif isinstance(size, List): size_ = iter(size) else: size_ = size From 59000ee21dcacb091fd3493bdfe4ea57e664e110 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 13 Mar 2020 16:07:56 +0100 Subject: [PATCH 076/187] fix serialization of empty doc + unit test --- spacy/tests/regression/test_issue5141.py | 11 +++++++++++ spacy/tokens/_serialize.py | 7 +++++-- 2 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/regression/test_issue5141.py diff --git a/spacy/tests/regression/test_issue5141.py b/spacy/tests/regression/test_issue5141.py new file mode 100644 index 000000000..845454583 --- /dev/null +++ b/spacy/tests/regression/test_issue5141.py @@ -0,0 +1,11 @@ +from spacy.tokens import DocBin + + +def test_issue5141(en_vocab): + """ Ensure an empty DocBin does not crash on serialization """ + doc_bin = DocBin(attrs=["DEP", "HEAD"]) + assert list(doc_bin.get_docs(en_vocab)) == [] + doc_bin_bytes = doc_bin.to_bytes() + + doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) + assert list(doc_bin_2.get_docs(en_vocab)) == [] diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 65b70d1b3..d3f49550c 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -135,10 +135,13 @@ class DocBin(object): for tokens in self.tokens: assert len(tokens.shape) == 2, tokens.shape # this should never happen lengths = [len(tokens) for tokens in self.tokens] + tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([]) + spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) + msg = { "attrs": self.attrs, - "tokens": numpy.vstack(self.tokens).tobytes("C"), - "spaces": numpy.vstack(self.spaces).tobytes("C"), + "tokens": tokens.tobytes("C"), + "spaces": spaces.tobytes("C"), 
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "strings": list(self.strings), "cats": self.cats, From fba219f73765725afb7468c3c1b114df3e1a27f4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 16 Mar 2020 08:31:36 +0100 Subject: [PATCH 077/187] remove unnecessary itertools call --- spacy/language.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 20e29c829..6b3957deb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -778,8 +778,6 @@ class Language(object): DOCS: https://spacy.io/api/language#pipe """ - # raw_texts will be used later to stop iterator. - texts, raw_texts = itertools.tee(texts) if n_threads != -1: warnings.warn(Warnings.W016, DeprecationWarning) if n_process == -1: From 02d87a8b2b7db3cfbe2649daf87ed61450fc7fbe Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 19 Mar 2020 10:30:20 +0100 Subject: [PATCH 078/187] fix showing dep arcs in streamlit script --- examples/streamlit_spacy.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/examples/streamlit_spacy.py b/examples/streamlit_spacy.py index a2da123c2..2b527b3df 100644 --- a/examples/streamlit_spacy.py +++ b/examples/streamlit_spacy.py @@ -1,7 +1,7 @@ # coding: utf-8 """ Example of a Streamlit app for an interactive spaCy model visualizer. You can -either download the script, or point streamlit run to the raw URL of this +either download the script, or point `streamlit run` to the raw URL of this file. For more details, see https://streamlit.io. 
Installation: @@ -15,6 +15,8 @@ streamlit run streamlit_spacy.py """ from __future__ import unicode_literals +import base64 + import streamlit as st import spacy from spacy import displacy @@ -54,6 +56,14 @@ model_load_state.empty() text = st.text_area("Text to analyze", DEFAULT_TEXT) doc = process_text(spacy_model, text) + +def render_svg(svg): + """Renders the given svg string.""" + b64 = base64.b64encode(svg.encode('utf-8')).decode("utf-8") + html = r'' % b64 + st.write(html, unsafe_allow_html=True) + + if "parser" in nlp.pipe_names: st.header("Dependency Parse & Part-of-speech tags") st.sidebar.header("Dependency Parse") @@ -68,12 +78,14 @@ if "parser" in nlp.pipe_names: } docs = [span.as_doc() for span in doc.sents] if split_sents else [doc] for sent in docs: - html = displacy.render(sent, options=options) + html = displacy.render(sent, options=options, style="dep") # Double newlines seem to mess with the rendering html = html.replace("\n\n", "\n") if split_sents and len(docs) > 1: st.markdown(f"> {sent.text}") - st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) + render_svg(html) + # this didn't show the dep arc labels properly, cf #5089 + # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) if "ner" in nlp.pipe_names: st.header("Named Entities") From fcac1ace7839eb49721a4636b4f3687781d5a4ab Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 23 Mar 2020 22:55:47 +0100 Subject: [PATCH 079/187] Update macOS image on Azure Pipelines --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d34da39f7..f93dffaed 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -42,7 +42,7 @@ jobs: imageName: 'vs2017-win2016' python.version: '3.6' Python36Mac: - imageName: 'macos-10.13' + imageName: 'macos-10.14' python.version: '3.6' # Don't test on 3.7 for now to speed up builds # Python37Linux: @@ -52,7 +52,7 @@ jobs: # imageName: 'vs2017-win2016' # 
python.version: '3.7' # Python37Mac: - # imageName: 'macos-10.13' + # imageName: 'macos-10.14' # python.version: '3.7' Python38Linux: imageName: 'ubuntu-16.04' @@ -61,7 +61,7 @@ jobs: imageName: 'vs2017-win2016' python.version: '3.8' Python38Mac: - imageName: 'macos-10.13' + imageName: 'macos-10.14' python.version: '3.8' maxParallel: 4 pool: From 218e1706ac97f276f8226531c0c942ed660b953e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 25 Mar 2020 10:20:11 +0100 Subject: [PATCH 080/187] Bugfix linking vectors (#5196) * restore call to _load_vectors * bump to thinc 8.0.0a3 * bump to 3.0.0.dev4 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- spacy/about.py | 2 +- spacy/cli/train.py | 2 ++ spacy/syntax/_parser_model.pyx | 2 +- 6 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ee28d5d42..9440c2d44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==8.0.0a1", + "thinc==8.0.0a3", "blis>=0.4.0,<0.5.0" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 09998cdc9..73e595daf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==8.0.0a1 +thinc==8.0.0a3 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 7b3a468b6..d7d2be935 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,13 +36,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==8.0.0a1 + thinc==8.0.0a3 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==8.0.0a1 + thinc==8.0.0a3 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 diff --git a/spacy/about.py b/spacy/about.py index 6a3c680ab..0c0a2d002 100644 --- a/spacy/about.py +++ 
b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev3" +__version__ = "3.0.0.dev4" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 7eb9bbd3c..a40fdadb4 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -143,6 +143,7 @@ def train( ) if vectors: msg.text(f"Loading vectors from model '{vectors}'") + _load_vectors(nlp, vectors) nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline]) for pipe in pipeline: @@ -210,6 +211,7 @@ def train( if vectors: msg.text(f"Loading vectors from model '{vectors}'") + _load_vectors(nlp, vectors) for pipe in pipeline: # first, create the model. diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index e36a2a28b..4a1014a09 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -250,7 +250,7 @@ class ParserModel(Model): nI = smaller.get_dim("nI") with use_ops('numpy'): larger = Linear(nO=new_nO, nI=nI) - larger._init = smaller._init + larger.init = smaller.init # it could be that the model is not initialized yet, then skip this bit if nI: larger_W = larger.ops.alloc2f(new_nO, nI) From 70ee4ef4fdcbdb659fa84b7356c08dd910c44968 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Mar 2020 13:47:31 +0100 Subject: [PATCH 081/187] Fix small errors --- spacy/morphology.pyx | 4 ++-- spacy/tokens/doc.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 946da141d..0b53b124c 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -42,7 +42,7 @@ def _normalize_props(props): elif isinstance(key, (int, str)) and isinstance(value, (int, str)): out[key] = value else: - warnings.warn(Warnings.W028.format(feature={key: value})) + 
warnings.warn(Warnings.W029.format(feature={key: value})) return out @@ -112,7 +112,7 @@ cdef class Morphology: return tag_ptr.key features = self.feats_to_dict(features) if not isinstance(features, dict): - warnings.warn(Warnings.W028.format(feature=features)) + warnings.warn(Warnings.W029.format(feature=features)) features = {} features = _normalize_props(features) string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 6a139dd86..a6b1b171b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -781,7 +781,7 @@ cdef class Doc: attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] if array.dtype != numpy.uint64: - user_warning(Warnings.W028.format(type=array.dtype)) + warnings.warn(Warnings.W028.format(type=array.dtype)) if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) From e7341db5dc16102625d9f0f90545596145968920 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Mar 2020 14:05:40 +0100 Subject: [PATCH 082/187] Add sent_start to pattern schema --- spacy/schemas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/schemas.py b/spacy/schemas.py index 2268bf100..3b6313db8 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -119,6 +119,7 @@ class TokenPattern(BaseModel): is_currency: Optional[StrictBool] = None is_stop: Optional[StrictBool] = None is_sent_start: Optional[StrictBool] = None + sent_start: Optional[StrictBool] = None like_num: Optional[StrictBool] = None like_url: Optional[StrictBool] = None like_email: Optional[StrictBool] = None From 7453df79d166b0441becc0296de2b691dd7afa06 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Mar 2020 14:09:02 +0100 Subject: [PATCH 083/187] Fix argument --- spacy/language.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index ce9412d85..5343df4b7 100644 
--- a/spacy/language.py +++ b/spacy/language.py @@ -1150,7 +1150,7 @@ def _pipe(examples, proc, kwargs): yield ex -def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors): +def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): """Worker for Language.pipe receiver (multiprocessing.Connection): Pipe to receive text. Usually @@ -1158,7 +1158,6 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors): sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()` underscore_state (tuple): The data in the Underscore class of the parent - vectors (dict): The global vectors data, copied from the parent """ Underscore.load_state(underscore_state) while True: From f12a46472c6d5f5cf05a2576ccffe1ca82d2f37e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Mar 2020 15:18:32 +0100 Subject: [PATCH 084/187] Remove unicode declarations --- spacy/lang/eu/__init__.py | 3 --- spacy/lang/eu/examples.py | 3 --- spacy/lang/eu/lex_attrs.py | 3 --- spacy/lang/eu/punctuation.py | 3 --- spacy/lang/eu/stop_words.py | 3 --- spacy/lang/eu/tag_map.py | 3 --- spacy/lang/lij/__init__.py | 3 --- spacy/lang/lij/examples.py | 4 ---- spacy/lang/lij/punctuation.py | 3 --- spacy/lang/lij/stop_words.py | 4 ---- spacy/lang/lij/tokenizer_exceptions.py | 2 -- spacy/lang/lt/punctuation.py | 3 --- spacy/lang/ro/punctuation.py | 3 --- spacy/tests/lang/eu/test_text.py | 3 --- spacy/tests/regression/test_issue4725.py | 3 --- spacy/tests/regression/test_issue4903.py | 3 --- spacy/tests/regression/test_issue5048.py | 3 --- spacy/tests/regression/test_issue5082.py | 3 --- 18 files changed, 55 deletions(-) diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py index 4f3338c1d..352eb1548 100644 --- a/spacy/lang/eu/__init__.py +++ b/spacy/lang/eu/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS 
from .punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py index 463494abd..3b9ef71b6 100644 --- a/spacy/lang/eu/examples.py +++ b/spacy/lang/eu/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/eu/lex_attrs.py b/spacy/lang/eu/lex_attrs.py index 19b75c111..a3ab018ee 100644 --- a/spacy/lang/eu/lex_attrs.py +++ b/spacy/lang/eu/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM # Source http://mylanguages.org/basque_numbers.php diff --git a/spacy/lang/eu/punctuation.py b/spacy/lang/eu/punctuation.py index b8b1a1c83..5d35d0a25 100644 --- a/spacy/lang/eu/punctuation.py +++ b/spacy/lang/eu/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py index dda11a7fd..d213b5b81 100644 --- a/spacy/lang/eu/stop_words.py +++ b/spacy/lang/eu/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-eu # https://www.ranks.nl/stopwords/basque # https://www.mustgo.com/worldlanguages/basque/ diff --git a/spacy/lang/eu/tag_map.py b/spacy/lang/eu/tag_map.py index 2499d7e3e..e0940edb7 100644 --- a/spacy/lang/eu/tag_map.py +++ b/spacy/lang/eu/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py index 9b4b29798..a75f081bf 100644 --- a/spacy/lang/lij/__init__.py +++ b/spacy/lang/lij/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words 
import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES diff --git a/spacy/lang/lij/examples.py b/spacy/lang/lij/examples.py index c4034ae7e..ba7fe43fd 100644 --- a/spacy/lang/lij/examples.py +++ b/spacy/lang/lij/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py index 4439376c8..d50b75589 100644 --- a/spacy/lang/lij/punctuation.py +++ b/spacy/lang/lij/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_INFIXES from ..char_classes import ALPHA diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py index ffd53370d..1d6f09d27 100644 --- a/spacy/lang/lij/stop_words.py +++ b/spacy/lang/lij/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py index 2109add62..2befabca3 100644 --- a/spacy/lang/lij/tokenizer_exceptions.py +++ b/spacy/lang/lij/tokenizer_exceptions.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from ...symbols import ORTH, LEMMA _exc = {} diff --git a/spacy/lang/lt/punctuation.py b/spacy/lang/lt/punctuation.py index 5eedc8116..506aa8f32 100644 --- a/spacy/lang/lt/punctuation.py +++ b/spacy/lang/lt/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ICONS, LIST_ELLIPSES from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA from ..char_classes import HYPHENS diff --git a/spacy/lang/ro/punctuation.py b/spacy/lang/ro/punctuation.py index 87f9a1248..529e1c977 100644 --- 
a/spacy/lang/ro/punctuation.py +++ b/spacy/lang/ro/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import itertools from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY diff --git a/spacy/tests/lang/eu/test_text.py b/spacy/tests/lang/eu/test_text.py index f448a7859..94d5ac91d 100644 --- a/spacy/tests/lang/eu/test_text.py +++ b/spacy/tests/lang/eu/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py index 57675a202..624eefb2c 100644 --- a/spacy/tests/regression/test_issue4725.py +++ b/spacy/tests/regression/test_issue4725.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import numpy from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py index d467b1cd6..a3dff16aa 100644 --- a/spacy/tests/regression/test_issue4903.py +++ b/spacy/tests/regression/test_issue4903.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.tokens import Span, Doc diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py index 228322493..bc52ae82f 100644 --- a/spacy/tests/regression/test_issue5048.py +++ b/spacy/tests/regression/test_issue5048.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import numpy from spacy.tokens import Doc from spacy.attrs import DEP, POS, TAG diff --git a/spacy/tests/regression/test_issue5082.py b/spacy/tests/regression/test_issue5082.py index efa5d39f2..52a52b177 100644 --- a/spacy/tests/regression/test_issue5082.py +++ b/spacy/tests/regression/test_issue5082.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import numpy as np from spacy.lang.en import English from spacy.pipeline import 
EntityRuler From 4fe2299586227496c2cd1c1649158bb0464ab0d7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Mar 2020 20:58:13 +0100 Subject: [PATCH 085/187] xfail hanging test --- spacy/tests/regression/test_issue4725.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py index 624eefb2c..a5087f0b2 100644 --- a/spacy/tests/regression/test_issue4725.py +++ b/spacy/tests/regression/test_issue4725.py @@ -4,6 +4,7 @@ from spacy.lang.en import English from spacy.vocab import Vocab +@pytest.mark.xfail(reason="currently hangs") def test_issue4725(): # ensures that this runs correctly and doesn't hang or crash because of the global vectors vocab = Vocab(vectors_name="test_vocab_add_vector") From ee4bb0e3b6247429e05bf0e09599b98ed58c269a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Mar 2020 21:44:18 +0100 Subject: [PATCH 086/187] Fix import --- spacy/tests/regression/test_issue4725.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py index a5087f0b2..720da93e3 100644 --- a/spacy/tests/regression/test_issue4725.py +++ b/spacy/tests/regression/test_issue4725.py @@ -1,3 +1,4 @@ +import pytest import numpy from spacy.lang.en import English From 92b9b631ef2efd834cfde471a1f95fe7a3707336 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Mar 2020 10:51:32 +0100 Subject: [PATCH 087/187] xfail -> skip --- spacy/tests/regression/test_issue4725.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py index 720da93e3..ca6c3f767 100644 --- a/spacy/tests/regression/test_issue4725.py +++ b/spacy/tests/regression/test_issue4725.py @@ -5,7 +5,7 @@ from spacy.lang.en import English from spacy.vocab import Vocab -@pytest.mark.xfail(reason="currently hangs") +@pytest.mark.skip(reason="currently hangs") def 
test_issue4725(): # ensures that this runs correctly and doesn't hang or crash because of the global vectors vocab = Vocab(vectors_name="test_vocab_add_vector") From 9b412516e7ccbd3cfd9010465fc0d4220fff7fc9 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 27 Mar 2020 19:35:26 +0100 Subject: [PATCH 088/187] Fixing pickling of the parser (#5218) * fix __reduce__ for pickling parser * setting the move object as 'state' during pickling * unskip test_issue4725 - works again --- spacy/pipeline/pipes.pyx | 17 ++++++++++++++--- spacy/syntax/nn_parser.pyx | 8 +++++++- spacy/tests/regression/test_issue4725.py | 1 - website/docs/usage/saving-loading.md | 2 +- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 658de8a1f..9ea2507cb 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1173,7 +1173,13 @@ cdef class DependencyParser(Parser): tok2vec=tok2vec, sgd=sgd) def __reduce__(self): - return (DependencyParser, (self.vocab, self.moves, self.model), None, None) + return (DependencyParser, (self.vocab, self.model), self.moves) + + def __getstate__(self): + return self.moves + + def __setstate__(self, moves): + self.moves = moves @property def labels(self): @@ -1214,8 +1220,13 @@ cdef class EntityRecognizer(Parser): tok2vec=tok2vec) def __reduce__(self): - return (EntityRecognizer, (self.vocab, self.moves, self.model), - None, None) + return (EntityRecognizer, (self.vocab, self.model), self.moves) + + def __getstate__(self): + return self.moves + + def __setstate__(self, moves): + self.moves = moves @property def labels(self): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 2ba13507f..f480e3528 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -79,7 +79,13 @@ cdef class Parser: return cls(nlp.vocab, model, **cfg) def __reduce__(self): - return (Parser, (self.vocab, self.moves, self.model), None, None) + return (Parser, 
(self.vocab, self.model), self.moves) + + def __getstate__(self): + return self.moves + + def __setstate__(self, moves): + self.moves = moves @property def move_names(self): diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py index ca6c3f767..967db5d67 100644 --- a/spacy/tests/regression/test_issue4725.py +++ b/spacy/tests/regression/test_issue4725.py @@ -5,7 +5,6 @@ from spacy.lang.en import English from spacy.vocab import Vocab -@pytest.mark.skip(reason="currently hangs") def test_issue4725(): # ensures that this runs correctly and doesn't hang or crash because of the global vectors vocab = Vocab(vectors_name="test_vocab_add_vector") diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 8e2c30d82..058204a5d 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -131,7 +131,7 @@ shared vocab it depends on. If you need to pickle multiple objects, try to pickle them **together** instead of separately. For instance, instead of pickling all pipeline components, pickle the entire pipeline once. And instead of pickling several `Doc` objects -separately, pickle a list of `Doc` objects. Since the all share a reference to +separately, pickle a list of `Doc` objects. Since they all share a reference to the _same_ `Vocab` object, it will only be included once. 
```python From 1f9852abc30fd61cdfd0edc494f9ba32ae404b31 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 28 Mar 2020 23:09:35 +0100 Subject: [PATCH 089/187] Fix parser @ GPU (#5210) * ensure self.bias is numpy array in parser model * 2 more little bug fixes for parser on GPU * removing testing GPU statement * remove commented code --- spacy/ml/_layers.py | 8 ++++++-- spacy/syntax/_parser_model.pyx | 5 +---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/spacy/ml/_layers.py b/spacy/ml/_layers.py index 7e9150d8b..a752ef49a 100644 --- a/spacy/ml/_layers.py +++ b/spacy/ml/_layers.py @@ -79,7 +79,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids): # for b in range(nB): # for f in range(nF): # if ids[b, f] < 0: - # d_padding[0, f] += dY[b] + # d_pad[0, f] += dY[b] # # Which can be rewritten as: # @@ -88,9 +88,13 @@ def _backprop_precomputable_affine_padding(model, dY, ids): # # I don't know how to avoid the loop without building a whole array :(. # Cursed numpy. + # + # Note by Sofie: rewritten to longer loop because "CuPy only supports slices that consist of one boolean array." d_pad = model.ops.alloc((1, nF, nO, nP)) for b in range(nB): - d_pad[0, ids[b] < 0] += dY[b] + for f in range(nF): + if ids[b, f] < 0: + d_pad[0, f] += dY[b] return d_pad diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 4a1014a09..4f4e5e4b0 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -371,8 +371,6 @@ class ParserStepModel(Model): self.ops.scatter_add(d_tokvecs, ids, d_state_features) # Padded -- see update() - if isinstance(self.ops, CupyOps): - d_tokvecs = self.ops.to_numpy(d_tokvecs) self.bp_tokvecs(d_tokvecs[:-1]) return d_tokvecs @@ -445,8 +443,7 @@ cdef class precompute_hiddens: else: cached = gpu_cached if not isinstance(lower_model.get_param("b"), numpy.ndarray): - # self.bias = lower_model.get_param("b").get(stream=cuda_stream) ??? 
- self.bias = lower_model.get_param("b") + self.bias = lower_model.get_param("b").get(stream=cuda_stream) else: self.bias = lower_model.get_param("b") self.nF = cached.shape[1] From d6d95674c15d36afa12b819217a722a3c14a7353 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 29 Mar 2020 13:56:07 +0200 Subject: [PATCH 090/187] bugfix in span similarity (#5155) * bugfix in span similarity * also rewrite doc.pyx for clarity * formatting --- spacy/tests/regression/test_issue5152.py | 18 ++++++++++++++++++ spacy/tokens/doc.pyx | 15 ++++++++------- spacy/tokens/span.pyx | 6 ++++-- 3 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 spacy/tests/regression/test_issue5152.py diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py new file mode 100644 index 000000000..a9a57746d --- /dev/null +++ b/spacy/tests/regression/test_issue5152.py @@ -0,0 +1,18 @@ +from spacy.lang.en import English + + +def test_issue5152(): + # Test that the comparison between a Span and a Token, goes well + # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
+ nlp = English() + text = nlp("Talk about being boring!") + text_var = nlp("Talk of being boring!") + y = nlp("Let") + + span = text[0:3] # Talk about being + span_2 = text[0:3] # Talk about being + span_3 = text_var[0:3] # Talk of being + token = y[0] # Let + assert span.similarity(token) == 0.0 + assert span.similarity(span_2) == 1.0 + assert span_2.similarity(span_3) < 1.0 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index a6b1b171b..0716b2b3d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -380,13 +380,14 @@ cdef class Doc: if isinstance(other, (Lexeme, Token)) and self.length == 1: if self.c[0].lex.orth == other.orth: return 1.0 - elif isinstance(other, (Span, Doc)): - if len(self) == len(other): - for i in range(self.length): - if self[i].orth != other[i].orth: - break - else: - return 1.0 + elif isinstance(other, (Span, Doc)) and len(self) == len(other): + similar = True + for i in range(self.length): + if self[i].orth != other[i].orth: + similar = False + break + if similar: + return 1.0 if self.vocab.vectors.n_keys == 0: warnings.warn(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 53d1b9826..66e8d8c3e 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -320,11 +320,13 @@ cdef class Span: if len(self) == 1 and hasattr(other, "orth"): if self[0].orth == other.orth: return 1.0 - elif hasattr(other, "__len__") and len(self) == len(other): + elif isinstance(other, (Doc, Span)) and len(self) == len(other): + similar = True for i in range(len(self)): if self[i].orth != getattr(other[i], "orth", None): + similar = False break - else: + if similar: return 1.0 if self.vocab.vectors.n_keys == 0: warnings.warn(Warnings.W007.format(obj="Span")) From ce0e5380684fd593e2839ad1d954e1218224246c Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 29 Mar 2020 13:57:00 +0200 Subject: [PATCH 091/187] Check whether doc is 
instantiated in Example.get_gold_parses() (#5167) * Check whether doc is instantiated When creating docs to pair with gold parses, modify test to check whether a doc is unset rather than whether it contains tokens. * Restore test of evaluate on an empty doc * Set a minimal gold.orig for the scorer Without a minimal gold.orig the scorer can't evaluate empty docs. This is the v3 equivalent of #4925. --- spacy/gold.pyx | 7 +++++-- spacy/tests/regression/test_issue4924.py | 3 +-- spacy/tests/test_gold.py | 7 +++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 37d092395..a9156c1a5 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -834,7 +834,7 @@ cdef class Example: if merge: t = self.token_annotation doc = self.doc - if not self.doc: + if self.doc is None: if not vocab: raise ValueError(Errors.E998) doc = Doc(vocab, words=t.words) @@ -993,7 +993,10 @@ cdef class GoldParse: self.links = {} if links is None else dict(links) # avoid allocating memory if the doc does not contain any tokens - if self.length > 0: + if self.length == 0: + # set a minimal orig so that the scorer can score an empty doc + self.orig = TokenAnnotation(ids=[]) + else: if not words: words = [token.text for token in doc] if not tags: diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py index 1eb6afcf0..b240f6d4a 100644 --- a/spacy/tests/regression/test_issue4924.py +++ b/spacy/tests/regression/test_issue4924.py @@ -5,5 +5,4 @@ from spacy.language import Language def test_issue4924(): nlp = Language() docs_golds = [("", {})] - with pytest.raises(ValueError): - nlp.evaluate(docs_golds) + nlp.evaluate(docs_golds) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 7fe8aab73..0754fb5bc 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -480,3 +480,10 @@ def test_tuples_to_example(merged_dict): assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] 
assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"] assert ex_dict["doc_annotation"]["cats"] == cats + + +def test_empty_example_goldparse(): + nlp = English() + doc = nlp("") + example = Example(doc=doc) + assert len(example.get_gold_parses()) == 1 From 311133e579158a26f34379e44054762dac8d93fc Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 29 Mar 2020 19:40:36 +0200 Subject: [PATCH 092/187] Train textcat with config (#5143) * bring back default build_text_classifier method * remove _set_dims_ hack in favor of proper dim inference * add tok2vec initialize to unit test * small fixes * add unit test for various textcat config settings * logistic output layer does not have nO * fix window_size setting * proper fix * fix W initialization * Update textcat training example * Use ml_datasets * Convert training data to `Example` format * Use `n_texts` to set proportionate dev size * fix _init renaming on latest thinc * avoid setting a non-existing dim * update to thinc==8.0.0a2 * add BOW and CNN defaults for easy testing * various experiments with train_textcat script, fix softmax activation in textcat bow * allow textcat train script to work on other datasets as well * have dataset as a parameter * train textcat from config, with example config * add config for training textcat * formatting * fix exclusive_classes * fixing BOW for GPU * bump thinc to 8.0.0a3 (not published yet so CI will fail) * add in link_vectors_to_models which got deleted Co-authored-by: Adriane Boyd --- examples/training/train_textcat.py | 100 ++++++++++------ examples/training/train_textcat_config.cfg | 19 +++ .../{_layers.py => _precomputable_affine.py} | 0 spacy/ml/extract_ngrams.py | 20 ++-- .../models/defaults/textcat_bow_defaults.cfg | 5 + .../models/defaults/textcat_cnn_defaults.cfg | 13 ++ spacy/ml/models/defaults/textcat_defaults.cfg | 12 +- spacy/ml/models/parser.py | 2 +- spacy/ml/models/textcat.py | 112 ++++++++++++++++-- 
spacy/ml/models/tok2vec.py | 12 +- spacy/ml/spacy_vectors.py | 27 +++++ spacy/pipeline/pipes.pyx | 4 +- spacy/pipeline/tok2vec.py | 6 +- spacy/tests/pipeline/test_textcat.py | 34 +++++- spacy/tests/test_misc.py | 3 +- spacy/tests/test_tok2vec.py | 15 +-- spacy/tests/util.py | 15 +++ 17 files changed, 301 insertions(+), 98 deletions(-) create mode 100644 examples/training/train_textcat_config.cfg rename spacy/ml/{_layers.py => _precomputable_affine.py} (100%) create mode 100644 spacy/ml/models/defaults/textcat_bow_defaults.cfg create mode 100644 spacy/ml/models/defaults/textcat_cnn_defaults.cfg create mode 100644 spacy/ml/spacy_vectors.py diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 50c852ac1..dfb95b038 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -2,70 +2,71 @@ # coding: utf8 """Train a convolutional neural network text classifier on the IMDB dataset, using the TextCategorizer component. The dataset will be loaded -automatically via Thinc's built-in dataset loader. The model is added to +automatically via the package `ml_datasets`. The model is added to spacy.pipeline, and predictions are available via `doc.cats`. For more details, see the documentation: * Training: https://spacy.io/usage/training -Compatible with: spaCy v2.0.0+ +Compatible with: spaCy v3.0.0+ """ from __future__ import unicode_literals, print_function -import ml_datasets import plac import random from pathlib import Path +from ml_datasets import loaders import spacy +from spacy import util from spacy.util import minibatch, compounding +from spacy.gold import Example, GoldParse @plac.annotations( - model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), + config_path=("Path to config file", "positional", None, Path), output_dir=("Optional output directory", "option", "o", Path), n_texts=("Number of texts to train from", "option", "t", int), n_iter=("Number of training iterations", "option", "n", int), init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path), + dataset=("Dataset to train on (default: imdb)", "option", "d", str), + threshold=("Min. number of instances for a given label (default 20)", "option", "m", int) ) -def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None): +def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None, dataset="imdb", threshold=20): + if not config_path or not config_path.exists(): + raise ValueError(f"Config file not found at {config_path}") + + spacy.util.fix_random_seed() if output_dir is not None: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() - if model is not None: - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") + print(f"Loading nlp model from {config_path}") + nlp_config = util.load_config(config_path, create_objects=False)["nlp"] + nlp = util.load_model_from_config(nlp_config) - # add the text classifier to the pipeline if it doesn't exist - # nlp.create_pipe works for built-ins that are registered with spaCy + # ensure the nlp object was defined with a textcat component if "textcat" not in nlp.pipe_names: - textcat = nlp.create_pipe( - "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"} - ) - nlp.add_pipe(textcat, last=True) - # otherwise, get it, so we can add labels to it - else: - textcat = nlp.get_pipe("textcat") + raise ValueError(f"The nlp definition in the config does not contain a textcat component") - # add label to text classifier - 
textcat.add_label("POSITIVE") - textcat.add_label("NEGATIVE") + textcat = nlp.get_pipe("textcat") - # load the IMDB dataset - print("Loading IMDB data...") - (train_texts, train_cats), (dev_texts, dev_cats) = load_data() - train_texts = train_texts[:n_texts] - train_cats = train_cats[:n_texts] + # load the dataset + print(f"Loading dataset {dataset} ...") + (train_texts, train_cats), (dev_texts, dev_cats) = load_data(dataset=dataset, threshold=threshold, limit=n_texts) print( "Using {} examples ({} training, {} evaluation)".format( n_texts, len(train_texts), len(dev_texts) ) ) - train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats])) + train_examples = [] + for text, cats in zip(train_texts, train_cats): + doc = nlp.make_doc(text) + gold = GoldParse(doc, cats=cats) + for cat in cats: + textcat.add_label(cat) + ex = Example.from_gold(gold, doc=doc) + train_examples.append(ex) # get names of other pipes to disable them during training pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"] @@ -81,8 +82,8 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None for i in range(n_iter): losses = {} # batch up the examples using spaCy's minibatch - random.shuffle(train_data) - batches = minibatch(train_data, size=batch_sizes) + random.shuffle(train_examples) + batches = minibatch(train_examples, size=batch_sizes) for batch in batches: nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): @@ -97,7 +98,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None ) ) - # test the trained model + # test the trained model (only makes sense for sentiment analysis) test_text = "This movie sucked" doc = nlp(test_text) print(test_text, doc.cats) @@ -114,14 +115,39 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None print(test_text, doc2.cats) -def load_data(limit=0, split=0.8): - """Load data from the IMDB 
dataset.""" +def load_data(dataset, threshold, limit=0, split=0.8): + """Load data from the provided dataset.""" # Partition off part of the train data for evaluation - train_data, _ = ml_datasets.imdb() + data_loader = loaders.get(dataset) + train_data, _ = data_loader(limit=int(limit/split)) random.shuffle(train_data) - train_data = train_data[-limit:] texts, labels = zip(*train_data) - cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels] + + unique_labels = sorted(set([l for label_set in labels for l in label_set])) + print(f"# of unique_labels: {len(unique_labels)}") + + count_values_train = dict() + for text, annot_list in train_data: + for annot in annot_list: + count_values_train[annot] = count_values_train.get(annot, 0) + 1 + for value, count in sorted(count_values_train.items(), key=lambda item: item[1]): + if count < threshold: + unique_labels.remove(value) + + print(f"# of unique_labels after filtering with threshold {threshold}: {len(unique_labels)}") + + if unique_labels == {0, 1}: + cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels] + else: + cats = [] + for y in labels: + if isinstance(y, str): + cats.append({str(label): (label == y) for label in unique_labels}) + elif isinstance(y, set): + cats.append({str(label): (label in y) for label in unique_labels}) + else: + raise ValueError(f"Unrecognised type of labels: {type(y)}") + split = int(len(train_data) * split) return (texts[:split], cats[:split]), (texts[split:], cats[split:]) diff --git a/examples/training/train_textcat_config.cfg b/examples/training/train_textcat_config.cfg new file mode 100644 index 000000000..7c0f36b57 --- /dev/null +++ b/examples/training/train_textcat_config.cfg @@ -0,0 +1,19 @@ +[nlp] +lang = "en" + +[nlp.pipeline.textcat] +factory = "textcat" + +[nlp.pipeline.textcat.model] +@architectures = "spacy.TextCatCNN.v1" +exclusive_classes = false + +[nlp.pipeline.textcat.model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" 
+pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/_layers.py b/spacy/ml/_precomputable_affine.py similarity index 100% rename from spacy/ml/_layers.py rename to spacy/ml/_precomputable_affine.py diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py index d4195b9a4..f9f691aae 100644 --- a/spacy/ml/extract_ngrams.py +++ b/spacy/ml/extract_ngrams.py @@ -11,26 +11,26 @@ def extract_ngrams(ngram_size, attr=LOWER) -> Model: return model -def forward(self, docs, is_train: bool): +def forward(model, docs, is_train: bool): batch_keys = [] batch_vals = [] for doc in docs: - unigrams = doc.to_array([self.attrs["attr"]]) + unigrams = model.ops.asarray(doc.to_array([model.attrs["attr"]])) ngrams = [unigrams] - for n in range(2, self.attrs["ngram_size"] + 1): - ngrams.append(self.ops.ngrams(n, unigrams)) - keys = self.ops.xp.concatenate(ngrams) - keys, vals = self.ops.xp.unique(keys, return_counts=True) + for n in range(2, model.attrs["ngram_size"] + 1): + ngrams.append(model.ops.ngrams(n, unigrams)) + keys = model.ops.xp.concatenate(ngrams) + keys, vals = model.ops.xp.unique(keys, return_counts=True) batch_keys.append(keys) batch_vals.append(vals) # The dtype here matches what thinc is expecting -- which differs per # platform (by int definition). This should be fixed once the problem # is fixed on Thinc's side. 
- lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_) - batch_keys = self.ops.xp.concatenate(batch_keys) - batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f") + lengths = model.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_) + batch_keys = model.ops.xp.concatenate(batch_keys) + batch_vals = model.ops.asarray(model.ops.xp.concatenate(batch_vals), dtype="f") def backprop(dY): - return dY + return [] return (batch_keys, batch_vals, lengths), backprop diff --git a/spacy/ml/models/defaults/textcat_bow_defaults.cfg b/spacy/ml/models/defaults/textcat_bow_defaults.cfg new file mode 100644 index 000000000..84472ea10 --- /dev/null +++ b/spacy/ml/models/defaults/textcat_bow_defaults.cfg @@ -0,0 +1,5 @@ +[model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size: 1 +no_output_layer: false diff --git a/spacy/ml/models/defaults/textcat_cnn_defaults.cfg b/spacy/ml/models/defaults/textcat_cnn_defaults.cfg new file mode 100644 index 000000000..cea1bfe54 --- /dev/null +++ b/spacy/ml/models/defaults/textcat_cnn_defaults.cfg @@ -0,0 +1,13 @@ +[model] +@architectures = "spacy.TextCatCNN.v1" +exclusive_classes = false + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/defaults/textcat_defaults.cfg b/spacy/ml/models/defaults/textcat_defaults.cfg index cea1bfe54..9477b2995 100644 --- a/spacy/ml/models/defaults/textcat_defaults.cfg +++ b/spacy/ml/models/defaults/textcat_defaults.cfg @@ -1,13 +1,9 @@ [model] -@architectures = "spacy.TextCatCNN.v1" +@architectures = "spacy.TextCat.v1" exclusive_classes = false - -[model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = null -width = 96 -depth = 4 +width = 64 +conv_depth = 2 embed_size = 2000 window_size = 1 -maxout_pieces = 3 -subword_features = true 
+ngram_size = 1 diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index d2de10a0e..f2d51c2ba 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -2,7 +2,7 @@ from pydantic import StrictInt from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from ...util import registry -from .._layers import PrecomputableAffine +from .._precomputable_affine import PrecomputableAffine from ...syntax._parser_model import ParserModel diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 49679c8cd..ce31d058c 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,7 +1,11 @@ -from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic -from thinc.api import SparseLinear, Softmax +from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention +from thinc.api import chain, concatenate, clone, Dropout +from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window +from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor -from ...attrs import ORTH +from ..spacy_vectors import SpacyVectors +from ... 
import util +from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams @@ -20,7 +24,6 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None): model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer model.set_ref("output_layer", output_layer) else: - # TODO: experiment with init_w=zero_init linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO")) model = ( tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() @@ -33,13 +36,100 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None): @registry.architectures.register("spacy.TextCatBOW.v1") def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None): - # Note: original defaults were ngram_size=1 and no_output_layer=False with Model.define_operators({">>": chain}): - model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nO) - model.to_cpu() + sparse_linear = SparseLinear(nO) + model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear + model = with_cpu(model, model.ops) if not no_output_layer: - output_layer = Softmax(nO) if exclusive_classes else Logistic(nO) - output_layer.to_cpu() - model = model >> output_layer - model.set_ref("output_layer", output_layer) + output_layer = softmax_activation() if exclusive_classes else Logistic() + model = model >> with_cpu(output_layer, output_layer.ops) + model.set_ref("output_layer", sparse_linear) + return model + + +@registry.architectures.register("spacy.TextCat.v1") +def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, + window_size, conv_depth, nO=None): + cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] + with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): + lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER)) + prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX)) + suffix = 
HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX)) + shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE)) + + width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) + trained_vectors = FeatureExtractor(cols) >> with_array( + uniqued( + (lower | prefix | suffix | shape) + >> Maxout(nO=width, nI=width_nI, normalize=True), + column=cols.index(ORTH), + ) + ) + + if pretrained_vectors: + nlp = util.load_model(pretrained_vectors) + vectors = nlp.vocab.vectors + vector_dim = vectors.data.shape[1] + + static_vectors = SpacyVectors(vectors) >> with_array( + Linear(width, vector_dim) + ) + vector_layer = trained_vectors | static_vectors + vectors_width = width * 2 + else: + vector_layer = trained_vectors + vectors_width = width + tok2vec = vector_layer >> with_array( + Maxout(width, vectors_width, normalize=True) + >> residual((expand_window(window_size=window_size) + >> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth, + pad=conv_depth, + ) + cnn_model = ( + tok2vec + >> list2ragged() + >> ParametricAttention(width) + >> reduce_sum() + >> residual(Maxout(nO=width, nI=width)) + >> Linear(nO=nO, nI=width) + >> Dropout(0.0) + ) + + linear_model = build_bow_text_classifier( + nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False + ) + nO_double = nO*2 if nO else None + if exclusive_classes: + output_layer = Softmax(nO=nO, nI=nO_double) + else: + output_layer = ( + Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() + ) + model = (linear_model | cnn_model) >> output_layer + model.set_ref("tok2vec", tok2vec) + if model.has_dim("nO") is not False: + model.set_dim("nO", nO) + model.set_ref("output_layer", linear_model.get_ref("output_layer")) + return model + + +@registry.architectures.register("spacy.TextCatLowData.v1") +def build_text_classifier_lowdata(width, pretrained_vectors, nO=None): + nlp = util.load_model(pretrained_vectors) + vectors 
= nlp.vocab.vectors + vector_dim = vectors.data.shape[1] + + # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" + with Model.define_operators({">>": chain, "**": clone}): + model = ( + SpacyVectors(vectors) + >> list2ragged() + >> with_ragged(0, Linear(width, vector_dim)) + >> ParametricAttention(width) + >> reduce_sum() + >> residual(Relu(width, width)) ** 2 + >> Linear(nO, width) + >> Dropout(0.0) + >> Logistic() + ) return model diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index d1a98c080..81820e56b 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -28,8 +28,6 @@ def Tok2Vec(extract, embed, encode): if encode.attrs.get("receptive_field", None): field_size = encode.attrs["receptive_field"] with Model.define_operators({">>": chain, "|": concatenate}): - if extract.has_dim("nO"): - _set_dims(embed, "nI", extract.get_dim("nO")) tok2vec = extract >> with_array(embed >> encode, pad=field_size) tok2vec.set_dim("nO", encode.get_dim("nO")) tok2vec.set_ref("embed", embed) @@ -176,18 +174,11 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): nr_columns = 2 concat_columns = glove | norm - _set_dims(mix, "nI", width * nr_columns) embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH")) return embed_layer -def _set_dims(model, name, value): - # Loop through the model to set a specific dimension if its unset on any layer. 
- for node in model.walk(): - if node.has_dim(name) is None: - node.set_dim(name, value) - @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(columns, width, rows, nM, nC, features): norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) @@ -344,6 +335,7 @@ def build_Tok2Vec_model( tok2vec = tok2vec >> PyTorchLSTM( nO=width, nI=width, depth=bilstm_depth, bi=True ) - tok2vec.set_dim("nO", width) + if tok2vec.has_dim("nO") is not False: + tok2vec.set_dim("nO", width) tok2vec.set_ref("embed", embed) return tok2vec diff --git a/spacy/ml/spacy_vectors.py b/spacy/ml/spacy_vectors.py new file mode 100644 index 000000000..2a4988494 --- /dev/null +++ b/spacy/ml/spacy_vectors.py @@ -0,0 +1,27 @@ +import numpy +from thinc.api import Model, Unserializable + + +def SpacyVectors(vectors) -> Model: + attrs = {"vectors": Unserializable(vectors)} + model = Model("spacy_vectors", forward, attrs=attrs) + return model + + +def forward(model, docs, is_train: bool): + batch = [] + vectors = model.attrs["vectors"].obj + for doc in docs: + indices = numpy.zeros((len(doc),), dtype="i") + for i, word in enumerate(doc): + if word.orth in vectors.key2row: + indices[i] = vectors.key2row[word.orth] + else: + indices[i] = 0 + batch_vectors = vectors.data[indices] + batch.append(batch_vectors) + + def backprop(dY): + return None + + return batch, backprop diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 9ea2507cb..296ad5089 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -148,7 +148,8 @@ class Pipe(object): return sgd def set_output(self, nO): - self.model.set_dim("nO", nO) + if self.model.has_dim("nO") is not False: + self.model.set_dim("nO", nO) if self.model.has_ref("output_layer"): self.model.get_ref("output_layer").set_dim("nO", nO) @@ -1133,6 +1134,7 @@ class TextCategorizer(Pipe): docs = [Doc(Vocab(), words=["hello"])] truths, _ = self._examples_to_truth(examples) self.set_output(len(self.labels)) + 
link_vectors_to_models(self.vocab) self.model.initialize(X=docs, Y=truths) if sgd is None: sgd = self.create_optimizer() diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 4623f99b0..ef744a5da 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -131,10 +131,8 @@ class Tok2Vec(Pipe): get_examples (function): Function returning example training data. pipeline (list): The pipeline the model is part of. """ - # TODO: charembed does not play nicely with dim inference yet - # docs = [Doc(Vocab(), words=["hello"])] - # self.model.initialize(X=docs) - self.model.initialize() + docs = [Doc(Vocab(), words=["hello"])] + self.model.initialize(X=docs) link_vectors_to_models(self.vocab) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 1b5ca9a4c..38c980428 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -6,10 +6,12 @@ from spacy import util from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer -from spacy.tests.util import make_tempdir from spacy.tokens import Doc from spacy.gold import GoldParse +from ..util import make_tempdir +from ...ml.models.defaults import default_tok2vec + TRAIN_DATA = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), @@ -109,3 +111,33 @@ def test_overfitting_IO(): cats2 = doc2.cats assert cats2["POSITIVE"] > 0.9 assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1) + + +# fmt: off +@pytest.mark.parametrize( + "textcat_config", + [ + {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}, + {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, + {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": 
True}, + {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3}, + {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": True}, + {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": False}, + ], +) +# fmt: on +def test_textcat_configs(textcat_config): + pipe_config = {"model": textcat_config} + nlp = English() + textcat = nlp.create_pipe("textcat", pipe_config) + for _, annotations in TRAIN_DATA: + for label, value in annotations.get("cats").items(): + textcat.add_label(label) + nlp.add_pipe(textcat) + optimizer = nlp.begin_training() + for i in range(5): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 6d4e75a31..1200407d7 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -4,8 +4,7 @@ import ctypes from pathlib import Path from spacy import util from spacy import prefer_gpu, require_gpu -from spacy.ml._layers import PrecomputableAffine -from spacy.ml._layers import _backprop_precomputable_affine_padding +from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding @pytest.fixture diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index e1ad1f0fc..9c2e9004b 100644 --- a/spacy/tests/test_tok2vec.py +++ 
b/spacy/tests/test_tok2vec.py @@ -4,18 +4,7 @@ from spacy.ml.models.tok2vec import build_Tok2Vec_model from spacy.vocab import Vocab from spacy.tokens import Doc - -def get_batch(batch_size): - vocab = Vocab() - docs = [] - start = 0 - for size in range(1, batch_size + 1): - # Make the words numbers, so that they're distinct - # across the batch, and easy to track. - numbers = [str(i) for i in range(start, start + size)] - docs.append(Doc(vocab, words=numbers)) - start += size - return docs +from .util import get_batch # This fails in Thinc v7.3.1. Need to push patch @@ -75,7 +64,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): def test_tok2vec_configs(tok2vec_config): docs = get_batch(3) tok2vec = build_Tok2Vec_model(**tok2vec_config) - tok2vec.initialize() + tok2vec.initialize(docs) vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"]) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 958d51e11..e29342268 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -9,6 +9,8 @@ from spacy import Errors from spacy.tokens import Doc, Span from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA +from spacy.vocab import Vocab + @contextlib.contextmanager def make_tempfile(mode="r"): @@ -77,6 +79,19 @@ def get_doc( return doc +def get_batch(batch_size): + vocab = Vocab() + docs = [] + start = 0 + for size in range(1, batch_size + 1): + # Make the words numbers, so that they're distinct + # across the batch, and easy to track. 
+ numbers = [str(i) for i in range(start, start + size)] + docs.append(Doc(vocab, words=numbers)) + start += size + return docs + + def apply_transition_sequence(parser, doc, sequence): """Perform a series of pre-specified transitions, to put the parser in a desired state.""" From ab59f3124eca47ada6955b7954c04df14d5f5b9f Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 2 Apr 2020 10:32:52 +0200 Subject: [PATCH 093/187] fix NEL overfitting test for GPU (#5236) --- spacy/pipeline/pipes.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 296ad5089..1a0812442 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1456,7 +1456,7 @@ class EntityLinker(Pipe): scores = prior_probs + sims - (prior_probs*sims) # TODO: thresholding - best_index = scores.argmax() + best_index = scores.argmax().item() best_candidate = candidates[best_index] final_kb_ids.append(best_candidate.entity_) final_tensors.append(sentence_encoding) From b71a11ff6dd7b47582fbffd45121c05ff3b89977 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 2 Apr 2020 14:46:32 +0200 Subject: [PATCH 094/187] Update morphologizer (#5108) * Add pos and morph scoring to Scorer Add pos, morph, and morph_per_type to `Scorer`. Report pos and morph accuracy in `spacy evaluate`. * Update morphologizer for v3 * switch to tagger-based morphologizer * use `spacy.HashCharEmbedCNN` for morphologizer defaults * add `Doc.is_morphed` flag * Add morphologizer to train CLI * Add basic morphologizer pipeline tests * Add simple morphologizer training example * Remove subword_features from CharEmbed models Remove `subword_features` argument from `spacy.HashCharEmbedCNN.v1` and `spacy.HashCharEmbedBiLSTM.v1` since in these cases `subword_features` is always `False`. * Rename setting in morphologizer example Use `with_pos_tags` instead of `without_pos_tags`. 
* Fix kwargs for spacy.HashCharEmbedBiLSTM.v1 * Remove defaults for spacy.HashCharEmbedBiLSTM.v1 Remove default `nM/nC` for `spacy.HashCharEmbedBiLSTM.v1`. * Set random seed for textcat overfitting test --- examples/training/train_morphologizer.py | 133 ++++++++++ spacy/cli/evaluate.py | 4 +- spacy/cli/train.py | 14 +- .../defaults/morphologizer_defaults.cfg | 1 - spacy/ml/models/tok2vec.py | 7 +- spacy/pipeline/morphologizer.pyx | 237 +++++++++--------- spacy/scorer.py | 63 ++++- spacy/tests/pipeline/test_morphologizer.py | 49 ++++ spacy/tests/pipeline/test_textcat.py | 2 + spacy/tests/test_scorer.py | 75 ++++++ spacy/tokens/doc.pxd | 1 + 11 files changed, 458 insertions(+), 128 deletions(-) create mode 100644 examples/training/train_morphologizer.py create mode 100644 spacy/tests/pipeline/test_morphologizer.py diff --git a/examples/training/train_morphologizer.py b/examples/training/train_morphologizer.py new file mode 100644 index 000000000..aec114de7 --- /dev/null +++ b/examples/training/train_morphologizer.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +# coding: utf8 +""" +A simple example for training a morphologizer. For more details, see +the documentation: +* Training: https://spacy.io/usage/training + +Compatible with: spaCy v3.0.0+ +Last tested with: v3.0.0 +""" +from __future__ import unicode_literals, print_function + +import plac +import random +from pathlib import Path +import spacy +from spacy.util import minibatch, compounding +from spacy.morphology import Morphology + + +# Usually you'll read this in, of course. Data formats vary. Ensure your +# strings are unicode and that the number of tags assigned matches spaCy's +# tokenization. 
If not, you can always add a 'words' key to the annotations +# that specifies the gold-standard tokenization, e.g.: +# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']}) +TRAIN_DATA = [ + ( + "I like green eggs", + { + "morphs": [ + "PronType=Prs|Person=1", + "VerbForm=Fin", + "Degree=Pos", + "Number=Plur", + ], + "pos": ["PRON", "VERB", "ADJ", "NOUN"], + }, + ), + ( + "Eat blue ham", + { + "morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"], + "pos": ["VERB", "ADJ", "NOUN"], + }, + ), + ( + "She was blue", + { + "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"], + "pos": ["PRON", "VERB", "ADJ"], + }, + ), + ( + "He was blue today", + { + "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""], + "pos": ["PRON", "VERB", "ADJ", "ADV"], + }, + ), +] + +# The POS tags are optional, set `with_pos_tags = False` to omit them for +# this example: +with_pos_tags = True + +if not with_pos_tags: + for i in range(len(TRAIN_DATA)): + del TRAIN_DATA[i][1]["pos"] + + +@plac.annotations( + lang=("ISO Code of language to use", "option", "l", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int), +) +def main(lang="en", output_dir=None, n_iter=25): + """Create a new model, set up the pipeline and train the tagger. In order to + train the tagger with a custom tag map, we're creating a new Language + instance with a custom vocab. 
+ """ + nlp = spacy.blank(lang) + # add the tagger to the pipeline + # nlp.create_pipe works for built-ins that are registered with spaCy + morphologizer = nlp.create_pipe("morphologizer") + nlp.add_pipe(morphologizer) + + # add labels + for _, annotations in TRAIN_DATA: + morph_labels = annotations.get("morphs") + pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs"))) + assert len(morph_labels) == len(pos_labels) + for morph, pos in zip(morph_labels, pos_labels): + morph_dict = Morphology.feats_to_dict(morph) + if pos: + morph_dict["POS"] = pos + morph = Morphology.dict_to_feats(morph_dict) + morphologizer.add_label(morph) + + optimizer = nlp.begin_training() + for i in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + # batch up the examples using spaCy's minibatch + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + print("Losses", losses) + + # test the trained model + test_text = "I like blue eggs" + doc = nlp(test_text) + print("Morphs", [(t.text, t.morph) for t in doc]) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the save model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + doc = nlp2(test_text) + print("Morphs", [(t.text, t.morph) for t in doc]) + + +if __name__ == "__main__": + plac.call(main) + +# Expected output: +# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)] diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index e047f1283..94813e732 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -43,7 +43,9 @@ def evaluate( "Words": nwords, "Words/s": f"{nwords / (end - begin):.0f}", "TOK": f"{scorer.token_acc:.2f}", - "POS": 
f"{scorer.tags_acc:.2f}", + "TAG": f"{scorer.tags_acc:.2f}", + "POS": f"{scorer.pos_acc:.2f}", + "MORPH": f"{scorer.morphs_acc:.2f}", "UAS": f"{scorer.uas:.2f}", "LAS": f"{scorer.las:.2f}", "NER P": f"{scorer.ents_p:.2f}", diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 56020e4ff..5fa09da78 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -221,6 +221,8 @@ def train( config_loc = default_dir / "parser_defaults.cfg" elif pipe == "tagger": config_loc = default_dir / "tagger_defaults.cfg" + elif pipe == "morphologizer": + config_loc = default_dir / "morphologizer_defaults.cfg" elif pipe == "ner": config_loc = default_dir / "ner_defaults.cfg" elif pipe == "textcat": @@ -590,6 +592,8 @@ def _score_for_model(meta): acc = meta["accuracy"] if "tagger" in pipes: mean_acc.append(acc["tags_acc"]) + if "morphologizer" in pipes: + mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2) if "parser" in pipes: mean_acc.append((acc["uas"] + acc["las"]) / 2) if "ner" in pipes: @@ -672,13 +676,15 @@ def _find_best(experiment_dir, component): def _get_metrics(component): if component == "parser": - return ("las", "uas", "las_per_type", "token_acc", "sent_f") + return ("las", "uas", "las_per_type", "sent_f", "token_acc") elif component == "tagger": return ("tags_acc", "token_acc") + elif component == "morphologizer": + return ("morphs_acc", "pos_acc", "token_acc") elif component == "ner": return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc") elif component == "senter": - return ("sent_f", "sent_p", "sent_r") + return ("sent_f", "sent_p", "sent_r", "token_acc") elif component == "textcat": return ("textcat_score", "token_acc") return ("token_acc",) @@ -691,6 +697,9 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths): if pipe == "tagger": row_head.extend(["Tag Loss ", " Tag % "]) output_stats.extend(["tag_loss", "tags_acc"]) + elif pipe == "morphologizer" or pipe == "morphologizertagger": + row_head.extend(["Morph Loss ", " Morph % 
", " POS % "]) + output_stats.extend(["morph_loss", "morphs_acc", "pos_acc"]) elif pipe == "parser": row_head.extend( ["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"] @@ -731,6 +740,7 @@ def _get_progress( scores["dep_loss"] = losses.get("parser", 0.0) scores["ner_loss"] = losses.get("ner", 0.0) scores["tag_loss"] = losses.get("tagger", 0.0) + scores["morph_loss"] = losses.get("morphologizer", 0.0) scores["textcat_loss"] = losses.get("textcat", 0.0) scores["senter_loss"] = losses.get("senter", 0.0) scores["cpu_wps"] = cpu_wps diff --git a/spacy/ml/models/defaults/morphologizer_defaults.cfg b/spacy/ml/models/defaults/morphologizer_defaults.cfg index 80e776c4f..150eca507 100644 --- a/spacy/ml/models/defaults/morphologizer_defaults.cfg +++ b/spacy/ml/models/defaults/morphologizer_defaults.cfg @@ -9,6 +9,5 @@ depth = 4 embed_size = 7000 window_size = 1 maxout_pieces = 3 -subword_features = true nM = 64 nC = 8 diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 81820e56b..a2e8f589a 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -74,7 +74,6 @@ def hash_charembed_cnn( embed_size, maxout_pieces, window_size, - subword_features, nM, nC, ): @@ -87,7 +86,7 @@ def hash_charembed_cnn( bilstm_depth=0, maxout_pieces=maxout_pieces, window_size=window_size, - subword_features=subword_features, + subword_features=False, char_embed=True, nM=nM, nC=nC, @@ -116,7 +115,7 @@ def hash_embed_bilstm_v1( @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") def hash_char_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, nM, nC, maxout_pieces + pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -127,7 +126,7 @@ def hash_char_embed_bilstm_v1( conv_depth=0, maxout_pieces=maxout_pieces, window_size=1, - subword_features=subword_features, + subword_features=False, 
char_embed=True, nM=nM, nC=nC, diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index be9b166bf..7a2bc3b17 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,166 +1,169 @@ +# cython: infer_types=True, profile=True cimport numpy as np import numpy -from collections import defaultdict -from thinc.api import chain, list2array, to_categorical, get_array_module -from thinc.util import copy_array +import srsly +from thinc.api import to_categorical from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology +from ..parts_of_speech import IDS as POS_IDS +from ..symbols import POS from .. import util from ..language import component from ..util import link_vectors_to_models, create_default_optimizer from ..errors import Errors, TempErrors -from .pipes import Pipe +from .pipes import Tagger, _load_cfg +from .. import util @component("morphologizer", assigns=["token.morph", "token.pos"]) -class Morphologizer(Pipe): +class Morphologizer(Tagger): def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model + self._rehearsal_model = None self.cfg = dict(sorted(cfg.items())) - self._class_map = self.vocab.morphology.create_class_map() # Morphology.create_class_map() ? 
+ self.cfg.setdefault("labels", {}) + self.cfg.setdefault("morph_pos", {}) @property def labels(self): - return self.vocab.morphology.tag_names + return tuple(self.cfg["labels"].keys()) - @property - def tok2vec(self): - if self.model in (None, True, False): - return None - else: - return chain(self.model.get_ref("tok2vec"), list2array()) - - def __call__(self, doc): - features, tokvecs = self.predict([doc]) - self.set_annotations([doc], features, tensors=tokvecs) - return doc - - def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in util.minibatch(stream, size=batch_size): - docs = list(docs) - features, tokvecs = self.predict(docs) - self.set_annotations(docs, features, tensors=tokvecs) - yield from docs + def add_label(self, label): + if not isinstance(label, str): + raise ValueError(Errors.E187) + if label in self.labels: + return 0 + morph = Morphology.feats_to_dict(label) + norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)] + pos = morph.get("POS", "") + if norm_morph_pos not in self.cfg["labels"]: + self.cfg["labels"][norm_morph_pos] = norm_morph_pos + self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos] + return 1 def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): + for example in get_examples(): + for i, morph in enumerate(example.token_annotation.morphs): + pos = example.token_annotation.get_pos(i) + morph = Morphology.feats_to_dict(morph) + norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)] + if pos: + morph["POS"] = pos + norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)] + if norm_morph_pos not in self.cfg["labels"]: + self.cfg["labels"][norm_morph_pos] = norm_morph + self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos] self.set_output(len(self.labels)) self.model.initialize() + link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd - def predict(self, docs): - if not any(len(doc) for doc in docs): - # Handle case 
where there are no tokens in any docs. - n_labels = self.model.get_dim("nO") - guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] - tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO"))) - return guesses, tokvecs - tokvecs = self.model.get_ref("tok2vec")(docs) - scores = self.model.get_ref("softmax")(tokvecs) - return scores, tokvecs - - def set_annotations(self, docs, batch_scores, tensors=None): + def set_annotations(self, docs, batch_tag_ids): if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef Vocab vocab = self.vocab - offsets = [self._class_map.get_field_offset(field) - for field in self._class_map.fields] for i, doc in enumerate(docs): - doc_scores = batch_scores[i] - doc_guesses = scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"]) - # Convert the neuron indices into feature IDs. - doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i') - for j in range(len(doc)): - for k, offset in enumerate(offsets): - if doc_guesses[j, k] == 0: - doc_feat_ids[j, k] = 0 - else: - doc_feat_ids[j, k] = offset + doc_guesses[j, k] - # Get the set of feature names. - feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]} - if "NIL" in feats: - feats.remove("NIL") - # Now add the analysis, and set the hash. - doc.c[j].morph = self.vocab.morphology.add(feats) - if doc[j].morph.pos != 0: - doc.c[j].pos = doc[j].morph.pos + doc_tag_ids = batch_tag_ids[i] + if hasattr(doc_tag_ids, "get"): + doc_tag_ids = doc_tag_ids.get() + for j, tag_id in enumerate(doc_tag_ids): + morph = self.labels[tag_id] + doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph]) + doc.c[j].pos = self.cfg["morph_pos"][morph] - def update(self, examples, drop=0., sgd=None, losses=None): - if losses is not None and self.name not in losses: - losses[self.name] = 0. 
- - docs = [self._get_doc(ex) for ex in examples] - tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) - loss, d_tag_scores = self.get_loss(examples, tag_scores) - bp_tag_scores(d_tag_scores, sgd=sgd) - - if losses is not None: - losses[self.name] += loss + doc.is_morphed = True def get_loss(self, examples, scores): - guesses = [] - for doc_scores in scores: - guesses.append(scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"])) - guesses = self.model.ops.xp.vstack(guesses) - scores = self.model.ops.xp.vstack(scores) - if not isinstance(scores, numpy.ndarray): - scores = scores.get() - if not isinstance(guesses, numpy.ndarray): - guesses = guesses.get() + scores = self.model.ops.flatten(scores) + tag_index = {tag: i for i, tag in enumerate(self.labels)} cdef int idx = 0 - # Do this on CPU, as we can't vectorize easily. - target = numpy.zeros(scores.shape, dtype='f') - field_sizes = self.model.get_ref("softmax").attrs["nOs"] - for example in examples: - doc = example.doc - gold = example.gold - for t, features in enumerate(gold.morphology): - if features is None: - target[idx] = scores[idx] + correct = numpy.zeros((scores.shape[0],), dtype="i") + guesses = scores.argmax(axis=1) + known_labels = numpy.ones((scores.shape[0], 1), dtype="f") + for ex in examples: + gold = ex.gold + for i in range(len(gold.morphs)): + pos = gold.pos[i] if i < len(gold.pos) else "" + morph = gold.morphs[i] + feats = Morphology.feats_to_dict(morph) + if pos: + feats["POS"] = pos + if len(feats) > 0: + morph = self.vocab.strings[self.vocab.morphology.add(feats)] + if morph == "": + morph = Morphology.EMPTY_MORPH + if morph is None: + correct[idx] = guesses[idx] + elif morph in tag_index: + correct[idx] = tag_index[morph] else: - gold_fields = {} - for feature in features: - field = self._class_map.feat2field[feature] - gold_fields[field] = self._class_map.feat2offset[feature] - for field in self._class_map.fields: - field_id = 
self._class_map.field2id[field] - col_offset = self._class_map.field2col[field] - if field_id in gold_fields: - target[idx, col_offset + gold_fields[field_id]] = 1. - else: - target[idx, col_offset] = 1. - #print(doc[t]) - #for col, info in enumerate(self._class_map.col2info): - # print(col, info, scores[idx, col], target[idx, col]) + correct[idx] = 0 + known_labels[idx] = 0. idx += 1 - target = self.model.ops.asarray(target, dtype='f') - scores = self.model.ops.asarray(scores, dtype='f') - d_scores = scores - target + correct = self.model.ops.xp.array(correct, dtype="i") + d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) + d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() - docs = [self._get_doc(ex) for ex in examples] + docs = [ex.doc for ex in examples] d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores - def use_params(self, params): - with self.model.use_params(params): - yield + def to_bytes(self, exclude=tuple(), **kwargs): + serialize = {} + serialize["model"] = self.model.to_bytes + serialize["vocab"] = self.vocab.to_bytes + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + return util.to_bytes(serialize, exclude) -def scores_to_guesses(scores, out_sizes): - xp = get_array_module(scores) - guesses = xp.zeros((scores.shape[0], len(out_sizes)), dtype='i') - offset = 0 - for i, size in enumerate(out_sizes): - slice_ = scores[:, offset : offset + size] - col_guesses = slice_.argmax(axis=1) - guesses[:, i] = col_guesses - offset += size - return guesses + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: load_model(b), + } + 
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + serialize = { + "vocab": lambda p: self.vocab.to_disk(p), + "model": lambda p: p.open("wb").write(self.model.to_bytes()), + "cfg": lambda p: srsly.write_json(p, self.cfg), + } + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple(), **kwargs): + def load_model(p): + with p.open("rb") as file_: + try: + self.model.from_bytes(file_.read()) + except AttributeError: + raise ValueError(Errors.E149) + + deserialize = { + "vocab": lambda p: self.vocab.from_disk(p), + "cfg": lambda p: self.cfg.update(_load_cfg(p)), + "model": load_model, + } + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_disk(path, deserialize, exclude) + return self diff --git a/spacy/scorer.py b/spacy/scorer.py index 82b10a77d..7e2466be7 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -81,6 +81,9 @@ class Scorer(object): self.labelled = PRFScore() self.labelled_per_dep = dict() self.tags = PRFScore() + self.pos = PRFScore() + self.morphs = PRFScore() + self.morphs_per_feat = dict() self.sent_starts = PRFScore() self.ner = PRFScore() self.ner_per_ents = dict() @@ -111,6 +114,29 @@ class Scorer(object): """ return self.tags.fscore * 100 + @property + def pos_acc(self): + """RETURNS (float): Part-of-speech tag accuracy (coarse grained pos, + i.e. `Token.pos`). + """ + return self.pos.fscore * 100 + + @property + def morphs_acc(self): + """RETURNS (float): Morph tag accuracy (morphological features, + i.e. `Token.morph`). + """ + return self.morphs.fscore * 100 + + @property + def morphs_per_type(self): + """RETURNS (dict): Scores per dependency label. 
+ """ + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.morphs_per_feat.items() + } + @property def sent_p(self): """RETURNS (float): F-score for identification of sentence starts. @@ -231,6 +257,9 @@ class Scorer(object): "ents_f": self.ents_f, "ents_per_type": self.ents_per_type, "tags_acc": self.tags_acc, + "pos_acc": self.pos_acc, + "morphs_acc": self.morphs_acc, + "morphs_per_type": self.morphs_per_type, "sent_p": self.sent_p, "sent_r": self.sent_r, "sent_f": self.sent_f, @@ -264,12 +293,23 @@ class Scorer(object): gold_deps = set() gold_deps_per_dep = {} gold_tags = set() + gold_pos = set() + gold_morphs = set() + gold_morphs_per_feat = {} gold_sent_starts = set() gold_ents = set(tags_to_entities(orig.entities)) - for id_, tag, head, dep, sent_start in zip( - orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts - ): + for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts): gold_tags.add((id_, tag)) + gold_pos.add((id_, pos)) + gold_morphs.add((id_, morph)) + if morph: + for feat in morph.split("|"): + field, values = feat.split("=") + if field not in self.morphs_per_feat: + self.morphs_per_feat[field] = PRFScore() + if field not in gold_morphs_per_feat: + gold_morphs_per_feat[field] = set() + gold_morphs_per_feat[field].add((id_, feat)) if sent_start: gold_sent_starts.add(id_) if dep not in (None, "") and dep.lower() not in punct_labels: @@ -282,6 +322,9 @@ class Scorer(object): cand_deps = set() cand_deps_per_dep = {} cand_tags = set() + cand_pos = set() + cand_morphs = set() + cand_morphs_per_feat = {} cand_sent_starts = set() for token in doc: if token.orth_.isspace(): @@ -292,6 +335,16 @@ class Scorer(object): else: self.tokens.tp += 1 cand_tags.add((gold_i, token.tag_)) + cand_pos.add((gold_i, token.pos_)) + cand_morphs.add((gold_i, token.morph_)) + if token.morph_: + for feat in token.morph_.split("|"): + 
field, values = feat.split("=") + if field not in self.morphs_per_feat: + self.morphs_per_feat[field] = PRFScore() + if field not in cand_morphs_per_feat: + cand_morphs_per_feat[field] = set() + cand_morphs_per_feat[field].add((gold_i, feat)) if token.is_sent_start: cand_sent_starts.add(gold_i) if token.dep_.lower() not in punct_labels and token.orth_.strip(): @@ -340,6 +393,10 @@ class Scorer(object): # Score for all ents self.ner.score_set(cand_ents, gold_ents) self.tags.score_set(cand_tags, gold_tags) + self.pos.score_set(cand_pos, gold_pos) + self.morphs.score_set(cand_morphs, gold_morphs) + for field in self.morphs_per_feat: + self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set())) self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) self.labelled.score_set(cand_deps, gold_deps) for dep in self.labelled_per_dep: diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py new file mode 100644 index 000000000..f9307afc2 --- /dev/null +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -0,0 +1,49 @@ +import pytest + +from spacy import util +from spacy.lang.en import English +from spacy.language import Language +from spacy.tests.util import make_tempdir + + +def test_label_types(): + nlp = Language() + nlp.add_pipe(nlp.create_pipe("morphologizer")) + nlp.get_pipe("morphologizer").add_label("Feat=A") + with pytest.raises(ValueError): + nlp.get_pipe("morphologizer").add_label(9) + + +TRAIN_DATA = [ + ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}), + ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}), +] + + +def test_overfitting_IO(): + # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly + nlp = English() + morphologizer = nlp.create_pipe("morphologizer") + for inst in TRAIN_DATA: + for morph, pos in 
zip(inst[1]["morphs"], inst[1]["pos"]): + morphologizer.add_label(morph + "|POS=" + pos) + nlp.add_pipe(morphologizer) + optimizer = nlp.begin_training() + + for i in range(50): + losses = {} + nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + assert losses["morphologizer"] < 0.00001 + + # test the trained model + test_text = "I like blue eggs" + doc = nlp(test_text) + gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"] + assert gold_morphs == [t.morph_ for t in doc] + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + assert gold_morphs == [t.morph_ for t in doc2] diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 38c980428..b091ec0de 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -8,6 +8,7 @@ from spacy.language import Language from spacy.pipeline import TextCategorizer from spacy.tokens import Doc from spacy.gold import GoldParse +from spacy.util import fix_random_seed from ..util import make_tempdir from ...ml.models.defaults import default_tok2vec @@ -82,6 +83,7 @@ def test_label_types(): def test_overfitting_IO(): # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly + fix_random_seed(0) nlp = English() textcat = nlp.create_pipe("textcat") for _, annotations in TRAIN_DATA: diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index efaf80b4f..d750a8202 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -5,6 +5,7 @@ from spacy.gold import Example, GoldParse from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc +from spacy.lang.en import English test_las_apple = [ [ @@ -39,6 +40,43 @@ test_ner_apple = [ ] ] +@pytest.fixture +def tagged_doc(): + 
text = "Sarah's sister flew to Silicon Valley via London." + tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] + pos = [ + "PROPN", + "PART", + "NOUN", + "VERB", + "ADP", + "PROPN", + "PROPN", + "ADP", + "PROPN", + "PUNCT", + ] + morphs = [ + "NounType=prop|Number=sing", + "Poss=yes", + "Number=sing", + "Tense=past|VerbForm=fin", + "", + "NounType=prop|Number=sing", + "NounType=prop|Number=sing", + "", + "NounType=prop|Number=sing", + "PunctType=peri", + ] + nlp = English() + doc = nlp(text) + for i in range(len(tags)): + doc[i].tag_ = tags[i] + doc[i].pos_ = pos[i] + doc[i].morph_ = morphs[i] + doc.is_tagged = True + return doc + def test_las_per_type(en_vocab): # Gold and Doc are identical @@ -139,6 +177,43 @@ def test_ner_per_type(en_vocab): assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666) +def test_tag_score(tagged_doc): + # Gold and Doc are identical + scorer = Scorer() + gold = GoldParse( + tagged_doc, + tags=[t.tag_ for t in tagged_doc], + pos=[t.pos_ for t in tagged_doc], + morphs=[t.morph_ for t in tagged_doc] + ) + scorer.score((tagged_doc, gold)) + results = scorer.scores + + assert results["tags_acc"] == 100 + assert results["pos_acc"] == 100 + assert results["morphs_acc"] == 100 + assert results["morphs_per_type"]["NounType"]["f"] == 100 + + # Gold and Doc are identical + scorer = Scorer() + tags = [t.tag_ for t in tagged_doc] + tags[0] = "NN" + pos = [t.pos_ for t in tagged_doc] + pos[1] = "X" + morphs = [t.morph_ for t in tagged_doc] + morphs[1] = "Number=sing" + morphs[2] = "Number=plur" + gold = GoldParse(tagged_doc, tags=tags, pos=pos, morphs=morphs) + scorer.score((tagged_doc, gold)) + results = scorer.scores + + assert results["tags_acc"] == 90 + assert results["pos_acc"] == 90 + assert results["morphs_acc"] == approx(80) + assert results["morphs_per_type"]["Poss"]["f"] == 0.0 + assert results["morphs_per_type"]["Number"]["f"] == approx(72.727272) + + def test_roc_auc_score(): # Binary classification, toy 
tests from scikit-learn test suite y_true = [0, 1] diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 7f231887f..050a6b898 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -50,6 +50,7 @@ cdef class Doc: cdef public bint is_tagged cdef public bint is_parsed + cdef public bint is_morphed cdef public float sentiment From b2e93be867be16acee8ccc6f95e4fb1ebf7d86cf Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 3 Apr 2020 13:02:46 +0200 Subject: [PATCH 095/187] Optimizer defaults (#5244) * set optimizer defaults to mimic thinc 7 + bump to dev6 * larger error range for senter overfitting test --- spacy/about.py | 2 +- spacy/tests/pipeline/test_senter.py | 2 +- spacy/util.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 0c0a2d002..6fa1f4c0b 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev4" +__version__ = "3.0.0.dev6" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 411768e5f..197fdca6e 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -33,7 +33,7 @@ def test_overfitting_IO(): for i in range(200): losses = {} nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) - assert losses["senter"] < 0.0001 + assert losses["senter"] < 0.001 # test the trained model test_text = "I like purple eggs. They eat ham. You like yellow eggs." 
diff --git a/spacy/util.py b/spacy/util.py index 37649c5e6..ef9082140 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -790,7 +790,8 @@ def create_default_optimizer(): beta2 = env_opt("optimizer_B2", 0.999) eps = env_opt("optimizer_eps", 1e-8) L2 = env_opt("L2_penalty", 1e-6) - grad_clip = env_opt("grad_norm_clip", 1.0) + grad_clip = env_opt("grad_norm_clip", 10.0) + L2_is_weight_decay = env_opt("L2_is_weight_decay", False) optimizer = Adam( learn_rate, L2=L2, @@ -799,5 +800,6 @@ def create_default_optimizer(): eps=eps, ops=ops, grad_clip=grad_clip, + L2_is_weight_decay=L2_is_weight_decay, ) return optimizer From 42364dcd9f7c243271416b068a7bc708f9ef6346 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 9 Apr 2020 10:21:20 +0200 Subject: [PATCH 096/187] Remove "pala" tokenizer exception for Spanish (#5265) --- spacy/lang/es/tokenizer_exceptions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 5c7fcb15d..d5eb42e29 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -3,7 +3,6 @@ from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA _exc = { "pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}], - "pala": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "la", LEMMA: "la", NORM: "la"}], } From 688a3286689493d602db156edb0b768dc921eb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Harinck?= Date: Wed, 15 Apr 2020 16:47:29 +0200 Subject: [PATCH 097/187] docs(website): fix issue on example in spacy-lookup --- website/meta/universe.json | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 23d052bb9..70aace8c0 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -334,15 +334,16 @@ "from spacy_lookup import Entity", "", "nlp = spacy.load('en')", - "entity = Entity(keywords_list=['python', 'java platform'])", + 
"entity = Entity(keywords_list=['python', 'product manager', 'java platform'])", "nlp.add_pipe(entity, last=True)", "", "doc = nlp(u\"I am a product manager for a java and python.\")", "assert doc._.has_entities == True", - "assert doc[2:5]._.has_entities == True", "assert doc[0]._.is_entity == False", + "assert doc[3]._.entity_desc == 'product manager'", "assert doc[3]._.is_entity == True", - "print(doc._.entities)" + "", + "print([(token.text, token._.canonical) for token in doc if token._.is_entity])" ], "author": "Marc Puig", "author_links": { From 6918d99b6c631b5256aa24302050b085af841cc8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 20 Apr 2020 22:06:28 +0200 Subject: [PATCH 098/187] Improve GPU usage for train-with-config (#5330) * Adjust for no ops in Optimizer * Fix gpu in train-from-config * Update train-from-config script * Fix parser * Fix GPU efficiency of padding backprop --- spacy/cli/train_from_config.py | 37 ++++++++++++++++++------------- spacy/ml/_precomputable_affine.py | 19 +++++----------- spacy/syntax/nn_parser.pyx | 1 + spacy/util.py | 2 -- 4 files changed, 27 insertions(+), 32 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 5b09909c7..933b275c4 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,4 +1,5 @@ from typing import Optional, Dict, List, Union, Sequence +from timeit import default_timer as timer from pydantic import BaseModel, FilePath import plac import tqdm @@ -146,30 +147,29 @@ def train_from_config_cli( if output_path is not None and not output_path.exists(): output_path.mkdir() - try: - train_from_config( - config_path, - {"train": train_path, "dev": dev_path}, - output_path=output_path, - meta_path=meta_path, - raw_text=raw_text, - ) - except KeyboardInterrupt: - msg.warn("Cancelled.") + train_from_config( + config_path, + {"train": train_path, "dev": dev_path}, + output_path=output_path, + meta_path=meta_path, + raw_text=raw_text, + 
) def train_from_config( config_path, data_paths, raw_text=None, meta_path=None, output_path=None, ): msg.info(f"Loading config from: {config_path}") - config = util.load_config(config_path, create_objects=True) + config = util.load_config(config_path, create_objects=False) + nlp_config = config["nlp"] use_gpu = config["training"]["use_gpu"] if use_gpu >= 0: msg.info("Using GPU") + util.use_gpu(use_gpu) else: msg.info("Using CPU") + config = util.load_config(config_path, create_objects=True) msg.info("Creating nlp from config") - nlp_config = util.load_config(config_path, create_objects=False)["nlp"] nlp = util.load_model_from_config(nlp_config) optimizer = config["optimizer"] training = config["training"] @@ -240,12 +240,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True ) ) + n_words = sum(len(ex.doc) for ex in dev_examples) + start_time = timer() scorer = nlp.evaluate(dev_examples) + end_time = timer() + wps = n_words / (end_time - start_time) scores = scorer.scores # Calculate a weighted sum based on score_weights for the main score weights = cfg["score_weights"] weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) - return weighted_score, scorer.scores + scores["speed"] = wps + return weighted_score, scores return evaluate @@ -346,13 +351,13 @@ def setup_printer(training, nlp): def print_row(info): losses = [ - "{0:.2f}".format(info["losses"].get(pipe_name, 0.0)) + "{0:.2f}".format(float(info["losses"].get(pipe_name, 0.0))) for pipe_name in nlp.pipe_names ] scores = [ - "{0:.2f}".format(info["other_scores"].get(col, 0.0)) for col in score_cols + "{0:.2f}".format(float(info["other_scores"].get(col, 0.0))) for col in score_cols ] - data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])] + data = [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] msg.row(data, widths=table_widths, aligns=table_aligns) return print_row diff 
--git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index a752ef49a..c7328bad9 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -79,23 +79,14 @@ def _backprop_precomputable_affine_padding(model, dY, ids): # for b in range(nB): # for f in range(nF): # if ids[b, f] < 0: - # d_pad[0, f] += dY[b] + # d_pad[f] += dY[b] # # Which can be rewritten as: # - # for b in range(nB): - # d_pad[0, ids[b] < 0] += dY[b] - # - # I don't know how to avoid the loop without building a whole array :(. - # Cursed numpy. - # - # Note by Sofie: rewritten to longer loop because "CuPy only supports slices that consist of one boolean array." - d_pad = model.ops.alloc((1, nF, nO, nP)) - for b in range(nB): - for f in range(nF): - if ids[b, f] < 0: - d_pad[0, f] += dY[b] - return d_pad + # (ids < 0).T @ dY + mask = model.ops.asarray(ids < 0, dtype="f") + d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True) + return d_pad.reshape((1, nF, nO, nP)) def init(model, X=None, Y=None): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index f480e3528..01d6d5bfe 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -216,6 +216,7 @@ cdef class Parser: # expand our model output. 
self._resize() model = self.model.predict(docs) + W_param = model.vec2scores.get_param("W") weights = get_c_weights(model) for state in batch: if not state.is_final(): diff --git a/spacy/util.py b/spacy/util.py index ef9082140..ea3023629 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -784,7 +784,6 @@ VECTORS_KEY = "spacy_pretrained_vectors" def create_default_optimizer(): - ops = get_current_ops() learn_rate = env_opt("learn_rate", 0.001) beta1 = env_opt("optimizer_B1", 0.9) beta2 = env_opt("optimizer_B2", 0.999) @@ -798,7 +797,6 @@ def create_default_optimizer(): beta1=beta1, beta2=beta2, eps=eps, - ops=ops, grad_clip=grad_clip, L2_is_weight_decay=L2_is_weight_decay, ) From b2ef6100af585942388930a14fa78e9762758f36 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 21 Apr 2020 19:30:41 +0200 Subject: [PATCH 099/187] Only run backprop once when shared tok2vec weights (#5331) Previously, pipelines with shared tok2vec weights would call the tok2vec backprop callback multiple times, once for each pipeline component. This caused errors for PyTorch, and was inefficient. Instead, accumulate the gradient for all but one component, and just call the callback once. 
--- spacy/pipeline/tok2vec.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index ef744a5da..83a4454e3 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -103,20 +103,30 @@ class Tok2Vec(Pipe): set_dropout_rate(self.model, drop) tokvecs, bp_tokvecs = self.model.begin_update(docs) - def capture_losses(d_tokvecs): - """Accumulate tok2vec loss before doing backprop.""" - l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs) - if self.name in losses: - losses[self.name] += l2_loss / len(d_tokvecs) - else: - losses[self.name] = l2_loss / len(d_tokvecs) - return bp_tokvecs(d_tokvecs) + d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] + losses.setdefault(self.name, 0.0) + + def accumulate_gradient(one_d_tokvecs): + """Accumulate tok2vec loss and gradient. This is passed as a callback + to all but the last listener. Only the last one does the backprop. + """ + nonlocal d_tokvecs + for i in range(len(one_d_tokvecs)): + d_tokvecs[i] += one_d_tokvecs[i] + losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) + + def backprop(one_d_tokvecs): + """Callback to actually do the backprop. 
Passed to last listener.""" + accumulate_gradient(one_d_tokvecs) + d_docs = bp_tokvecs(d_tokvecs) + if sgd is not None: + self.model.finish_update(sgd) + return d_docs batch_id = Tok2VecListener.get_batch_id(docs) - for listener in self.listeners: - listener.receive(batch_id, tokvecs, capture_losses) - if sgd is not None: - self.model.finish_update(sgd) + for listener in self.listeners[:-1]: + listener.receive(batch_id, tokvecs, accumulate_gradient) + self.listeners[-1].receive(batch_id, tokvecs, backprop) if set_annotations: self.set_annotations(docs, tokvecs) From 1bf2082ac48ab02300177c8a630e2fa5e74b7b7d Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 29 Apr 2020 12:51:49 +0200 Subject: [PATCH 100/187] update is_new_osx function (#5376) --- setup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index d9021836f..d16615f5f 100755 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import sys +import platform from distutils.command.build_ext import build_ext from distutils.sysconfig import get_python_inc import distutils.util @@ -73,18 +74,18 @@ COPY_FILES = { def is_new_osx(): - """Check whether we're on OSX >= 10.10""" + """Check whether we're on OSX >= 10.7""" name = distutils.util.get_platform() if sys.platform != "darwin": return False - elif name.startswith("macosx-10"): - minor_version = int(name.split("-")[1].split(".")[1]) + mac_ver = platform.mac_ver()[0] + if mac_ver.startswith("10"): + minor_version = int(mac_ver.split('.')[1]) if minor_version >= 7: return True else: return False - else: - return False + return False if is_new_osx(): From eb117e2fce9d1029670f52690d30d17e6edbd24e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 2 May 2020 14:09:21 +0200 Subject: [PATCH 101/187] Add load_config_from_str helper --- spacy/util.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index ea3023629..b4ecc8b03 100644 --- 
a/spacy/util.py +++ b/spacy/util.py @@ -219,6 +219,23 @@ def load_config(path, create_objects=False): return config +def load_config_from_str(string, create_objects=False): + """Load a Thinc-formatted config, optionally filling in objects where + the config references registry entries. See "Thinc config files" for details. + + string (unicode or Path): Text contents of the config file. + create_objects (bool): Whether to automatically create objects when the config + references registry entries. Defaults to False. + + RETURNS (dict): The objects from the config file. + """ + config = thinc.config.Config().from_str(string) + if create_objects: + return registry.make_from_config(config, validate=True) + else: + return config + + def get_model_meta(path): """Get model meta.json from a directory path and validate its contents. From 9fe1e235127f4b6e870a3180af879fefc4c33e90 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 12 May 2020 13:51:25 +0200 Subject: [PATCH 102/187] update to thinc 8.0.0a6 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9440c2d44..a7b4c825e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==8.0.0a3", + "thinc==8.0.0a6", "blis>=0.4.0,<0.5.0" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 73e595daf..814eaf3dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==8.0.0a3 +thinc==8.0.0a6 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 2ff13e3e1..80ceed207 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,13 +36,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==8.0.0a3 + thinc==8.0.0a6 
install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==8.0.0a3 + thinc==8.0.0a6 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 From 102c8c7e2f482b67d8fea8e4b9b341365da38565 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 12 May 2020 13:56:10 +0200 Subject: [PATCH 103/187] fix fan_in renaming --- spacy/ml/_precomputable_affine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index c7328bad9..ec95cdafd 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -110,7 +110,8 @@ def init(model, X=None, Y=None): pad = model.ops.alloc4f(1, nF, nO, nP) ops = model.ops - W = normal_init(ops, W.shape, fan_in=nF * nI) + scale = float(ops.xp.sqrt(1.0 / (nF * nI))) + W = normal_init(ops, W.shape, mean=scale) model.set_param("W", W) model.set_param("b", b) model.set_param("pad", pad) From e0fda2bd81bd7e7b1a9006c403d52c470954701b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 15 May 2020 11:02:10 +0200 Subject: [PATCH 104/187] throw warning when model_cfg is None --- spacy/errors.py | 2 ++ spacy/language.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 23139e10a..99a0081c0 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -104,6 +104,8 @@ class Warnings(object): "string \"Field1=Value1,Value2|Field2=Value3\".") # TODO: fix numbering after merging develop into master + W097 = ("No Model config was provided to create the '{name}' component, " + "and no default configuration could be found either.") W098 = ("No Model config was provided to create the '{name}' component, " "so a default configuration was used.") W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', " diff --git a/spacy/language.py b/spacy/language.py index 5343df4b7..2dd7ce406 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -184,6 +184,7 @@ 
class Language(object): self.max_length = max_length self._optimizer = None + # TODO: de-uglify (incorporating into component decorator didn't work because of circular imports) from .ml.models.defaults import ( default_tagger_config, default_parser_config, @@ -349,6 +350,8 @@ class Language(object): if model_cfg is None and default_config is not None: warnings.warn(Warnings.W098.format(name=name)) model_cfg = default_config["model"] + if model_cfg is None: + warnings.warn(Warnings.W097.format(name=name)) model = None if model_cfg is not None: self.config[name] = {"model": model_cfg} From 79d4f196e54cce1b85bb6e741714e1a89ed4689c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 15 May 2020 11:53:01 +0200 Subject: [PATCH 105/187] pin flak8 to 3.5.0 --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f93dffaed..4dfb51296 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -27,7 +27,7 @@ jobs: inputs: versionSpec: '3.7' - script: | - pip install flake8 + pip install flake8==3.5.0 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics displayName: 'flake8' From 047f3d7d94a6ef9dec904a8a468497c9dcab7506 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 15 May 2020 13:25:00 +0200 Subject: [PATCH 106/187] remove ops argument for Adam --- spacy/tests/parser/test_add_label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index fb43458ae..647c9720c 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -29,7 +29,7 @@ def _train_parser(parser): fix_random_seed(1) parser.add_label("left") parser.begin_training([], **parser.cfg) - sgd = Adam(0.001, ops=NumpyOps()) + sgd = Adam(0.001) for i in range(5): losses = {} @@ -42,7 +42,7 @@ def _train_parser(parser): def test_add_label(parser): parser = 
_train_parser(parser) parser.add_label("right") - sgd = Adam(0.001, ops=NumpyOps()) + sgd = Adam(0.001) for i in range(100): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) From 6fb6a8518c014f10bb07aab386503b2ee6540ec4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 15 May 2020 13:25:54 +0200 Subject: [PATCH 107/187] bump to 3.0.0.dev7 and thinc to 8.0.0a8 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- spacy/about.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a7b4c825e..548664e89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==8.0.0a6", + "thinc==8.0.0a8", "blis>=0.4.0,<0.5.0" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 814eaf3dc..08b4c228a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==8.0.0a6 +thinc==8.0.0a8 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 80ceed207..9fe02018b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,13 +36,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==8.0.0a6 + thinc==8.0.0a8 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==8.0.0a6 + thinc==8.0.0a8 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 diff --git a/spacy/about.py b/spacy/about.py index 6fa1f4c0b..3f87c8dbc 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev6" +__version__ = "3.0.0.dev7" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = 
"https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From e8ff4c1e6a2b92eb0194d343a6f1f212172f4bb8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 18 May 2020 10:50:21 +0200 Subject: [PATCH 108/187] Pin flake8 version --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f93dffaed..4dfb51296 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -27,7 +27,7 @@ jobs: inputs: versionSpec: '3.7' - script: | - pip install flake8 + pip install flake8==3.5.0 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics displayName: 'flake8' From 333b1a308b8edd91e06ce914e49b10834e2de3ce Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 18 May 2020 22:23:33 +0200 Subject: [PATCH 109/187] Adapt parser and NER for transformers (#5449) * Draft layer for BILUO actions * Fixes to biluo layer * WIP on BILUO layer * Add tests for BILUO layer * Format * Fix transitions * Update test * Link in the simple_ner * Update BILUO tagger * Update __init__ * Import simple_ner * Update test * Import * Add files * Add config * Fix label passing for BILUO and tagger * Fix label handling for simple_ner component * Update simple NER test * Update config * Hack train script * Update BILUO layer * Fix SimpleNER component * Update train_from_config * Add biluo_to_iob helper * Add IOB layer * Add IOBTagger model * Update biluo layer * Update SimpleNER tagger * Update BILUO * Read random seed in train-from-config * Update use of normal_init * Fix normalization of gradient in SimpleNER * Update IOBTagger * Remove print * Tweak masking in BILUO * Add dropout in SimpleNER * Update thinc * Tidy up simple_ner * Fix biluo model * Unhack train-from-config * Update setup.cfg and requirements * Add tb_framework.py for parser model * Try to avoid memory leak in BILUO * Move ParserModel into spacy.ml, avoid need for subclass. 
* Use updated parser model * Remove incorrect call to model.initializre in PrecomputableAffine * Update parser model * Avoid divide by zero in tagger * Add extra dropout layer in tagger * Refine minibatch_by_words function to avoid oom * Fix parser model after refactor * Try to avoid div-by-zero in SimpleNER * Fix infinite loop in minibatch_by_words * Use SequenceCategoricalCrossentropy in Tagger * Fix parser model when hidden layer * Remove extra dropout from tagger * Add extra nan check in tagger * Fix thinc version * Update tests and imports * Fix test * Update test * Update tests * Fix tests * Fix test Co-authored-by: Ines Montani --- .../tok2vec-ner/multihashembed_tok2vec.cfg | 57 +-- examples/training/train_ner.py | 12 +- spacy/cli/train_from_config.py | 81 ++-- spacy/gold.pyx | 8 + spacy/language.py | 2 + spacy/ml/_biluo.py | 109 +++++ spacy/ml/_iob.py | 92 ++++ spacy/ml/_precomputable_affine.py | 4 +- spacy/ml/models/__init__.py | 1 + spacy/ml/models/defaults/__init__.py | 10 + .../models/defaults/simple_ner_defaults.cfg | 12 + spacy/ml/models/parser.py | 30 +- spacy/ml/models/simple_ner.py | 82 ++++ spacy/ml/models/tagger.py | 5 +- spacy/ml/tb_framework.py | 86 ++++ spacy/pipeline/__init__.py | 2 + spacy/pipeline/pipes.pyx | 33 +- spacy/pipeline/simple_ner.py | 149 +++++++ spacy/syntax/_parser_model.pyx | 104 +---- spacy/syntax/nn_parser.pyx | 71 ++- spacy/tests/parser/test_add_label.py | 2 +- spacy/tests/parser/test_neural_parser.py | 6 +- spacy/tests/pipeline/test_simple_ner.py | 417 ++++++++++++++++++ spacy/tests/regression/test_issue2001-2500.py | 3 +- spacy/tests/regression/test_issue3001-3500.py | 3 +- .../tests/serialize/test_serialize_config.py | 6 +- .../serialize/test_serialize_pipeline.py | 2 +- spacy/tests/test_misc.py | 2 +- spacy/util.py | 36 +- 29 files changed, 1180 insertions(+), 247 deletions(-) create mode 100644 spacy/ml/_biluo.py create mode 100644 spacy/ml/_iob.py create mode 100644 spacy/ml/models/defaults/simple_ner_defaults.cfg 
create mode 100644 spacy/ml/models/simple_ner.py create mode 100644 spacy/ml/tb_framework.py create mode 100644 spacy/pipeline/simple_ner.py create mode 100644 spacy/tests/pipeline/test_simple_ner.py diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg index 4678a7d6b..dc25a1c3b 100644 --- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -4,12 +4,18 @@ limit = 0 dropout = 0.2 patience = 10000 eval_frequency = 200 -scores = ["ents_f"] +scores = ["ents_p", "ents_r", "ents_f"] score_weights = {"ents_f": 1} orth_variant_level = 0.0 gold_preproc = true max_length = 0 -batch_size = 25 + +[training.batch_size] +@schedules = "compounding.v1" +start = 3000 +stop = 3000 +compound = 1.001 + [optimizer] @optimizers = "Adam.v1" @@ -21,45 +27,18 @@ beta2 = 0.999 lang = "en" vectors = null -[nlp.pipeline.tok2vec] -factory = "tok2vec" - -[nlp.pipeline.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[nlp.pipeline.tok2vec.model.extract] -@architectures = "spacy.Doc2Feats.v1" -columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] - -[nlp.pipeline.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -columns = ${nlp.pipeline.tok2vec.model.extract:columns} -width = 96 -rows = 2000 -use_subwords = true -pretrained_vectors = null - -[nlp.pipeline.tok2vec.model.embed.mix] -@architectures = "spacy.LayerNormalizedMaxout.v1" -width = ${nlp.pipeline.tok2vec.model.embed:width} -maxout_pieces = 3 - -[nlp.pipeline.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = ${nlp.pipeline.tok2vec.model.embed:width} -window_size = 1 -maxout_pieces = 3 -depth = 2 - [nlp.pipeline.ner] -factory = "ner" +factory = "simple_ner" [nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 6 -hidden_width = 64 -maxout_pieces = 2 +@architectures = 
"spacy.BiluoTagger.v1" [nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model.embed:width} +@architectures = "spacy.HashEmbedCNN.v1" +width = 128 +depth = 4 +embed_size = 7000 +maxout_pieces = 3 +window_size = 1 +subword_features = true +pretrained_vectors = null diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index d5d034616..d4e0bf794 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -42,26 +42,28 @@ def main(model=None, output_dir=None, n_iter=100): # create the built-in pipeline components and add them to the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy - if "ner" not in nlp.pipe_names: - ner = nlp.create_pipe("ner") + if "simple_ner" not in nlp.pipe_names: + ner = nlp.create_pipe("simple_ner") nlp.add_pipe(ner, last=True) # otherwise, get it so we can add labels else: - ner = nlp.get_pipe("ner") + ner = nlp.get_pipe("simple_ner") # add labels for _, annotations in TRAIN_DATA: for ent in annotations.get("entities"): + print("Add label", ent[2]) ner.add_label(ent[2]) # get names of other pipes to disable them during training - pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] + pipe_exceptions = ["simple_ner"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] with nlp.disable_pipes(*other_pipes): # only train NER # reset and initialize the weights randomly – but only if we're # training a new model if model is None: nlp.begin_training() + print("Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names()))) for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} @@ -70,7 +72,7 @@ def main(model=None, output_dir=None, n_iter=100): for batch in batches: nlp.update( batch, - drop=0.5, # dropout - make it harder to memorise data + drop=0.0, # dropout - make it harder to memorise data losses=losses, ) print("Losses", losses) diff --git 
a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 933b275c4..bd83deb04 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -8,6 +8,7 @@ from wasabi import msg import thinc import thinc.schedules from thinc.api import Model +import random from ..gold import GoldCorpus from .. import util @@ -119,6 +120,7 @@ class ConfigSchema(BaseModel): output_path=("Output directory to store model in", "option", "o", Path), meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), + use_gpu=("Use GPU", "option", "g", int), # fmt: on ) def train_from_config_cli( @@ -130,6 +132,7 @@ def train_from_config_cli( raw_text=None, debug=False, verbose=False, + use_gpu=-1 ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's @@ -147,6 +150,12 @@ def train_from_config_cli( if output_path is not None and not output_path.exists(): output_path.mkdir() + if use_gpu >= 0: + msg.info("Using GPU") + util.use_gpu(use_gpu) + else: + msg.info("Using CPU") + train_from_config( config_path, {"train": train_path, "dev": dev_path}, @@ -161,13 +170,8 @@ def train_from_config( ): msg.info(f"Loading config from: {config_path}") config = util.load_config(config_path, create_objects=False) + util.fix_random_seed(config["training"]["seed"]) nlp_config = config["nlp"] - use_gpu = config["training"]["use_gpu"] - if use_gpu >= 0: - msg.info("Using GPU") - util.use_gpu(use_gpu) - else: - msg.info("Using CPU") config = util.load_config(config_path, create_objects=True) msg.info("Creating nlp from config") nlp = util.load_model_from_config(nlp_config) @@ -177,7 +181,7 @@ def train_from_config( msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) msg.info("Initializing the nlp pipeline") - nlp.begin_training(lambda: corpus.train_examples, device=use_gpu) + 
nlp.begin_training(lambda: corpus.train_examples) train_batches = create_train_batches(nlp, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) @@ -192,6 +196,7 @@ def train_from_config( training["dropout"], training["patience"], training["eval_frequency"], + training["accumulate_gradient"] ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") @@ -220,43 +225,50 @@ def train_from_config( def create_train_batches(nlp, corpus, cfg): while True: - train_examples = corpus.train_dataset( + train_examples = list(corpus.train_dataset( nlp, noise_level=0.0, orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], ignore_misaligned=True, - ) - for batch in util.minibatch_by_words(train_examples, size=cfg["batch_size"]): + )) + random.shuffle(train_examples) + batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"]) + for batch in batches: yield batch def create_evaluation_callback(nlp, optimizer, corpus, cfg): def evaluate(): - with nlp.use_params(optimizer.averages): - dev_examples = list( - corpus.dev_dataset( - nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True - ) + dev_examples = list( + corpus.dev_dataset( + nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True ) - n_words = sum(len(ex.doc) for ex in dev_examples) - start_time = timer() - scorer = nlp.evaluate(dev_examples) - end_time = timer() - wps = n_words / (end_time - start_time) - scores = scorer.scores - # Calculate a weighted sum based on score_weights for the main score - weights = cfg["score_weights"] - weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) - scores["speed"] = wps + ) + n_words = sum(len(ex.doc) for ex in dev_examples) + start_time = timer() + + if optimizer.averages: + with nlp.use_params(optimizer.averages): + scorer = nlp.evaluate(dev_examples, batch_size=32) + else: + scorer = nlp.evaluate(dev_examples, batch_size=32) + end_time = 
timer() + wps = n_words / (end_time - start_time) + scores = scorer.scores + # Calculate a weighted sum based on score_weights for the main score + weights = cfg["score_weights"] + weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) + scores["speed"] = wps return weighted_score, scores return evaluate def train_while_improving( - nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency + nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency, + accumulate_gradient ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -303,7 +315,7 @@ def train_while_improving( losses = {} for step, batch in enumerate(train_data): dropout = next(dropouts) - for subbatch in subdivide_batch(batch): + for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) for name, proc in nlp.pipeline: if hasattr(proc, "model"): @@ -332,8 +344,19 @@ def train_while_improving( break -def subdivide_batch(batch): - return [batch] +def subdivide_batch(batch, accumulate_gradient): + batch = list(batch) + batch.sort(key=lambda eg: len(eg.doc)) + sub_len = len(batch) // accumulate_gradient + start = 0 + for i in range(accumulate_gradient): + subbatch = batch[start : start + sub_len] + if subbatch: + yield subbatch + start += len(subbatch) + subbatch = batch[start : ] + if subbatch: + yield subbatch def setup_printer(training, nlp): diff --git a/spacy/gold.pyx b/spacy/gold.pyx index a9156c1a5..6647e41b4 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -608,6 +608,14 @@ def iob_to_biluo(tags): return out +def biluo_to_iob(tags): + out = [] + for tag in tags: + tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1) + out.append(tag) + return out + + def _consume_os(tags): while tags and tags[0] == "O": yield tags.pop(0) diff --git a/spacy/language.py b/spacy/language.py index 2dd7ce406..a7db5ef20 
100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -195,6 +195,7 @@ class Language(object): default_senter_config, default_tensorizer_config, default_tok2vec_config, + default_simple_ner_config ) self.defaults = { @@ -205,6 +206,7 @@ class Language(object): "entity_linker": default_nel_config(), "morphologizer": default_morphologizer_config(), "senter": default_senter_config(), + "simple_ner": default_simple_ner_config(), "tensorizer": default_tensorizer_config(), "tok2vec": default_tok2vec_config(), } diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py new file mode 100644 index 000000000..28339089a --- /dev/null +++ b/spacy/ml/_biluo.py @@ -0,0 +1,109 @@ +"""Thinc layer to do simpler transition-based parsing, NER, etc.""" +from typing import List, Tuple, Dict, Optional +import numpy +from thinc.api import Ops, Model, with_array, softmax_activation, padded2list +from thinc.api import to_numpy +from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d + +from ..tokens import Doc + + +def BILUO() -> Model[Padded, Padded]: + return Model( + "biluo", + forward, + init=init, + dims={"nO": None}, + attrs={"get_num_actions": get_num_actions} + ) + + +def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): + if X is not None and Y is not None: + if X.data.shape != Y.data.shape: + # TODO: Fix error + raise ValueError("Mismatched shapes (TODO: Fix message)") + model.set_dim("nO", X.data.shape[2]) + elif X is not None: + model.set_dim("nO", X.data.shape[2]) + elif Y is not None: + model.set_dim("nO", Y.data.shape[2]) + elif model.get_dim("nO") is None: + raise ValueError("Dimension unset for BILUO: nO") + + +def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): + n_labels = (model.get_dim("nO") - 1) // 4 + n_tokens, n_docs, n_actions = Xp.data.shape + # At each timestep, we make a validity mask of shape (n_docs, n_actions) + # to indicate which actions are valid next for each sequence. 
To construct + # the mask, we have a state of shape (2, n_actions) and a validity table of + # shape (2, n_actions+1, n_actions). The first dimension of the state indicates + # whether it's the last token, the second dimension indicates the previous + # action, plus a special 'null action' for the first entry. + valid_transitions = model.ops.asarray(_get_transition_table(n_labels)) + prev_actions = model.ops.alloc1i(n_docs) + # Initialize as though prev action was O + prev_actions.fill(n_actions - 1) + Y = model.ops.alloc3f(*Xp.data.shape) + masks = model.ops.alloc3f(*Y.shape) + max_value = Xp.data.max() + for t in range(Xp.data.shape[0]): + is_last = (Xp.lengths < (t+2)).astype("i") + masks[t] = valid_transitions[is_last, prev_actions] + # Don't train the out-of-bounds sequences. + masks[t, Xp.size_at_t[t]:] = 0 + # Valid actions get 0*10e8, invalid get large negative value + Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10) + prev_actions = Y[t].argmax(axis=-1) + + def backprop_biluo(dY: Padded) -> Padded: + dY.data *= masks + return dY + + return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo + + +def get_num_actions(n_labels: int) -> int: + # One BEGIN action per label + # One IN action per label + # One LAST action per label + # One UNIT action per label + # One OUT action + return n_labels + n_labels + n_labels + n_labels + 1 + + +def _get_transition_table( + n_labels: int, *, _cache: Dict[int, Floats3d] = {} +) -> Floats3d: + n_actions = get_num_actions(n_labels) + if n_actions in _cache: + return _cache[n_actions] + table = numpy.zeros((2, n_actions, n_actions), dtype="f") + B_start, B_end = (0, n_labels) + I_start, I_end = (B_end, B_end + n_labels) + L_start, L_end = (I_end, I_end + n_labels) + U_start, U_end = (L_end, L_end + n_labels) + # Using ranges allows us to set specific cells, which is necessary to express + # that only actions of the same label are valid continuations. 
+ B_range = numpy.arange(B_start, B_end) + I_range = numpy.arange(I_start, I_end) + L_range = numpy.arange(L_start, L_end) + O_action = U_end + # If this is the last token and the previous action was B or I, only L + # of that label is valid + table[1, B_range, L_range] = 1 + table[1, I_range, L_range] = 1 + # If this isn't the last token and the previous action was B or I, only I or + # L of that label are valid. + table[0, B_range, I_range] = 1 + table[0, B_range, L_range] = 1 + table[0, I_range, I_range] = 1 + table[0, I_range, L_range] = 1 + # If this isn't the last token and the previous was L, U or O, B is valid + table[0, L_start:, :B_end] = 1 + # Regardless of whether this is the last token, if the previous action was + # {L, U, O}, U and O are valid. + table[:, L_start:, U_start:] = 1 + _cache[n_actions] = table + return table diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py new file mode 100644 index 000000000..0ce9a71e6 --- /dev/null +++ b/spacy/ml/_iob.py @@ -0,0 +1,92 @@ +"""Thinc layer to do simpler transition-based parsing, NER, etc.""" +from typing import List, Tuple, Dict, Optional +from thinc.api import Ops, Model, with_array, softmax_activation, padded2list +from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d + +from ..tokens import Doc + + +def IOB() -> Model[Padded, Padded]: + return Model( + "biluo", + forward, + init=init, + dims={"nO": None}, + attrs={"get_num_actions": get_num_actions} + ) + + +def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): + if X is not None and Y is not None: + if X.data.shape != Y.data.shape: + # TODO: Fix error + raise ValueError("Mismatched shapes (TODO: Fix message)") + model.set_dim("nO", X.data.shape[2]) + elif X is not None: + model.set_dim("nO", X.data.shape[2]) + elif Y is not None: + model.set_dim("nO", Y.data.shape[2]) + elif model.get_dim("nO") is None: + raise ValueError("Dimension unset for BILUO: nO") + + +def forward(model: Model[Padded, Padded], Xp: Padded, is_train: 
bool): + n_labels = (model.get_dim("nO") - 1) // 2 + n_tokens, n_docs, n_actions = Xp.data.shape + # At each timestep, we make a validity mask of shape (n_docs, n_actions) + # to indicate which actions are valid next for each sequence. To construct + # the mask, we have a state of shape (2, n_actions) and a validity table of + # shape (2, n_actions+1, n_actions). The first dimension of the state indicates + # whether it's the last token, the second dimension indicates the previous + # action, plus a special 'null action' for the first entry. + valid_transitions = _get_transition_table(model.ops, n_labels) + prev_actions = model.ops.alloc1i(n_docs) + # Initialize as though prev action was O + prev_actions.fill(n_actions - 1) + Y = model.ops.alloc3f(*Xp.data.shape) + masks = model.ops.alloc3f(*Y.shape) + for t in range(Xp.data.shape[0]): + masks[t] = valid_transitions[prev_actions] + # Don't train the out-of-bounds sequences. + masks[t, Xp.size_at_t[t]:] = 0 + # Valid actions get 0*10e8, invalid get -1*10e8 + Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8) + prev_actions = Y[t].argmax(axis=-1) + + def backprop_biluo(dY: Padded) -> Padded: + # Masking the gradient seems to do poorly here. But why? 
+ #dY.data *= masks + return dY + + return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo + + +def get_num_actions(n_labels: int) -> int: + # One BEGIN action per label + # One IN action per label + # One LAST action per label + # One UNIT action per label + # One OUT action + return n_labels * 2 + 1 + + +def _get_transition_table( + ops: Ops, n_labels: int, _cache: Dict[int, Floats3d] = {} +) -> Floats3d: + n_actions = get_num_actions(n_labels) + if n_actions in _cache: + return ops.asarray(_cache[n_actions]) + table = ops.alloc2f(n_actions, n_actions) + B_start, B_end = (0, n_labels) + I_start, I_end = (B_end, B_end + n_labels) + O_action = I_end + B_range = ops.xp.arange(B_start, B_end) + I_range = ops.xp.arange(I_start, I_end) + # B and O are always valid + table[:, B_start : B_end] = 1 + table[:, O_action] = 1 + # I can only follow a matching B + table[B_range, I_range] = 1 + + _cache[n_actions] = table + return table diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index ec95cdafd..f4b5b16fe 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -9,7 +9,6 @@ def PrecomputableAffine(nO, nI, nF, nP): dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP}, params={"W": None, "b": None, "pad": None}, ) - model.initialize() return model @@ -110,8 +109,7 @@ def init(model, X=None, Y=None): pad = model.ops.alloc4f(1, nF, nO, nP) ops = model.ops - scale = float(ops.xp.sqrt(1.0 / (nF * nI))) - W = normal_init(ops, W.shape, mean=scale) + W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI))) model.set_param("W", W) model.set_param("b", b) model.set_param("pad", pad) diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index d44c7cb2e..ef1e8efca 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,5 +1,6 @@ from .entity_linker import * # noqa from .parser import * # noqa +from .simple_ner import * from .tagger import * # noqa from 
.tensorizer import * # noqa from .textcat import * # noqa diff --git a/spacy/ml/models/defaults/__init__.py b/spacy/ml/models/defaults/__init__.py index d5490fd16..850d9fce0 100644 --- a/spacy/ml/models/defaults/__init__.py +++ b/spacy/ml/models/defaults/__init__.py @@ -91,3 +91,13 @@ def default_tok2vec_config(): def default_tok2vec(): loc = Path(__file__).parent / "tok2vec_defaults.cfg" return util.load_config(loc, create_objects=True)["model"] + + +def default_simple_ner_config(): + loc = Path(__file__).parent / "simple_ner_defaults.cfg" + return util.load_config(loc, create_objects=False) + + +def default_simple_ner(): + loc = Path(__file__).parent / "simple_ner_defaults.cfg" + return util.load_config(loc, create_objects=True)["model"] diff --git a/spacy/ml/models/defaults/simple_ner_defaults.cfg b/spacy/ml/models/defaults/simple_ner_defaults.cfg new file mode 100644 index 000000000..4e3b640df --- /dev/null +++ b/spacy/ml/models/defaults/simple_ner_defaults.cfg @@ -0,0 +1,12 @@ +[model] +@architectures = "spacy.BiluoTagger.v1" + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 128 +depth = 4 +embed_size = 7000 +window_size = 1 +maxout_pieces = 3 +subword_features = true diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index f2d51c2ba..710d36a1d 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -1,9 +1,9 @@ from pydantic import StrictInt -from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops +from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array from ...util import registry from .._precomputable_affine import PrecomputableAffine -from ...syntax._parser_model import ParserModel +from ..tb_framework import TransitionModel @registry.architectures.register("spacy.TransitionBasedParser.v1") @@ -12,21 +12,27 @@ def build_tb_parser_model( nr_feature_tokens: StrictInt, hidden_width: StrictInt, maxout_pieces: StrictInt, + 
use_upper=True, nO=None, ): token_vector_width = tok2vec.get_dim("nO") - tok2vec = chain(tok2vec, list2array()) - tok2vec.set_dim("nO", token_vector_width) + tok2vec = chain( + tok2vec, + with_array(Linear(hidden_width, token_vector_width)), + list2array(), + ) + tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( - nO=hidden_width, + nO=hidden_width if use_upper else nO, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), - nP=maxout_pieces, + nP=maxout_pieces ) - lower.set_dim("nP", maxout_pieces) - with use_ops("numpy"): - # Initialize weights at zero, as it's a classification layer. - upper = Linear(nO=nO, init_W=zero_init) - model = ParserModel(tok2vec, lower, upper) - return model + if use_upper: + with use_ops("numpy"): + # Initialize weights at zero, as it's a classification layer. + upper = Linear(nO=nO, init_W=zero_init) + else: + upper = None + return TransitionModel(tok2vec, lower, upper) diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py new file mode 100644 index 000000000..01661f55b --- /dev/null +++ b/spacy/ml/models/simple_ner.py @@ -0,0 +1,82 @@ +import functools +from typing import List, Tuple, Dict, Optional +from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list +from thinc.api import chain, list2padded, configure_normal_init +from thinc.api import Dropout +from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d + +from ...tokens import Doc +from .._biluo import BILUO +from .._iob import IOB +from ...util import registry + + +@registry.architectures.register("spacy.BiluoTagger.v1") +def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: + biluo = BILUO() + linear = Linear( + nO=None, + nI=tok2vec.get_dim("nO"), + init_W=configure_normal_init(mean=0.02) + ) + model = chain( + tok2vec, + list2padded(), + with_array(chain(Dropout(0.1), linear)), + biluo, + with_array(softmax_activation()), + padded2list() + ) + + return 
Model( + "biluo-tagger", + forward, + init=init, + layers=[model, linear], + refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, + dims={"nO": None}, + attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + ) + +@registry.architectures.register("spacy.IOBTagger.v1") +def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: + biluo = IOB() + linear = Linear(nO=None, nI=tok2vec.get_dim("nO")) + model = chain( + tok2vec, + list2padded(), + with_array(linear), + biluo, + with_array(softmax_activation()), + padded2list() + ) + + return Model( + "iob-tagger", + forward, + init=init, + layers=[model], + refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, + dims={"nO": None}, + attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + ) + + + +def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: + if model.get_dim("nO") is None and Y: + model.set_dim("nO", Y[0].shape[1]) + nO = model.get_dim("nO") + biluo = model.get_ref("biluo") + linear = model.get_ref("linear") + biluo.set_dim("nO", nO) + if linear.has_dim("nO") is None: + linear.set_dim("nO", nO) + model.layers[0].initialize(X=X, Y=Y) + + +def forward(model: Model, X: List[Doc], is_train: bool): + return model.layers[0](X, is_train) + + +__all__ = ["BiluoTagger"] diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index baca325bd..683c8b518 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -1,4 +1,5 @@ -from thinc.api import zero_init, with_array, Softmax, chain, Model +from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout +from thinc.api import glorot_uniform_init from ...util import registry @@ -11,6 +12,6 @@ def build_tagger_model(tok2vec, nO=None) -> Model: softmax = with_array(output_layer) model = chain(tok2vec, softmax) model.set_ref("tok2vec", tok2vec) - model.set_ref("softmax", softmax) + model.set_ref("softmax", output_layer) model.set_ref("output_layer", 
output_layer) return model diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py new file mode 100644 index 000000000..e4301a644 --- /dev/null +++ b/spacy/ml/tb_framework.py @@ -0,0 +1,86 @@ +from thinc.api import Model, noop, use_ops, Linear +from ..syntax._parser_model import ParserStepModel + + +def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): + """Set up a stepwise transition-based model""" + if upper is None: + has_upper = False + upper = noop() + else: + has_upper = True + # don't define nO for this object, because we can't dynamically change it + return Model( + name="parser_model", + forward=forward, + dims={"nI": tok2vec.get_dim("nI") if tok2vec.has_dim("nI") else None}, + layers=[tok2vec, lower, upper], + refs={"tok2vec": tok2vec, "lower": lower, "upper": upper}, + init=init, + attrs={ + "has_upper": has_upper, + "unseen_classes": set(unseen_classes), + "resize_output": resize_output + } + ) + + +def forward(model, X, is_train): + step_model = ParserStepModel( + X, + model.layers, + unseen_classes=model.attrs["unseen_classes"], + train=is_train, + has_upper=model.attrs["has_upper"] + ) + + return step_model, step_model.finish_steps + + +def init(model, X=None, Y=None): + tok2vec = model.get_ref("tok2vec").initialize() + lower = model.get_ref("lower").initialize(X=X) + if model.attrs["has_upper"]: + statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) + model.get_ref("upper").initialize(X=statevecs) + + +def resize_output(model, new_nO): + tok2vec = model.get_ref("tok2vec") + lower = model.get_ref("lower") + upper = model.get_ref("upper") + if not model.attrs["has_upper"]: + if lower.has_dim("nO") is None: + lower.set_dim("nO", new_nO) + return + elif upper.has_dim("nO") is None: + upper.set_dim("nO", new_nO) + return + elif new_nO == upper.get_dim("nO"): + return + smaller = upper + nI = None + if smaller.has_dim("nI"): + nI = smaller.get_dim("nI") + with use_ops('numpy'): + larger = Linear(nO=new_nO, nI=nI) + larger.init = 
smaller.init + # it could be that the model is not initialized yet, then skip this bit + if nI: + larger_W = larger.ops.alloc2f(new_nO, nI) + larger_b = larger.ops.alloc1f(new_nO) + smaller_W = smaller.get_param("W") + smaller_b = smaller.get_param("b") + # Weights are stored in (nr_out, nr_in) format, so we're basically + # just adding rows here. + if smaller.has_dim("nO"): + larger_W[:smaller.get_dim("nO")] = smaller_W + larger_b[:smaller.get_dim("nO")] = smaller_b + for i in range(smaller.get_dim("nO"), new_nO): + model.attrs["unseen_classes"].add(i) + + larger.set_param("W", larger_W) + larger.set_param("b", larger_b) + model._layers[-1] = larger + model.set_ref("upper", larger) + return model diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 6a90de81c..b2866bad2 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,6 +1,7 @@ from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer from .pipes import SentenceRecognizer +from .simple_ner import SimpleNER from .morphologizer import Morphologizer from .entityruler import EntityRuler from .tok2vec import Tok2Vec @@ -22,6 +23,7 @@ __all__ = [ "SentenceSegmenter", "SentenceRecognizer", "SimilarityHook", + "SimpleNER", "merge_entities", "merge_noun_chunks", "merge_subtokens", diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 1a0812442..61db11baa 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,7 +3,7 @@ import numpy import srsly import random from thinc.api import CosineDistance, to_categorical, get_array_module -from thinc.api import set_dropout_rate +from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy import warnings from ..tokens.doc cimport Doc @@ -464,6 +464,9 @@ class Tagger(Pipe): return set_dropout_rate(self.model, drop) tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) + for sc in 
tag_scores: + if self.model.ops.xp.isnan(sc.sum()): + raise ValueError("nan value in scores") loss, d_tag_scores = self.get_loss(examples, tag_scores) bp_tag_scores(d_tag_scores) if sgd not in (None, False): @@ -497,29 +500,11 @@ class Tagger(Pipe): losses[self.name] += (gradient**2).sum() def get_loss(self, examples, scores): - scores = self.model.ops.flatten(scores) - tag_index = {tag: i for i, tag in enumerate(self.labels)} - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - known_labels = numpy.ones((scores.shape[0], 1), dtype="f") - for ex in examples: - gold = ex.gold - for tag in gold.tags: - if tag is None: - correct[idx] = guesses[idx] - elif tag in tag_index: - correct[idx] = tag_index[tag] - else: - correct[idx] = 0 - known_labels[idx] = 0. - idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - d_scores *= self.model.ops.asarray(known_labels) - loss = (d_scores**2).sum() - docs = [ex.doc for ex in examples] - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + loss_func = SequenceCategoricalCrossentropy(names=self.labels) + truths = [eg.gold.tags for eg in examples] + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") return float(loss), d_scores def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py new file mode 100644 index 000000000..8d53152d8 --- /dev/null +++ b/spacy/pipeline/simple_ner.py @@ -0,0 +1,149 @@ +from typing import List +from thinc.types import Floats2d +from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate +from thinc.util import to_numpy +from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob +from ..tokens import Doc +from ..language import component +from 
..util import link_vectors_to_models +from .pipes import Pipe + + +@component("simple_ner", assigns=["doc.ents"]) +class SimpleNER(Pipe): + """Named entity recognition with a tagging model. The model should include + validity constraints to ensure that only valid tag sequences are returned.""" + + def __init__(self, vocab, model): + self.vocab = vocab + self.model = model + self.cfg = {"labels": []} + self.loss_func = SequenceCategoricalCrossentropy( + names=self.get_tag_names(), + normalize=True, + missing_value=None + ) + assert self.model is not None + + @property + def labels(self): + return self.cfg["labels"] + + @property + def is_biluo(self): + return self.model.name.startswith("biluo") + + def add_label(self, label): + if label not in self.cfg["labels"]: + self.cfg["labels"].append(label) + + def get_tag_names(self): + if self.is_biluo: + return ( + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + [f"L-{label}" for label in self.labels] + + [f"U-{label}" for label in self.labels] + + ["O"] + ) + else: + return ( + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + ["O"] + ) + + def predict(self, docs: List[Doc]) -> List[Floats2d]: + scores = self.model.predict(docs) + return scores + + def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None): + """Set entities on a batch of documents from a batch of scores.""" + tag_names = self.get_tag_names() + for i, doc in enumerate(docs): + actions = to_numpy(scores[i].argmax(axis=1)) + tags = [tag_names[actions[j]] for j in range(len(doc))] + if not self.is_biluo: + tags = iob_to_biluo(tags) + doc.ents = spans_from_biluo_tags(doc, tags) + + def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): + if not any(_has_ner(eg) for eg in examples): + return 0 + examples = Example.to_example_objects(examples) + docs = [ex.doc for ex in examples] + set_dropout_rate(self.model, drop) + scores, 
bp_scores = self.model.begin_update(docs) + loss, d_scores = self.get_loss(examples, scores) + bp_scores(d_scores) + if set_annotations: + self.set_annotations(docs, scores) + if sgd is not None: + self.model.finish_update(sgd) + if losses is not None: + losses.setdefault("ner", 0.0) + losses["ner"] += loss + return loss + + def get_loss(self, examples, scores): + loss = 0 + d_scores = [] + truths = [] + for eg in examples: + gold_tags = [(tag if tag != "-" else None) for tag in eg.gold.ner] + if not self.is_biluo: + gold_tags = biluo_to_iob(gold_tags) + truths.append(gold_tags) + for i in range(len(scores)): + if len(scores[i]) != len(truths[i]): + raise ValueError( + f"Mismatched output and gold sizes.\n" + f"Output: {len(scores[i])}, gold: {len(truths[i])}." + f"Input: {len(examples[i].doc)}" + ) + d_scores, loss = self.loss_func(scores, truths) + return loss, d_scores + + def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): + self.cfg.update(kwargs) + if not hasattr(get_examples, '__call__'): + gold_tuples = get_examples + get_examples = lambda: gold_tuples + labels = _get_labels(get_examples()) + for label in _get_labels(get_examples()): + self.add_label(label) + labels = self.labels + n_actions = self.model.attrs["get_num_actions"](len(labels)) + self.model.set_dim("nO", n_actions) + self.model.initialize() + if pipeline is not None: + self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) + link_vectors_to_models(self.vocab) + self.loss_func = SequenceCategoricalCrossentropy( + names=self.get_tag_names(), + normalize=True, + missing_value=None + ) + + return sgd + + def init_multitask_objectives(self, *args, **kwargs): + pass + + +def _has_ner(eg): + for ner_tag in eg.gold.ner: + if ner_tag != "-" and ner_tag != None: + return True + else: + return False + + +def _get_labels(examples): + labels = set() + for eg in examples: + for ner_tag in eg.token_annotation.entities: + if ner_tag != 'O' and ner_tag != '-': + _, 
label = ner_tag.split('-', 1) + labels.add(label) + return list(sorted(labels)) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 4f4e5e4b0..69f5bd6f6 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -12,7 +12,7 @@ cimport blis.cy import numpy import numpy.random -from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops +from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop from ..typedefs cimport weight_t, class_t, hash_t from ..tokens.doc cimport Doc @@ -219,112 +219,27 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no return best -class ParserModel(Model): - def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None): - # don't define nO for this object, because we can't dynamically change it - Model.__init__(self, name="parser_model", forward=forward, dims={"nI": None}) - if tok2vec.has_dim("nI"): - self.set_dim("nI", tok2vec.get_dim("nI")) - self._layers = [tok2vec, lower_model] - if upper_model is not None: - self._layers.append(upper_model) - self.unseen_classes = set() - if unseen_classes: - for class_ in unseen_classes: - self.unseen_classes.add(class_) - self.set_ref("tok2vec", tok2vec) - - def predict(self, docs): - step_model = ParserStepModel(docs, self._layers, - unseen_classes=self.unseen_classes, train=False) - return step_model - - def resize_output(self, new_nO): - if len(self._layers) == 2: - return - if self.upper.has_dim("nO") and (new_nO == self.upper.get_dim("nO")): - return - smaller = self.upper - nI = None - if smaller.has_dim("nI"): - nI = smaller.get_dim("nI") - with use_ops('numpy'): - larger = Linear(nO=new_nO, nI=nI) - larger.init = smaller.init - # it could be that the model is not initialized yet, then skip this bit - if nI: - larger_W = larger.ops.alloc2f(new_nO, nI) - larger_b = larger.ops.alloc1f(new_nO) - smaller_W = smaller.get_param("W") - smaller_b = smaller.get_param("b") - # Weights are 
stored in (nr_out, nr_in) format, so we're basically - # just adding rows here. - if smaller.has_dim("nO"): - larger_W[:smaller.get_dim("nO")] = smaller_W - larger_b[:smaller.get_dim("nO")] = smaller_b - for i in range(smaller.get_dim("nO"), new_nO): - self.unseen_classes.add(i) - - larger.set_param("W", larger_W) - larger.set_param("b", larger_b) - self._layers[-1] = larger - - def initialize(self, X=None, Y=None): - self.tok2vec.initialize() - self.lower.initialize(X=X, Y=Y) - if self.upper is not None: - # In case we need to trigger the callbacks - statevecs = self.ops.alloc((2, self.lower.get_dim("nO"))) - self.upper.initialize(X=statevecs) - - def finish_update(self, optimizer): - self.tok2vec.finish_update(optimizer) - self.lower.finish_update(optimizer) - if self.upper is not None: - self.upper.finish_update(optimizer) - - @property - def tok2vec(self): - return self._layers[0] - - @property - def lower(self): - return self._layers[1] - - @property - def upper(self): - return self._layers[2] - - -def forward(model:ParserModel, X, is_train): - step_model = ParserStepModel(X, model._layers, unseen_classes=model.unseen_classes, - train=is_train) - - return step_model, step_model.finish_steps - class ParserStepModel(Model): - def __init__(self, docs, layers, unseen_classes=None, train=True): + def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True): Model.__init__(self, name="parser_step_model", forward=step_forward) + self.attrs["has_upper"] = has_upper self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train) if layers[1].get_dim("nP") >= 2: activation = "maxout" - elif len(layers) == 2: + elif has_upper: activation = None else: activation = "relu" self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1], activation=activation, train=train) - if len(layers) == 3: + if has_upper: self.vec2scores = layers[-1] else: self.vec2scores = None self.cuda_stream = util.get_cuda_stream(non_blocking=True) self.backprops = [] - 
if self.vec2scores is None: - self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f') - else: - self._class_mask = numpy.zeros((self.vec2scores.get_dim("nO"),), dtype='f') + self._class_mask = numpy.zeros((self.nO,), dtype='f') self._class_mask.fill(1) if unseen_classes is not None: for class_ in unseen_classes: @@ -332,7 +247,10 @@ class ParserStepModel(Model): @property def nO(self): - return self.state2vec.nO + if self.attrs["has_upper"]: + return self.vec2scores.get_dim("nO") + else: + return self.state2vec.get_dim("nO") def class_is_unseen(self, class_): return self._class_mask[class_] @@ -378,7 +296,7 @@ class ParserStepModel(Model): def step_forward(model: ParserStepModel, states, is_train): token_ids = model.get_token_ids(states) vector, get_d_tokvecs = model.state2vec(token_ids, is_train) - if model.vec2scores is not None: + if model.attrs["has_upper"]: scores, get_d_vector = model.vec2scores(vector, is_train) else: scores = NumpyOps().asarray(vector) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 01d6d5bfe..31aa4d413 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -36,7 +36,6 @@ from ..util import link_vectors_to_models, create_default_optimizer, registry from ..compat import copy_array from ..errors import Errors, Warnings from .. import util -from ._parser_model import ParserModel from . import _beam_utils from . 
import nonproj @@ -69,7 +68,8 @@ cdef class Parser: cfg.setdefault('beam_width', 1) cfg.setdefault('beam_update_prob', 1.0) # or 0.5 (both defaults were previously used) self.model = model - self.set_output(self.moves.n_moves) + if self.moves.n_moves != 0: + self.set_output(self.moves.n_moves) self.cfg = cfg self._multitasks = [] self._rehearsal_model = None @@ -105,7 +105,7 @@ cdef class Parser: @property def tok2vec(self): '''Return the embedding and convolutional layer of the model.''' - return self.model.tok2vec + return self.model.get_ref("tok2vec") @property def postprocesses(self): @@ -122,9 +122,11 @@ cdef class Parser: self._resize() def _resize(self): - self.model.resize_output(self.moves.n_moves) + self.model.attrs["resize_output"](self.model, self.moves.n_moves) if self._rehearsal_model not in (True, False, None): - self._rehearsal_model.resize_output(self.moves.n_moves) + self._rehearsal_model.attrs["resize_output"]( + self._rehearsal_model, self.moves.n_moves + ) def add_multitask_objective(self, target): # Defined in subclasses, to avoid circular import @@ -216,7 +218,6 @@ cdef class Parser: # expand our model output. self._resize() model = self.model.predict(docs) - W_param = model.vec2scores.get_param("W") weights = get_c_weights(model) for state in batch: if not state.is_final(): @@ -237,7 +238,7 @@ cdef class Parser: # if labels are missing. We therefore have to check whether we need to # expand our model output. self._resize() - cdef int nr_feature = self.model.lower.get_dim("nF") + cdef int nr_feature = self.model.get_ref("lower").get_dim("nF") model = self.model.predict(docs) token_ids = numpy.zeros((len(docs) * beam_width, nr_feature), dtype='i', order='C') @@ -370,13 +371,16 @@ cdef class Parser: beam_density=self.cfg.get('beam_density', 0.001)) set_dropout_rate(self.model, drop) - # Chop sequences into lengths of this many transitions, to make the - # batch uniform length. 
- cut_gold = numpy.random.choice(range(20, 100)) - states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold) + cut_gold = True + if cut_gold: + # Chop sequences into lengths of this many transitions, to make the + # batch uniform length. + cut_gold = numpy.random.choice(range(20, 100)) + states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold) + else: + states, golds, max_steps = self._init_gold_batch_no_cut(examples) states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final() and g is not None] - # Prepare the stepwise model, and get the callback for finishing the batch model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples]) all_states = list(states) @@ -456,9 +460,17 @@ cdef class Parser: set_dropout_rate(self.model, drop) model, backprop_tok2vec = self.model.begin_update(docs) states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, self.model.lower.get_dim("nF"), 10000, states, golds, - model.state2vec, model.vec2scores, width, losses=losses, - beam_density=beam_density) + self.moves, + self.model.get_ref("lower").get_dim("nF"), + 10000, + states, + golds, + model.state2vec, + model.vec2scores, + width, + losses=losses, + beam_density=beam_density + ) for i, d_scores in enumerate(states_d_scores): losses[self.name] += (d_scores**2).mean() ids, bp_vectors, bp_scores = backprops[i] @@ -497,6 +509,24 @@ cdef class Parser: queue.extend(node._layers) return gradients + def _init_gold_batch_no_cut(self, whole_examples): + states = self.moves.init_batch([eg.doc for eg in whole_examples]) + good_docs = [] + good_golds = [] + good_states = [] + for i, eg in enumerate(whole_examples): + doc = eg.doc + gold = self.moves.preprocess_gold(eg.gold) + if gold is not None and self.moves.has_gold(gold): + good_docs.append(doc) + good_golds.append(gold) + good_states.append(states[i]) + n_moves = [] + for doc, gold in zip(good_docs, good_golds): + oracle_actions = 
self.moves.get_oracle_sequence(doc, gold) + n_moves.append(len(oracle_actions)) + return good_states, good_golds, max(n_moves, default=0) * 2 + def _init_gold_batch(self, whole_examples, min_length=5, max_length=500): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. Let's say we have a doc of length 2*N, @@ -550,16 +580,19 @@ cdef class Parser: cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves), dtype='f', order='C') c_d_scores = d_scores.data + unseen_classes = self.model.attrs["unseen_classes"] for i, (state, gold) in enumerate(zip(states, golds)): memset(is_valid, 0, self.moves.n_moves * sizeof(int)) memset(costs, 0, self.moves.n_moves * sizeof(float)) self.moves.set_costs(is_valid, costs, state, gold) for j in range(self.moves.n_moves): - if costs[j] <= 0.0 and j in self.model.unseen_classes: - self.model.unseen_classes.remove(j) + if costs[j] <= 0.0 and j in unseen_classes: + unseen_classes.remove(j) cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]) c_d_scores += d_scores.shape[1] + if len(states): + d_scores /= len(states) if losses is not None: losses.setdefault(self.name, 0.) 
losses[self.name] += (d_scores**2).sum() @@ -569,8 +602,7 @@ cdef class Parser: return create_default_optimizer() def set_output(self, nO): - if self.model.upper.has_dim("nO") is None: - self.model.upper.set_dim("nO", nO) + self.model.attrs["resize_output"](self.model, nO) def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): self.cfg.update(kwargs) @@ -597,7 +629,6 @@ cdef class Parser: for doc, gold in parses: doc_sample.append(doc) gold_sample.append(gold) - self.model.initialize(doc_sample, gold_sample) if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 647c9720c..39682ba3d 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -65,7 +65,7 @@ def test_add_label_deserializes_correctly(): ner2 = EntityRecognizer(Vocab(), default_ner()) # the second model needs to be resized before we can call from_bytes - ner2.model.resize_output(ner1.moves.n_moves) + ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves) ner2.from_bytes(ner1.to_bytes()) assert ner1.moves.n_moves == ner2.moves.n_moves for i in range(ner1.moves.n_moves): diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 984af4d6b..c985cf87a 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -3,9 +3,9 @@ from spacy.ml.models.defaults import default_parser, default_tok2vec from spacy.vocab import Vocab from spacy.syntax.arc_eager import ArcEager from spacy.syntax.nn_parser import Parser -from spacy.syntax._parser_model import ParserModel from spacy.tokens.doc import Doc from spacy.gold import GoldParse +from thinc.api import Model @pytest.fixture @@ -34,7 +34,7 @@ def parser(vocab, arc_eager): @pytest.fixture def model(arc_eager, tok2vec, vocab): model = default_parser() - 
model.resize_output(arc_eager.n_moves) + model.attrs["resize_output"](model, arc_eager.n_moves) model.initialize() return model @@ -50,7 +50,7 @@ def gold(doc): def test_can_init_nn_parser(parser): - assert isinstance(parser.model, ParserModel) + assert isinstance(parser.model, Model) def test_build_model(parser, vocab): diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py new file mode 100644 index 000000000..9d4acf2fd --- /dev/null +++ b/spacy/tests/pipeline/test_simple_ner.py @@ -0,0 +1,417 @@ +import pytest +from collections import namedtuple + +from thinc.api import NumpyOps +from spacy.ml._biluo import BILUO, _get_transition_table +from spacy.pipeline.simple_ner import SimpleNER +import spacy + + +@pytest.fixture(params=[ + ["PER", "ORG", "LOC", "MISC"], + ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"] +]) +def labels(request): + return request.param + +@pytest.fixture +def ops(): + return NumpyOps() + +def _get_actions(labels): + action_names = ( + [f"B{label}" for label in labels] + \ + [f"I{label}" for label in labels] + \ + [f"L{label}" for label in labels] + \ + [f"U{label}" for label in labels] + \ + ["O"] + ) + A = namedtuple("actions", action_names) + return A(**{name: i for i, name in enumerate(action_names)}) + + +def test_init_biluo_layer(labels): + model = BILUO() + model.set_dim("nO", model.attrs["get_num_actions"](len(labels))) + model.initialize() + assert model.get_dim("nO") == len(labels) * 4 + 1 + + +def test_transition_table(ops): + labels = ["per", "loc", "org"] + table = _get_transition_table(len(labels)) + a = _get_actions(labels) + assert table.shape == (2, len(a), len(a)) + # Not last token, prev action was B + assert table[0, a.Bper, a.Bper] == 0 + assert table[0, a.Bper, a.Bloc] == 0 + assert table[0, a.Bper, a.Borg] == 0 + assert table[0, a.Bper, a.Iper] == 1 + assert table[0, a.Bper, a.Iloc] == 0 + assert table[0, a.Bper, a.Iorg] == 0 + assert table[0, a.Bper, a.Lper] == 1 + assert 
table[0, a.Bper, a.Lloc] == 0 + assert table[0, a.Bper, a.Lorg] == 0 + assert table[0, a.Bper, a.Uper] == 0 + assert table[0, a.Bper, a.Uloc] == 0 + assert table[0, a.Bper, a.Uorg] == 0 + assert table[0, a.Bper, a.O] == 0 + + assert table[0, a.Bloc, a.Bper] == 0 + assert table[0, a.Bloc, a.Bloc] == 0 + assert table[0, a.Bloc, a.Borg] == 0 + assert table[0, a.Bloc, a.Iper] == 0 + assert table[0, a.Bloc, a.Iloc] == 1 + assert table[0, a.Bloc, a.Iorg] == 0 + assert table[0, a.Bloc, a.Lper] == 0 + assert table[0, a.Bloc, a.Lloc] == 1 + assert table[0, a.Bloc, a.Lorg] == 0 + assert table[0, a.Bloc, a.Uper] == 0 + assert table[0, a.Bloc, a.Uloc] == 0 + assert table[0, a.Bloc, a.Uorg] == 0 + assert table[0, a.Bloc, a.O] == 0 + + assert table[0, a.Borg, a.Bper] == 0 + assert table[0, a.Borg, a.Bloc] == 0 + assert table[0, a.Borg, a.Borg] == 0 + assert table[0, a.Borg, a.Iper] == 0 + assert table[0, a.Borg, a.Iloc] == 0 + assert table[0, a.Borg, a.Iorg] == 1 + assert table[0, a.Borg, a.Lper] == 0 + assert table[0, a.Borg, a.Lloc] == 0 + assert table[0, a.Borg, a.Lorg] == 1 + assert table[0, a.Borg, a.Uper] == 0 + assert table[0, a.Borg, a.Uloc] == 0 + assert table[0, a.Borg, a.Uorg] == 0 + assert table[0, a.Borg, a.O] == 0 + + # Not last token, prev action was I + assert table[0, a.Iper, a.Bper] == 0 + assert table[0, a.Iper, a.Bloc] == 0 + assert table[0, a.Iper, a.Borg] == 0 + assert table[0, a.Iper, a.Iper] == 1 + assert table[0, a.Iper, a.Iloc] == 0 + assert table[0, a.Iper, a.Iorg] == 0 + assert table[0, a.Iper, a.Lper] == 1 + assert table[0, a.Iper, a.Lloc] == 0 + assert table[0, a.Iper, a.Lorg] == 0 + assert table[0, a.Iper, a.Uper] == 0 + assert table[0, a.Iper, a.Uloc] == 0 + assert table[0, a.Iper, a.Uorg] == 0 + assert table[0, a.Iper, a.O] == 0 + + assert table[0, a.Iloc, a.Bper] == 0 + assert table[0, a.Iloc, a.Bloc] == 0 + assert table[0, a.Iloc, a.Borg] == 0 + assert table[0, a.Iloc, a.Iper] == 0 + assert table[0, a.Iloc, a.Iloc] == 1 + assert table[0, 
a.Iloc, a.Iorg] == 0 + assert table[0, a.Iloc, a.Lper] == 0 + assert table[0, a.Iloc, a.Lloc] == 1 + assert table[0, a.Iloc, a.Lorg] == 0 + assert table[0, a.Iloc, a.Uper] == 0 + assert table[0, a.Iloc, a.Uloc] == 0 + assert table[0, a.Iloc, a.Uorg] == 0 + assert table[0, a.Iloc, a.O] == 0 + + assert table[0, a.Iorg, a.Bper] == 0 + assert table[0, a.Iorg, a.Bloc] == 0 + assert table[0, a.Iorg, a.Borg] == 0 + assert table[0, a.Iorg, a.Iper] == 0 + assert table[0, a.Iorg, a.Iloc] == 0 + assert table[0, a.Iorg, a.Iorg] == 1 + assert table[0, a.Iorg, a.Lper] == 0 + assert table[0, a.Iorg, a.Lloc] == 0 + assert table[0, a.Iorg, a.Lorg] == 1 + assert table[0, a.Iorg, a.Uper] == 0 + assert table[0, a.Iorg, a.Uloc] == 0 + assert table[0, a.Iorg, a.Uorg] == 0 + assert table[0, a.Iorg, a.O] == 0 + + # Not last token, prev action was L + assert table[0, a.Lper, a.Bper] == 1 + assert table[0, a.Lper, a.Bloc] == 1 + assert table[0, a.Lper, a.Borg] == 1 + assert table[0, a.Lper, a.Iper] == 0 + assert table[0, a.Lper, a.Iloc] == 0 + assert table[0, a.Lper, a.Iorg] == 0 + assert table[0, a.Lper, a.Lper] == 0 + assert table[0, a.Lper, a.Lloc] == 0 + assert table[0, a.Lper, a.Lorg] == 0 + assert table[0, a.Lper, a.Uper] == 1 + assert table[0, a.Lper, a.Uloc] == 1 + assert table[0, a.Lper, a.Uorg] == 1 + assert table[0, a.Lper, a.O] == 1 + + assert table[0, a.Lloc, a.Bper] == 1 + assert table[0, a.Lloc, a.Bloc] == 1 + assert table[0, a.Lloc, a.Borg] == 1 + assert table[0, a.Lloc, a.Iper] == 0 + assert table[0, a.Lloc, a.Iloc] == 0 + assert table[0, a.Lloc, a.Iorg] == 0 + assert table[0, a.Lloc, a.Lper] == 0 + assert table[0, a.Lloc, a.Lloc] == 0 + assert table[0, a.Lloc, a.Lorg] == 0 + assert table[0, a.Lloc, a.Uper] == 1 + assert table[0, a.Lloc, a.Uloc] == 1 + assert table[0, a.Lloc, a.Uorg] == 1 + assert table[0, a.Lloc, a.O] == 1 + + assert table[0, a.Lorg, a.Bper] == 1 + assert table[0, a.Lorg, a.Bloc] == 1 + assert table[0, a.Lorg, a.Borg] == 1 + assert table[0, a.Lorg, a.Iper] 
== 0 + assert table[0, a.Lorg, a.Iloc] == 0 + assert table[0, a.Lorg, a.Iorg] == 0 + assert table[0, a.Lorg, a.Lper] == 0 + assert table[0, a.Lorg, a.Lloc] == 0 + assert table[0, a.Lorg, a.Lorg] == 0 + assert table[0, a.Lorg, a.Uper] == 1 + assert table[0, a.Lorg, a.Uloc] == 1 + assert table[0, a.Lorg, a.Uorg] == 1 + assert table[0, a.Lorg, a.O] == 1 + + # Not last token, prev action was U + assert table[0, a.Uper, a.Bper] == 1 + assert table[0, a.Uper, a.Bloc] == 1 + assert table[0, a.Uper, a.Borg] == 1 + assert table[0, a.Uper, a.Iper] == 0 + assert table[0, a.Uper, a.Iloc] == 0 + assert table[0, a.Uper, a.Iorg] == 0 + assert table[0, a.Uper, a.Lper] == 0 + assert table[0, a.Uper, a.Lloc] == 0 + assert table[0, a.Uper, a.Lorg] == 0 + assert table[0, a.Uper, a.Uper] == 1 + assert table[0, a.Uper, a.Uloc] == 1 + assert table[0, a.Uper, a.Uorg] == 1 + assert table[0, a.Uper, a.O] == 1 + + assert table[0, a.Uloc, a.Bper] == 1 + assert table[0, a.Uloc, a.Bloc] == 1 + assert table[0, a.Uloc, a.Borg] == 1 + assert table[0, a.Uloc, a.Iper] == 0 + assert table[0, a.Uloc, a.Iloc] == 0 + assert table[0, a.Uloc, a.Iorg] == 0 + assert table[0, a.Uloc, a.Lper] == 0 + assert table[0, a.Uloc, a.Lloc] == 0 + assert table[0, a.Uloc, a.Lorg] == 0 + assert table[0, a.Uloc, a.Uper] == 1 + assert table[0, a.Uloc, a.Uloc] == 1 + assert table[0, a.Uloc, a.Uorg] == 1 + assert table[0, a.Uloc, a.O] == 1 + + assert table[0, a.Uorg, a.Bper] == 1 + assert table[0, a.Uorg, a.Bloc] == 1 + assert table[0, a.Uorg, a.Borg] == 1 + assert table[0, a.Uorg, a.Iper] == 0 + assert table[0, a.Uorg, a.Iloc] == 0 + assert table[0, a.Uorg, a.Iorg] == 0 + assert table[0, a.Uorg, a.Lper] == 0 + assert table[0, a.Uorg, a.Lloc] == 0 + assert table[0, a.Uorg, a.Lorg] == 0 + assert table[0, a.Uorg, a.Uper] == 1 + assert table[0, a.Uorg, a.Uloc] == 1 + assert table[0, a.Uorg, a.Uorg] == 1 + assert table[0, a.Uorg, a.O] == 1 + + # Not last token, prev action was O + assert table[0, a.O, a.Bper] == 1 + assert 
table[0, a.O, a.Bloc] == 1 + assert table[0, a.O, a.Borg] == 1 + assert table[0, a.O, a.Iper] == 0 + assert table[0, a.O, a.Iloc] == 0 + assert table[0, a.O, a.Iorg] == 0 + assert table[0, a.O, a.Lper] == 0 + assert table[0, a.O, a.Lloc] == 0 + assert table[0, a.O, a.Lorg] == 0 + assert table[0, a.O, a.Uper] == 1 + assert table[0, a.O, a.Uloc] == 1 + assert table[0, a.O, a.Uorg] == 1 + assert table[0, a.O, a.O] == 1 + + # Last token, prev action was B + assert table[1, a.Bper, a.Bper] == 0 + assert table[1, a.Bper, a.Bloc] == 0 + assert table[1, a.Bper, a.Borg] == 0 + assert table[1, a.Bper, a.Iper] == 0 + assert table[1, a.Bper, a.Iloc] == 0 + assert table[1, a.Bper, a.Iorg] == 0 + assert table[1, a.Bper, a.Lper] == 1 + assert table[1, a.Bper, a.Lloc] == 0 + assert table[1, a.Bper, a.Lorg] == 0 + assert table[1, a.Bper, a.Uper] == 0 + assert table[1, a.Bper, a.Uloc] == 0 + assert table[1, a.Bper, a.Uorg] == 0 + assert table[1, a.Bper, a.O] == 0 + + assert table[1, a.Bloc, a.Bper] == 0 + assert table[1, a.Bloc, a.Bloc] == 0 + assert table[0, a.Bloc, a.Borg] == 0 + assert table[1, a.Bloc, a.Iper] == 0 + assert table[1, a.Bloc, a.Iloc] == 0 + assert table[1, a.Bloc, a.Iorg] == 0 + assert table[1, a.Bloc, a.Lper] == 0 + assert table[1, a.Bloc, a.Lloc] == 1 + assert table[1, a.Bloc, a.Lorg] == 0 + assert table[1, a.Bloc, a.Uper] == 0 + assert table[1, a.Bloc, a.Uloc] == 0 + assert table[1, a.Bloc, a.Uorg] == 0 + assert table[1, a.Bloc, a.O] == 0 + + assert table[1, a.Borg, a.Bper] == 0 + assert table[1, a.Borg, a.Bloc] == 0 + assert table[1, a.Borg, a.Borg] == 0 + assert table[1, a.Borg, a.Iper] == 0 + assert table[1, a.Borg, a.Iloc] == 0 + assert table[1, a.Borg, a.Iorg] == 0 + assert table[1, a.Borg, a.Lper] == 0 + assert table[1, a.Borg, a.Lloc] == 0 + assert table[1, a.Borg, a.Lorg] == 1 + assert table[1, a.Borg, a.Uper] == 0 + assert table[1, a.Borg, a.Uloc] == 0 + assert table[1, a.Borg, a.Uorg] == 0 + assert table[1, a.Borg, a.O] == 0 + + # Last token, prev 
action was I + assert table[1, a.Iper, a.Bper] == 0 + assert table[1, a.Iper, a.Bloc] == 0 + assert table[1, a.Iper, a.Borg] == 0 + assert table[1, a.Iper, a.Iper] == 0 + assert table[1, a.Iper, a.Iloc] == 0 + assert table[1, a.Iper, a.Iorg] == 0 + assert table[1, a.Iper, a.Lper] == 1 + assert table[1, a.Iper, a.Lloc] == 0 + assert table[1, a.Iper, a.Lorg] == 0 + assert table[1, a.Iper, a.Uper] == 0 + assert table[1, a.Iper, a.Uloc] == 0 + assert table[1, a.Iper, a.Uorg] == 0 + assert table[1, a.Iper, a.O] == 0 + + assert table[1, a.Iloc, a.Bper] == 0 + assert table[1, a.Iloc, a.Bloc] == 0 + assert table[1, a.Iloc, a.Borg] == 0 + assert table[1, a.Iloc, a.Iper] == 0 + assert table[1, a.Iloc, a.Iloc] == 0 + assert table[1, a.Iloc, a.Iorg] == 0 + assert table[1, a.Iloc, a.Lper] == 0 + assert table[1, a.Iloc, a.Lloc] == 1 + assert table[1, a.Iloc, a.Lorg] == 0 + assert table[1, a.Iloc, a.Uper] == 0 + assert table[1, a.Iloc, a.Uloc] == 0 + assert table[1, a.Iloc, a.Uorg] == 0 + assert table[1, a.Iloc, a.O] == 0 + + assert table[1, a.Iorg, a.Bper] == 0 + assert table[1, a.Iorg, a.Bloc] == 0 + assert table[1, a.Iorg, a.Borg] == 0 + assert table[1, a.Iorg, a.Iper] == 0 + assert table[1, a.Iorg, a.Iloc] == 0 + assert table[1, a.Iorg, a.Iorg] == 0 + assert table[1, a.Iorg, a.Lper] == 0 + assert table[1, a.Iorg, a.Lloc] == 0 + assert table[1, a.Iorg, a.Lorg] == 1 + assert table[1, a.Iorg, a.Uper] == 0 + assert table[1, a.Iorg, a.Uloc] == 0 + assert table[1, a.Iorg, a.Uorg] == 0 + assert table[1, a.Iorg, a.O] == 0 + + # Last token, prev action was L + assert table[1, a.Lper, a.Bper] == 0 + assert table[1, a.Lper, a.Bloc] == 0 + assert table[1, a.Lper, a.Borg] == 0 + assert table[1, a.Lper, a.Iper] == 0 + assert table[1, a.Lper, a.Iloc] == 0 + assert table[1, a.Lper, a.Iorg] == 0 + assert table[1, a.Lper, a.Lper] == 0 + assert table[1, a.Lper, a.Lloc] == 0 + assert table[1, a.Lper, a.Lorg] == 0 + assert table[1, a.Lper, a.Uper] == 1 + assert table[1, a.Lper, a.Uloc] == 1 + 
assert table[1, a.Lper, a.Uorg] == 1 + assert table[1, a.Lper, a.O] == 1 + + assert table[1, a.Lloc, a.Bper] == 0 + assert table[1, a.Lloc, a.Bloc] == 0 + assert table[1, a.Lloc, a.Borg] == 0 + assert table[1, a.Lloc, a.Iper] == 0 + assert table[1, a.Lloc, a.Iloc] == 0 + assert table[1, a.Lloc, a.Iorg] == 0 + assert table[1, a.Lloc, a.Lper] == 0 + assert table[1, a.Lloc, a.Lloc] == 0 + assert table[1, a.Lloc, a.Lorg] == 0 + assert table[1, a.Lloc, a.Uper] == 1 + assert table[1, a.Lloc, a.Uloc] == 1 + assert table[1, a.Lloc, a.Uorg] == 1 + assert table[1, a.Lloc, a.O] == 1 + + assert table[1, a.Lorg, a.Bper] == 0 + assert table[1, a.Lorg, a.Bloc] == 0 + assert table[1, a.Lorg, a.Borg] == 0 + assert table[1, a.Lorg, a.Iper] == 0 + assert table[1, a.Lorg, a.Iloc] == 0 + assert table[1, a.Lorg, a.Iorg] == 0 + assert table[1, a.Lorg, a.Lper] == 0 + assert table[1, a.Lorg, a.Lloc] == 0 + assert table[1, a.Lorg, a.Lorg] == 0 + assert table[1, a.Lorg, a.Uper] == 1 + assert table[1, a.Lorg, a.Uloc] == 1 + assert table[1, a.Lorg, a.Uorg] == 1 + assert table[1, a.Lorg, a.O] == 1 + + # Last token, prev action was U + assert table[1, a.Uper, a.Bper] == 0 + assert table[1, a.Uper, a.Bloc] == 0 + assert table[1, a.Uper, a.Borg] == 0 + assert table[1, a.Uper, a.Iper] == 0 + assert table[1, a.Uper, a.Iloc] == 0 + assert table[1, a.Uper, a.Iorg] == 0 + assert table[1, a.Uper, a.Lper] == 0 + assert table[1, a.Uper, a.Lloc] == 0 + assert table[1, a.Uper, a.Lorg] == 0 + assert table[1, a.Uper, a.Uper] == 1 + assert table[1, a.Uper, a.Uloc] == 1 + assert table[1, a.Uper, a.Uorg] == 1 + assert table[1, a.Uper, a.O] == 1 + + assert table[1, a.Uloc, a.Bper] == 0 + assert table[1, a.Uloc, a.Bloc] == 0 + assert table[1, a.Uloc, a.Borg] == 0 + assert table[1, a.Uloc, a.Iper] == 0 + assert table[1, a.Uloc, a.Iloc] == 0 + assert table[1, a.Uloc, a.Iorg] == 0 + assert table[1, a.Uloc, a.Lper] == 0 + assert table[1, a.Uloc, a.Lloc] == 0 + assert table[1, a.Uloc, a.Lorg] == 0 + assert table[1, 
a.Uloc, a.Uper] == 1 + assert table[1, a.Uloc, a.Uloc] == 1 + assert table[1, a.Uloc, a.Uorg] == 1 + assert table[1, a.Uloc, a.O] == 1 + + assert table[1, a.Uorg, a.Bper] == 0 + assert table[1, a.Uorg, a.Bloc] == 0 + assert table[1, a.Uorg, a.Borg] == 0 + assert table[1, a.Uorg, a.Iper] == 0 + assert table[1, a.Uorg, a.Iloc] == 0 + assert table[1, a.Uorg, a.Iorg] == 0 + assert table[1, a.Uorg, a.Lper] == 0 + assert table[1, a.Uorg, a.Lloc] == 0 + assert table[1, a.Uorg, a.Lorg] == 0 + assert table[1, a.Uorg, a.Uper] == 1 + assert table[1, a.Uorg, a.Uloc] == 1 + assert table[1, a.Uorg, a.Uorg] == 1 + assert table[1, a.Uorg, a.O] == 1 + + # Last token, prev action was O + assert table[1, a.O, a.Bper] == 0 + assert table[1, a.O, a.Bloc] == 0 + assert table[1, a.O, a.Borg] == 0 + assert table[1, a.O, a.Iper] == 0 + assert table[1, a.O, a.Iloc] == 0 + assert table[1, a.O, a.Iorg] == 0 + assert table[1, a.O, a.Lper] == 0 + assert table[1, a.O, a.Lloc] == 0 + assert table[1, a.O, a.Lorg] == 0 + assert table[1, a.O, a.Uper] == 1 + assert table[1, a.O, a.Uloc] == 1 + assert table[1, a.O, a.Uorg] == 1 + assert table[1, a.O, a.O] == 1 diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index d9a3e16b6..67966f70e 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -34,7 +34,8 @@ def test_issue2179(): nlp2.add_pipe(nlp2.create_pipe("ner")) assert len(nlp2.get_pipe("ner").labels) == 0 - nlp2.get_pipe("ner").model.resize_output(nlp.get_pipe("ner").moves.n_moves) + model = nlp2.get_pipe("ner").model + model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves) nlp2.from_bytes(nlp.to_bytes()) assert "extra_labels" not in nlp2.get_pipe("ner").cfg assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index df23efa4f..06ba6c4ac 100644 --- 
a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -104,7 +104,8 @@ def test_issue3209(): assert ner.move_names == move_names nlp2 = English() nlp2.add_pipe(nlp2.create_pipe("ner")) - nlp2.get_pipe("ner").model.resize_output(ner.moves.n_moves) + model = nlp2.get_pipe("ner").model + model.attrs["resize_output"](model, ner.moves.n_moves) nlp2.from_bytes(nlp.to_bytes()) assert nlp2.get_pipe("ner").move_names == move_names diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 298cddc74..ba63adfa4 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -110,10 +110,9 @@ def test_serialize_custom_nlp(): nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model tok2vec = model.get_ref("tok2vec") - upper = model.upper + upper = model.get_ref("upper") # check that we have the correct settings, not the default ones - assert tok2vec.get_dim("nO") == 321 assert upper.get_dim("nI") == 65 @@ -131,8 +130,7 @@ def test_serialize_parser(): nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model tok2vec = model.get_ref("tok2vec") - upper = model.upper + upper = model.get_ref("upper") # check that we have the correct settings, not the default ones assert upper.get_dim("nI") == 66 - assert tok2vec.get_dim("nO") == 333 diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index a3381cb2f..475181c7b 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -63,7 +63,7 @@ def test_to_from_bytes(parser, blank_parser): bytes_data = parser.to_bytes(exclude=["vocab"]) # the blank parser needs to be resized before we can call from_bytes - blank_parser.model.resize_output(parser.moves.n_moves) + blank_parser.model.attrs["resize_output"](blank_parser.model, parser.moves.n_moves) 
blank_parser.from_bytes(bytes_data) assert blank_parser.model is not True assert blank_parser.moves.n_moves == parser.moves.n_moves diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 1200407d7..c320b19c0 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -38,7 +38,7 @@ def test_util_get_package_path(package): def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): - model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP) + model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize() assert model.get_param("W").shape == (nF, nO, nP, nI) tensor = model.ops.alloc((10, nI)) Y, get_dX = model.begin_update(tensor) diff --git a/spacy/util.py b/spacy/util.py index b4ecc8b03..048d923ee 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -571,8 +571,10 @@ def decaying(start, stop, decay): curr -= decay -def minibatch_by_words(examples, size, tuples=True, count_words=len): - """Create minibatches of a given number of words.""" +def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2): + """Create minibatches of roughly a given number of words. 
If any examples + are longer than the specified batch length, they will appear in a batch by + themselves.""" if isinstance(size, int): size_ = itertools.repeat(size) elif isinstance(size, List): @@ -580,18 +582,36 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len): else: size_ = size examples = iter(examples) + oversize = [] while True: batch_size = next(size_) + tol_size = batch_size * 0.2 batch = [] - while batch_size >= 0: + if oversize: + example = oversize.pop(0) + n_words = count_words(example.doc) + batch.append(example) + batch_size -= n_words + while batch_size >= 1: try: example = next(examples) except StopIteration: - if batch: - yield batch - return - batch_size -= count_words(example.doc) - batch.append(example) + if oversize: + examples = iter(oversize) + oversize = [] + if batch: + yield batch + break + else: + if batch: + yield batch + return + n_words = count_words(example.doc) + if n_words < (batch_size + tol_size): + batch_size -= n_words + batch.append(example) + else: + oversize.append(example) if batch: yield batch From 0d94737857d443bbce230605bb98492d063c6e80 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 18 May 2020 22:27:10 +0200 Subject: [PATCH 110/187] Feature toggle_pipes (#5378) * make disable_pipes deprecated in favour of the new toggle_pipes * rewrite disable_pipes statements * update documentation * remove bin/wiki_entity_linking folder * one more fix * remove deprecated link to documentation * few more doc fixes * add note about name change to the docs * restore original disable_pipes * small fixes * fix typo * fix error number to W096 * rename to select_pipes * also make changes to the documentation Co-authored-by: Matthew Honnibal --- bin/wiki_entity_linking/README.md | 37 -- bin/wiki_entity_linking/__init__.py | 12 - .../entity_linker_evaluation.py | 204 ------- bin/wiki_entity_linking/kb_creator.py | 161 ----- bin/wiki_entity_linking/train_descriptions.py | 145 ----- 
bin/wiki_entity_linking/wiki_io.py | 127 ---- bin/wiki_entity_linking/wiki_namespaces.py | 128 ---- .../wikidata_pretrain_kb.py | 179 ------ bin/wiki_entity_linking/wikidata_processor.py | 154 ----- .../wikidata_train_entity_linker.py | 230 ------- .../wikipedia_processor.py | 565 ------------------ examples/training/pretrain_textcat.py | 5 +- examples/training/rehearsal.py | 5 +- .../textcatjsonl_to_trainjson.py | 6 +- examples/training/train_entity_linker.py | 7 +- examples/training/train_intent_parser.py | 4 +- examples/training/train_ner.py | 5 +- examples/training/train_new_entity_type.py | 6 +- examples/training/train_parser.py | 5 +- examples/training/train_textcat.py | 5 +- spacy/cli/train.py | 6 +- spacy/errors.py | 11 +- spacy/language.py | 32 +- spacy/pipeline/entityruler.py | 2 +- spacy/tests/pipeline/test_pipe_methods.py | 55 +- spacy/tests/regression/test_issue3611.py | 2 +- spacy/tests/regression/test_issue4030.py | 2 +- website/docs/api/language.md | 38 +- website/docs/usage/processing-pipelines.md | 18 +- website/docs/usage/rule-based-matching.md | 5 +- website/docs/usage/spacy-101.md | 3 +- website/docs/usage/training.md | 14 +- 32 files changed, 154 insertions(+), 2024 deletions(-) delete mode 100644 bin/wiki_entity_linking/README.md delete mode 100644 bin/wiki_entity_linking/__init__.py delete mode 100644 bin/wiki_entity_linking/entity_linker_evaluation.py delete mode 100644 bin/wiki_entity_linking/kb_creator.py delete mode 100644 bin/wiki_entity_linking/train_descriptions.py delete mode 100644 bin/wiki_entity_linking/wiki_io.py delete mode 100644 bin/wiki_entity_linking/wiki_namespaces.py delete mode 100644 bin/wiki_entity_linking/wikidata_pretrain_kb.py delete mode 100644 bin/wiki_entity_linking/wikidata_processor.py delete mode 100644 bin/wiki_entity_linking/wikidata_train_entity_linker.py delete mode 100644 bin/wiki_entity_linking/wikipedia_processor.py diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md 
deleted file mode 100644 index 4e4af5c21..000000000 --- a/bin/wiki_entity_linking/README.md +++ /dev/null @@ -1,37 +0,0 @@ -## Entity Linking with Wikipedia and Wikidata - -### Step 1: Create a Knowledge Base (KB) and training data - -Run `wikidata_pretrain_kb.py` -* This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file** - * WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/ - * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language) -* You can set the filtering parameters for KB construction: - * `max_per_alias` (`-a`): (max) number of candidate entities in the KB per alias/synonym - * `min_freq` (`-f`): threshold of number of times an entity should occur in the corpus to be included in the KB - * `min_pair` (`-c`): threshold of number of times an entity+alias combination should occur in the corpus to be included in the KB -* Further parameters to set: - * `descriptions_from_wikipedia` (`-wp`): whether to parse descriptions from Wikipedia (`True`) or Wikidata (`False`) - * `entity_vector_length` (`-v`): length of the pre-trained entity description vectors - * `lang` (`-la`): language for which to fetch Wikidata information (as the dump contains all languages) - -Quick testing and rerunning: -* When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything. - * e.g. set `-lt 20000 -lp 2000 -lw 3000 -f 1` -* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed. 
- - -### Step 2: Train an Entity Linking model - -Run `wikidata_train_entity_linker.py` -* This takes the **KB directory** produced by Step 1, and trains an **Entity Linking model** -* Specify the output directory (`-o`) in which the final, trained model will be saved -* You can set the learning parameters for the EL training: - * `epochs` (`-e`): number of training iterations - * `dropout` (`-p`): dropout rate - * `lr` (`-n`): learning rate - * `l2` (`-r`): L2 regularization -* Specify the number of training and dev testing articles with `train_articles` (`-t`) and `dev_articles` (`-d`) respectively - * If not specified, the full dataset will be processed - this may take a LONG time ! -* Further parameters to set: - * `labels_discard` (`-l`): NER label types to discard during training diff --git a/bin/wiki_entity_linking/__init__.py b/bin/wiki_entity_linking/__init__.py deleted file mode 100644 index de486bbcf..000000000 --- a/bin/wiki_entity_linking/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -TRAINING_DATA_FILE = "gold_entities.jsonl" -KB_FILE = "kb" -KB_MODEL_DIR = "nlp_kb" -OUTPUT_MODEL_DIR = "nlp" - -PRIOR_PROB_PATH = "prior_prob.csv" -ENTITY_DEFS_PATH = "entity_defs.csv" -ENTITY_FREQ_PATH = "entity_freq.csv" -ENTITY_ALIAS_PATH = "entity_alias.csv" -ENTITY_DESCR_PATH = "entity_descriptions.csv" - -LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' diff --git a/bin/wiki_entity_linking/entity_linker_evaluation.py b/bin/wiki_entity_linking/entity_linker_evaluation.py deleted file mode 100644 index 2aeffbfc2..000000000 --- a/bin/wiki_entity_linking/entity_linker_evaluation.py +++ /dev/null @@ -1,204 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import logging -import random -from tqdm import tqdm -from collections import defaultdict - -logger = logging.getLogger(__name__) - - -class Metrics(object): - true_pos = 0 - false_pos = 0 - false_neg = 0 - - def update_results(self, true_entity, candidate): - candidate_is_correct = 
true_entity == candidate - - # Assume that we have no labeled negatives in the data (i.e. cases where true_entity is "NIL") - # Therefore, if candidate_is_correct then we have a true positive and never a true negative. - self.true_pos += candidate_is_correct - self.false_neg += not candidate_is_correct - if candidate and candidate not in {"", "NIL"}: - # A wrong prediction (e.g. Q42 != Q3) counts both as a FP as well as a FN. - self.false_pos += not candidate_is_correct - - def calculate_precision(self): - if self.true_pos == 0: - return 0.0 - else: - return self.true_pos / (self.true_pos + self.false_pos) - - def calculate_recall(self): - if self.true_pos == 0: - return 0.0 - else: - return self.true_pos / (self.true_pos + self.false_neg) - - def calculate_fscore(self): - p = self.calculate_precision() - r = self.calculate_recall() - if p + r == 0: - return 0.0 - else: - return 2 * p * r / (p + r) - - -class EvaluationResults(object): - def __init__(self): - self.metrics = Metrics() - self.metrics_by_label = defaultdict(Metrics) - - def update_metrics(self, ent_label, true_entity, candidate): - self.metrics.update_results(true_entity, candidate) - self.metrics_by_label[ent_label].update_results(true_entity, candidate) - - def report_metrics(self, model_name): - model_str = model_name.title() - recall = self.metrics.calculate_recall() - precision = self.metrics.calculate_precision() - fscore = self.metrics.calculate_fscore() - return ( - "{}: ".format(model_str) - + "F-score = {} | ".format(round(fscore, 3)) - + "Recall = {} | ".format(round(recall, 3)) - + "Precision = {} | ".format(round(precision, 3)) - + "F-score by label = {}".format( - {k: v.calculate_fscore() for k, v in sorted(self.metrics_by_label.items())} - ) - ) - - -class BaselineResults(object): - def __init__(self): - self.random = EvaluationResults() - self.prior = EvaluationResults() - self.oracle = EvaluationResults() - - def report_performance(self, model): - results = getattr(self, model) - 
return results.report_metrics(model) - - def update_baselines( - self, - true_entity, - ent_label, - random_candidate, - prior_candidate, - oracle_candidate, - ): - self.oracle.update_metrics(ent_label, true_entity, oracle_candidate) - self.prior.update_metrics(ent_label, true_entity, prior_candidate) - self.random.update_metrics(ent_label, true_entity, random_candidate) - - -def measure_performance(dev_data, kb, el_pipe, baseline=True, context=True, dev_limit=None): - counts = dict() - baseline_results = BaselineResults() - context_results = EvaluationResults() - combo_results = EvaluationResults() - - for doc, gold in tqdm(dev_data, total=dev_limit, leave=False, desc='Processing dev data'): - if len(doc) > 0: - correct_ents = dict() - for entity, kb_dict in gold.links.items(): - start, end = entity - for gold_kb, value in kb_dict.items(): - if value: - # only evaluating on positive examples - offset = _offset(start, end) - correct_ents[offset] = gold_kb - - if baseline: - _add_baseline(baseline_results, counts, doc, correct_ents, kb) - - if context: - # using only context - el_pipe.cfg["incl_context"] = True - el_pipe.cfg["incl_prior"] = False - _add_eval_result(context_results, doc, correct_ents, el_pipe) - - # measuring combined accuracy (prior + context) - el_pipe.cfg["incl_context"] = True - el_pipe.cfg["incl_prior"] = True - _add_eval_result(combo_results, doc, correct_ents, el_pipe) - - if baseline: - logger.info("Counts: {}".format({k: v for k, v in sorted(counts.items())})) - logger.info(baseline_results.report_performance("random")) - logger.info(baseline_results.report_performance("prior")) - logger.info(baseline_results.report_performance("oracle")) - - if context: - logger.info(context_results.report_metrics("context only")) - logger.info(combo_results.report_metrics("context and prior")) - - -def _add_eval_result(results, doc, correct_ents, el_pipe): - """ - Evaluate the ent.kb_id_ annotations against the gold standard. 
- Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL. - """ - try: - doc = el_pipe(doc) - for ent in doc.ents: - ent_label = ent.label_ - start = ent.start_char - end = ent.end_char - offset = _offset(start, end) - gold_entity = correct_ents.get(offset, None) - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - pred_entity = ent.kb_id_ - results.update_metrics(ent_label, gold_entity, pred_entity) - - except Exception as e: - logging.error("Error assessing accuracy " + str(e)) - - -def _add_baseline(baseline_results, counts, doc, correct_ents, kb): - """ - Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound. - Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL. - """ - for ent in doc.ents: - ent_label = ent.label_ - start = ent.start_char - end = ent.end_char - offset = _offset(start, end) - gold_entity = correct_ents.get(offset, None) - - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - candidates = kb.get_candidates(ent.text) - oracle_candidate = "" - prior_candidate = "" - random_candidate = "" - if candidates: - scores = [] - - for c in candidates: - scores.append(c.prior_prob) - if c.entity_ == gold_entity: - oracle_candidate = c.entity_ - - best_index = scores.index(max(scores)) - prior_candidate = candidates[best_index].entity_ - random_candidate = random.choice(candidates).entity_ - - current_count = counts.get(ent_label, 0) - counts[ent_label] = current_count+1 - - baseline_results.update_baselines( - gold_entity, - ent_label, - random_candidate, - prior_candidate, - oracle_candidate, - ) - - -def _offset(start, end): - return "{}_{}".format(start, end) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py deleted file mode 
100644 index 8691308e0..000000000 --- a/bin/wiki_entity_linking/kb_creator.py +++ /dev/null @@ -1,161 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import logging - -from spacy.kb import KnowledgeBase - -from bin.wiki_entity_linking.train_descriptions import EntityEncoder -from bin.wiki_entity_linking import wiki_io as io - - -logger = logging.getLogger(__name__) - - -def create_kb( - nlp, - max_entities_per_alias, - min_entity_freq, - min_occ, - entity_def_path, - entity_descr_path, - entity_alias_path, - entity_freq_path, - prior_prob_path, - entity_vector_length, -): - # Create the knowledge base from Wikidata entries - kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length) - entity_list, filtered_title_to_id = _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length) - _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path) - return kb - - -def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length): - # read the mappings from file - title_to_id = io.read_title_to_id(entity_def_path) - id_to_descr = io.read_id_to_descr(entity_descr_path) - - # check the length of the nlp vectors - if "vectors" in nlp.meta and nlp.vocab.vectors.size: - input_dim = nlp.vocab.vectors_length - logger.info("Loaded pretrained vectors of size %s" % input_dim) - else: - raise ValueError( - "The `nlp` object should have access to pretrained word vectors, " - " cf. https://spacy.io/usage/models#languages." 
- ) - - logger.info("Filtering entities with fewer than {} mentions or no description".format(min_entity_freq)) - entity_frequencies = io.read_entity_to_count(entity_freq_path) - # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise - filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities( - title_to_id, - id_to_descr, - entity_frequencies, - min_entity_freq - ) - logger.info("Kept {} entities from the set of {}".format(len(description_list), len(title_to_id.keys()))) - - logger.info("Training entity encoder") - encoder = EntityEncoder(nlp, input_dim, entity_vector_length) - encoder.train(description_list=description_list, to_print=True) - - logger.info("Getting entity embeddings") - embeddings = encoder.apply_encoder(description_list) - - logger.info("Adding {} entities".format(len(entity_list))) - kb.set_entities( - entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings - ) - return entity_list, filtered_title_to_id - - -def _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path): - logger.info("Adding aliases from Wikipedia and Wikidata") - _add_aliases( - kb, - entity_list=entity_list, - title_to_id=filtered_title_to_id, - max_entities_per_alias=max_entities_per_alias, - min_occ=min_occ, - prior_prob_path=prior_prob_path, - ) - - -def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies, - min_entity_freq: int = 10): - filtered_title_to_id = dict() - entity_list = [] - description_list = [] - frequency_list = [] - for title, entity in title_to_id.items(): - freq = entity_frequencies.get(title, 0) - desc = id_to_descr.get(entity, None) - if desc and freq > min_entity_freq: - entity_list.append(entity) - description_list.append(desc) - frequency_list.append(freq) - filtered_title_to_id[title] = entity - return filtered_title_to_id, entity_list, description_list, 
frequency_list - - -def _add_aliases(kb, entity_list, title_to_id, max_entities_per_alias, min_occ, prior_prob_path): - wp_titles = title_to_id.keys() - - # adding aliases with prior probabilities - # we can read this file sequentially, it's sorted by alias, and then by count - logger.info("Adding WP aliases") - with prior_prob_path.open("r", encoding="utf8") as prior_file: - # skip header - prior_file.readline() - line = prior_file.readline() - previous_alias = None - total_count = 0 - counts = [] - entities = [] - while line: - splits = line.replace("\n", "").split(sep="|") - new_alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - if new_alias != previous_alias and previous_alias: - # done reading the previous alias --> output - if len(entities) > 0: - selected_entities = [] - prior_probs = [] - for ent_count, ent_string in zip(counts, entities): - if ent_string in wp_titles: - wd_id = title_to_id[ent_string] - p_entity_givenalias = ent_count / total_count - selected_entities.append(wd_id) - prior_probs.append(p_entity_givenalias) - - if selected_entities: - try: - kb.add_alias( - alias=previous_alias, - entities=selected_entities, - probabilities=prior_probs, - ) - except ValueError as e: - logger.error(e) - total_count = 0 - counts = [] - entities = [] - - total_count += count - - if len(entities) < max_entities_per_alias and count >= min_occ: - counts.append(count) - entities.append(entity) - previous_alias = new_alias - - line = prior_file.readline() - - -def read_kb(nlp, kb_file): - kb = KnowledgeBase(vocab=nlp.vocab) - kb.load_bulk(kb_file) - return kb diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py deleted file mode 100644 index b0cfbb4c6..000000000 --- a/bin/wiki_entity_linking/train_descriptions.py +++ /dev/null @@ -1,145 +0,0 @@ -from random import shuffle - -import logging -import numpy as np - -from thinc.api import Model, chain, CosineDistance, Linear - -from spacy.util import 
create_default_optimizer - -logger = logging.getLogger(__name__) - - -class EntityEncoder: - """ - Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D). - This entity vector will be stored in the KB, for further downstream use in the entity model. - """ - - DROP = 0 - BATCH_SIZE = 1000 - - # Set min. acceptable loss to avoid a 'mean of empty slice' warning by numpy - MIN_LOSS = 0.01 - - # Reasonable default to stop training when things are not improving - MAX_NO_IMPROVEMENT = 20 - - def __init__(self, nlp, input_dim, desc_width, epochs=5): - self.nlp = nlp - self.input_dim = input_dim - self.desc_width = desc_width - self.epochs = epochs - self.distance = CosineDistance(ignore_zeros=True, normalize=False) - - def apply_encoder(self, description_list): - if self.encoder is None: - raise ValueError("Can not apply encoder before training it") - - batch_size = 100000 - - start = 0 - stop = min(batch_size, len(description_list)) - encodings = [] - - while start < len(description_list): - docs = list(self.nlp.pipe(description_list[start:stop])) - doc_embeddings = [self._get_doc_embedding(doc) for doc in docs] - enc = self.encoder(np.asarray(doc_embeddings)) - encodings.extend(enc.tolist()) - - start = start + batch_size - stop = min(stop + batch_size, len(description_list)) - logger.info("Encoded: {} entities".format(stop)) - - return encodings - - def train(self, description_list, to_print=False): - processed, loss = self._train_model(description_list) - if to_print: - logger.info( - "Trained entity descriptions on {} ".format(processed) + - "(non-unique) descriptions across {} ".format(self.epochs) + - "epochs" - ) - logger.info("Final loss: {}".format(loss)) - - def _train_model(self, description_list): - best_loss = 1.0 - iter_since_best = 0 - self._build_network(self.input_dim, self.desc_width) - - processed = 0 - loss = 1 - # copy this list so that shuffling does not affect other functions - descriptions = description_list.copy() 
- to_continue = True - - for i in range(self.epochs): - shuffle(descriptions) - - batch_nr = 0 - start = 0 - stop = min(self.BATCH_SIZE, len(descriptions)) - - while to_continue and start < len(descriptions): - batch = [] - for descr in descriptions[start:stop]: - doc = self.nlp(descr) - doc_vector = self._get_doc_embedding(doc) - batch.append(doc_vector) - - loss = self._update(batch) - if batch_nr % 25 == 0: - logger.info("loss: {} ".format(loss)) - processed += len(batch) - - # in general, continue training if we haven't reached our ideal min yet - to_continue = loss > self.MIN_LOSS - - # store the best loss and track how long it's been - if loss < best_loss: - best_loss = loss - iter_since_best = 0 - else: - iter_since_best += 1 - - # stop learning if we haven't seen improvement since the last few iterations - if iter_since_best > self.MAX_NO_IMPROVEMENT: - to_continue = False - - batch_nr += 1 - start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, len(descriptions)) - - return processed, loss - - @staticmethod - def _get_doc_embedding(doc): - indices = np.zeros((len(doc),), dtype="i") - for i, word in enumerate(doc): - if word.orth in doc.vocab.vectors.key2row: - indices[i] = doc.vocab.vectors.key2row[word.orth] - else: - indices[i] = 0 - word_vectors = doc.vocab.vectors.data[indices] - doc_vector = np.mean(word_vectors, axis=0) - return doc_vector - - def _build_network(self, orig_width, hidden_with): - with Model.define_operators({">>": chain}): - # very simple encoder-decoder model - self.encoder = Linear(hidden_with, orig_width) - # TODO: removed the zero_init here - is oK? 
- self.model = self.encoder >> Linear(orig_width, hidden_with) - self.sgd = create_default_optimizer() - - def _update(self, vectors): - truths = self.model.ops.asarray(vectors) - predictions, bp_model = self.model.begin_update( - truths, drop=self.DROP - ) - d_scores, loss = self.distance(predictions, truths) - bp_model(d_scores, sgd=self.sgd) - return loss / len(vectors) - diff --git a/bin/wiki_entity_linking/wiki_io.py b/bin/wiki_entity_linking/wiki_io.py deleted file mode 100644 index 43ae87f0f..000000000 --- a/bin/wiki_entity_linking/wiki_io.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import sys -import csv - -# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/ -csv.field_size_limit(min(sys.maxsize, 2147483646)) - -""" This class provides reading/writing methods for temp files """ - - -# Entity definition: WP title -> WD ID # -def write_title_to_id(entity_def_output, title_to_id): - with entity_def_output.open("w", encoding="utf8") as id_file: - id_file.write("WP_title" + "|" + "WD_id" + "\n") - for title, qid in title_to_id.items(): - id_file.write(title + "|" + str(qid) + "\n") - - -def read_title_to_id(entity_def_output): - title_to_id = dict() - with entity_def_output.open("r", encoding="utf8") as id_file: - csvreader = csv.reader(id_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - title_to_id[row[0]] = row[1] - return title_to_id - - -# Entity aliases from WD: WD ID -> WD alias # -def write_id_to_alias(entity_alias_path, id_to_alias): - with entity_alias_path.open("w", encoding="utf8") as alias_file: - alias_file.write("WD_id" + "|" + "alias" + "\n") - for qid, alias_list in id_to_alias.items(): - for alias in alias_list: - alias_file.write(str(qid) + "|" + alias + "\n") - - -def read_id_to_alias(entity_alias_path): - id_to_alias = dict() - with entity_alias_path.open("r", encoding="utf8") as alias_file: - csvreader = 
csv.reader(alias_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - qid = row[0] - alias = row[1] - alias_list = id_to_alias.get(qid, []) - alias_list.append(alias) - id_to_alias[qid] = alias_list - return id_to_alias - - -def read_alias_to_id_generator(entity_alias_path): - """ Read (aliases, qid) tuples """ - - with entity_alias_path.open("r", encoding="utf8") as alias_file: - csvreader = csv.reader(alias_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - qid = row[0] - alias = row[1] - yield alias, qid - - -# Entity descriptions from WD: WD ID -> WD alias # -def write_id_to_descr(entity_descr_output, id_to_descr): - with entity_descr_output.open("w", encoding="utf8") as descr_file: - descr_file.write("WD_id" + "|" + "description" + "\n") - for qid, descr in id_to_descr.items(): - descr_file.write(str(qid) + "|" + descr + "\n") - - -def read_id_to_descr(entity_desc_path): - id_to_desc = dict() - with entity_desc_path.open("r", encoding="utf8") as descr_file: - csvreader = csv.reader(descr_file, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - id_to_desc[row[0]] = row[1] - return id_to_desc - - -# Entity counts from WP: WP title -> count # -def write_entity_to_count(prior_prob_input, count_output): - # Write entity counts for quick access later - entity_to_count = dict() - total_count = 0 - - with prior_prob_input.open("r", encoding="utf8") as prior_file: - # skip header - prior_file.readline() - line = prior_file.readline() - - while line: - splits = line.replace("\n", "").split(sep="|") - # alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - current_count = entity_to_count.get(entity, 0) - entity_to_count[entity] = current_count + count - - total_count += count - - line = prior_file.readline() - - with count_output.open("w", encoding="utf8") as entity_file: - entity_file.write("entity" + "|" + "count" + "\n") - for entity, count in entity_to_count.items(): - 
entity_file.write(entity + "|" + str(count) + "\n") - - -def read_entity_to_count(count_input): - entity_to_count = dict() - with count_input.open("r", encoding="utf8") as csvfile: - csvreader = csv.reader(csvfile, delimiter="|") - # skip header - next(csvreader) - for row in csvreader: - entity_to_count[row[0]] = int(row[1]) - - return entity_to_count diff --git a/bin/wiki_entity_linking/wiki_namespaces.py b/bin/wiki_entity_linking/wiki_namespaces.py deleted file mode 100644 index e8f099ccd..000000000 --- a/bin/wiki_entity_linking/wiki_namespaces.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -# List of meta pages in Wikidata, should be kept out of the Knowledge base -WD_META_ITEMS = [ - "Q163875", - "Q191780", - "Q224414", - "Q4167836", - "Q4167410", - "Q4663903", - "Q11266439", - "Q13406463", - "Q15407973", - "Q18616576", - "Q19887878", - "Q22808320", - "Q23894233", - "Q33120876", - "Q42104522", - "Q47460393", - "Q64875536", - "Q66480449", -] - - -# TODO: add more cases from non-English WP's - -# List of prefixes that refer to Wikipedia "file" pages -WP_FILE_NAMESPACE = ["Bestand", "File"] - -# List of prefixes that refer to Wikipedia "category" pages -WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"] - -# List of prefixes that refer to Wikipedia "meta" pages -# these will/should be matched ignoring case -WP_META_NAMESPACE = ( - WP_FILE_NAMESPACE - + WP_CATEGORY_NAMESPACE - + [ - "b", - "betawikiversity", - "Book", - "c", - "Commons", - "d", - "dbdump", - "download", - "Draft", - "Education", - "Foundation", - "Gadget", - "Gadget definition", - "Gebruiker", - "gerrit", - "Help", - "Image", - "Incubator", - "m", - "mail", - "mailarchive", - "media", - "MediaWiki", - "MediaWiki talk", - "Mediawikiwiki", - "MediaZilla", - "Meta", - "Metawikipedia", - "Module", - "mw", - "n", - "nost", - "oldwikisource", - "otrs", - "OTRSwiki", - "Overleg gebruiker", - "outreach", - "outreachwiki", - "Portal", - "phab", - 
"Phabricator", - "Project", - "q", - "quality", - "rev", - "s", - "spcom", - "Special", - "species", - "Strategy", - "sulutil", - "svn", - "Talk", - "Template", - "Template talk", - "Testwiki", - "ticket", - "TimedText", - "Toollabs", - "tools", - "tswiki", - "User", - "User talk", - "v", - "voy", - "w", - "Wikibooks", - "Wikidata", - "wikiHow", - "Wikinvest", - "wikilivres", - "Wikimedia", - "Wikinews", - "Wikipedia", - "Wikipedia talk", - "Wikiquote", - "Wikisource", - "Wikispecies", - "Wikitech", - "Wikiversity", - "Wikivoyage", - "wikt", - "wiktionary", - "wmf", - "wmania", - "WP", - ] -) diff --git a/bin/wiki_entity_linking/wikidata_pretrain_kb.py b/bin/wiki_entity_linking/wikidata_pretrain_kb.py deleted file mode 100644 index 003074feb..000000000 --- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py +++ /dev/null @@ -1,179 +0,0 @@ -# coding: utf-8 -"""Script to process Wikipedia and Wikidata dumps and create a knowledge base (KB) -with specific parameters. Intermediate files are written to disk. - -Running the full pipeline on a standard laptop, may take up to 13 hours of processing. -Use the -p, -d and -s options to speed up processing using the intermediate files -from a previous run. 
- -For the Wikidata dump: get the latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ -For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2 -from https://dumps.wikimedia.org/enwiki/latest/ - -""" -from __future__ import unicode_literals - -import logging -from pathlib import Path -import plac - -from bin.wiki_entity_linking import wikipedia_processor as wp, wikidata_processor as wd -from bin.wiki_entity_linking import wiki_io as io -from bin.wiki_entity_linking import kb_creator -from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_FILE, ENTITY_DESCR_PATH, KB_MODEL_DIR, LOG_FORMAT -from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH, ENTITY_ALIAS_PATH -import spacy -from bin.wiki_entity_linking.kb_creator import read_kb - -logger = logging.getLogger(__name__) - - -@plac.annotations( - wd_json=("Path to the downloaded WikiData JSON dump.", "positional", None, Path), - wp_xml=("Path to the downloaded Wikipedia XML dump.", "positional", None, Path), - output_dir=("Output directory", "positional", None, Path), - model=("Model name or path, should include pretrained vectors.", "positional", None, str), - max_per_alias=("Max. # entities per alias (default 10)", "option", "a", int), - min_freq=("Min. count of an entity in the corpus (default 20)", "option", "f", int), - min_pair=("Min. 
count of entity-alias pairs (default 5)", "option", "c", int), - entity_vector_length=("Length of entity vectors (default 64)", "option", "v", int), - loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path), - loc_entity_defs=("Location to file with entity definitions", "option", "d", Path), - loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path), - descr_from_wp=("Flag for using descriptions from WP instead of WD (default False)", "flag", "wp"), - limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int), - limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int), - limit_wd=("Threshold to limit lines read from WD", "option", "lw", int), - lang=("Optional language for which to get Wikidata titles. Defaults to 'en'", "option", "la", str), -) -def main( - wd_json, - wp_xml, - output_dir, - model, - max_per_alias=10, - min_freq=20, - min_pair=5, - entity_vector_length=64, - loc_prior_prob=None, - loc_entity_defs=None, - loc_entity_alias=None, - loc_entity_desc=None, - descr_from_wp=False, - limit_prior=None, - limit_train=None, - limit_wd=None, - lang="en", -): - entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH - entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH - entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH - entity_freq_path = output_dir / ENTITY_FREQ_PATH - prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH - training_entities_path = output_dir / TRAINING_DATA_FILE - kb_path = output_dir / KB_FILE - - logger.info("Creating KB with Wikipedia and WikiData") - - # STEP 0: set up IO - if not output_dir.exists(): - output_dir.mkdir(parents=True) - - # STEP 1: Load the NLP object - logger.info("STEP 1: Loading NLP model {}".format(model)) - nlp = spacy.load(model) - - # check the 
length of the nlp vectors - if "vectors" not in nlp.meta or not nlp.vocab.vectors.size: - raise ValueError( - "The `nlp` object should have access to pretrained word vectors, " - " cf. https://spacy.io/usage/models#languages." - ) - - # STEP 2: create prior probabilities from WP - if not prior_prob_path.exists(): - # It takes about 2h to process 1000M lines of Wikipedia XML dump - logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path)) - if limit_prior is not None: - logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior)) - wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior) - else: - logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path)) - - # STEP 3: calculate entity frequencies - if not entity_freq_path.exists(): - logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path)) - io.write_entity_to_count(prior_prob_path, entity_freq_path) - else: - logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path)) - - # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file - if (not entity_defs_path.exists()) or (not descr_from_wp and not entity_descr_path.exists()): - # It takes about 10h to process 55M lines of Wikidata JSON dump - logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path)) - if limit_wd is not None: - logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd)) - title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json( - wd_json, - limit_wd, - to_print=False, - lang=lang, - parse_descr=(not descr_from_wp), - ) - io.write_title_to_id(entity_defs_path, title_to_id) - - logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path)) - io.write_id_to_alias(entity_alias_path, id_to_alias) - - if not descr_from_wp: - logger.info("STEP 4c: Writing Wikidata entity descriptions 
to {}".format(entity_descr_path)) - io.write_id_to_descr(entity_descr_path, id_to_descr) - else: - logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path)) - logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path)) - if not descr_from_wp: - logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path)) - - # STEP 5: Getting gold entities from Wikipedia - if (not training_entities_path.exists()) or (descr_from_wp and not entity_descr_path.exists()): - logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path)) - if limit_train is not None: - logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train)) - wp.create_training_and_desc(wp_xml, entity_defs_path, entity_descr_path, - training_entities_path, descr_from_wp, limit_train) - if descr_from_wp: - logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path)) - else: - logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path)) - if descr_from_wp: - logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path)) - - # STEP 6: creating the actual KB - # It takes ca. 
30 minutes to pretrain the entity embeddings - if not kb_path.exists(): - logger.info("STEP 6: Creating the KB at {}".format(kb_path)) - kb = kb_creator.create_kb( - nlp=nlp, - max_entities_per_alias=max_per_alias, - min_entity_freq=min_freq, - min_occ=min_pair, - entity_def_path=entity_defs_path, - entity_descr_path=entity_descr_path, - entity_alias_path=entity_alias_path, - entity_freq_path=entity_freq_path, - prior_prob_path=prior_prob_path, - entity_vector_length=entity_vector_length, - ) - kb.dump(kb_path) - logger.info("kb entities: {}".format(kb.get_size_entities())) - logger.info("kb aliases: {}".format(kb.get_size_aliases())) - nlp.to_disk(output_dir / KB_MODEL_DIR) - else: - logger.info("STEP 6: KB already exists at {}".format(kb_path)) - - logger.info("Done!") - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) - plac.call(main) diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py deleted file mode 100644 index 8a070f567..000000000 --- a/bin/wiki_entity_linking/wikidata_processor.py +++ /dev/null @@ -1,154 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import bz2 -import json -import logging - -from bin.wiki_entity_linking.wiki_namespaces import WD_META_ITEMS - -logger = logging.getLogger(__name__) - - -def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descr=True): - # Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines. 
- # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ - - site_filter = '{}wiki'.format(lang) - - # filter: currently defined as OR: one hit suffices to be removed from further processing - exclude_list = WD_META_ITEMS - - # punctuation - exclude_list.extend(["Q1383557", "Q10617810"]) - - # letters etc - exclude_list.extend(["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"]) - - neg_prop_filter = { - 'P31': exclude_list, # instance of - 'P279': exclude_list # subclass - } - - title_to_id = dict() - id_to_descr = dict() - id_to_alias = dict() - - # parse appropriate fields - depending on what we need in the KB - parse_properties = False - parse_sitelinks = True - parse_labels = False - parse_aliases = True - parse_claims = True - - with bz2.open(wikidata_file, mode='rb') as file: - for cnt, line in enumerate(file): - if limit and cnt >= limit: - break - if cnt % 500000 == 0 and cnt > 0: - logger.info("processed {} lines of WikiData JSON dump".format(cnt)) - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - obj = json.loads(clean_line) - entry_type = obj["type"] - - if entry_type == "item": - keep = True - - claims = obj["claims"] - if parse_claims: - for prop, value_set in neg_prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - for cp in claim_property: - cp_id = ( - cp["mainsnak"] - .get("datavalue", {}) - .get("value", {}) - .get("id") - ) - cp_rank = cp["rank"] - if cp_rank != "deprecated" and cp_id in value_set: - keep = False - - if keep: - unique_id = obj["id"] - - if to_print: - print("ID:", unique_id) - print("type:", entry_type) - - # parsing all properties that refer to other entities - if parse_properties: - for prop, claim_property in claims.items(): - cp_dicts = [ - cp["mainsnak"]["datavalue"].get("value") - for cp in claim_property - if cp["mainsnak"].get("datavalue") - ] - cp_values = [ - 
cp_dict.get("id") - for cp_dict in cp_dicts - if isinstance(cp_dict, dict) - if cp_dict.get("id") is not None - ] - if cp_values: - if to_print: - print("prop:", prop, cp_values) - - found_link = False - if parse_sitelinks: - site_value = obj["sitelinks"].get(site_filter, None) - if site_value: - site = site_value["title"] - if to_print: - print(site_filter, ":", site) - title_to_id[site] = unique_id - found_link = True - - if parse_labels: - labels = obj["labels"] - if labels: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print( - "label (" + lang + "):", lang_label["value"] - ) - - if found_link and parse_descr: - descriptions = obj["descriptions"] - if descriptions: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print( - "description (" + lang + "):", - lang_descr["value"], - ) - id_to_descr[unique_id] = lang_descr["value"] - - if parse_aliases: - aliases = obj["aliases"] - if aliases: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print( - "alias (" + lang + "):", item["value"] - ) - alias_list = id_to_alias.get(unique_id, []) - alias_list.append(item["value"]) - id_to_alias[unique_id] = alias_list - - if to_print: - print() - - # log final number of lines processed - logger.info("Finished. Processed {} lines of WikiData JSON dump".format(cnt)) - return title_to_id, id_to_descr, id_to_alias - - diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py deleted file mode 100644 index af0e68768..000000000 --- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py +++ /dev/null @@ -1,230 +0,0 @@ -# coding: utf-8 -"""Script that takes a previously created Knowledge Base and trains an entity linking -pipeline. 
The provided KB directory should hold the kb, the original nlp object and -its vocab used to create the KB, and a few auxiliary files such as the entity definitions, -as created by the script `wikidata_create_kb`. - -For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2 -from https://dumps.wikimedia.org/enwiki/latest/ -""" -from __future__ import unicode_literals - -import random -import logging -import spacy -from pathlib import Path -import plac -from tqdm import tqdm - -from bin.wiki_entity_linking import wikipedia_processor -from bin.wiki_entity_linking import ( - TRAINING_DATA_FILE, - KB_MODEL_DIR, - KB_FILE, - LOG_FORMAT, - OUTPUT_MODEL_DIR, -) -from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance -from bin.wiki_entity_linking.kb_creator import read_kb - -from spacy.util import minibatch, compounding - -logger = logging.getLogger(__name__) - - -@plac.annotations( - dir_kb=("Directory with KB, NLP and related files", "positional", None, Path), - output_dir=("Output directory", "option", "o", Path), - loc_training=("Location to training data", "option", "k", Path), - epochs=("Number of training iterations (default 10)", "option", "e", int), - dropout=("Dropout to prevent overfitting (default 0.5)", "option", "p", float), - lr=("Learning rate (default 0.005)", "option", "n", float), - l2=("L2 regularization", "option", "r", float), - train_articles=("# training articles (default 90% of all)", "option", "t", int), - dev_articles=("# dev test articles (default 10% of all)", "option", "d", int), - labels_discard=("NER labels to discard (default None)", "option", "l", str), -) -def main( - dir_kb, - output_dir=None, - loc_training=None, - epochs=10, - dropout=0.5, - lr=0.005, - l2=1e-6, - train_articles=None, - dev_articles=None, - labels_discard=None, -): - if not output_dir: - logger.warning( - "No output dir specified so no results will be written, are you sure about this ?" 
- ) - - logger.info("Creating Entity Linker with Wikipedia and WikiData") - - output_dir = Path(output_dir) if output_dir else dir_kb - training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE - nlp_dir = dir_kb / KB_MODEL_DIR - kb_path = dir_kb / KB_FILE - nlp_output_dir = output_dir / OUTPUT_MODEL_DIR - - # STEP 0: set up IO - if not output_dir.exists(): - output_dir.mkdir() - - # STEP 1 : load the NLP object - logger.info("STEP 1a: Loading model from {}".format(nlp_dir)) - nlp = spacy.load(nlp_dir) - logger.info( - "Original NLP pipeline has following pipeline components: {}".format( - nlp.pipe_names - ) - ) - - # check that there is a NER component in the pipeline - if "ner" not in nlp.pipe_names: - raise ValueError("The `nlp` object should have a pretrained `ner` component.") - - logger.info("STEP 1b: Loading KB from {}".format(kb_path)) - kb = read_kb(nlp, kb_path) - - # STEP 2: read the training dataset previously created from WP - logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path)) - train_indices, dev_indices = wikipedia_processor.read_training_indices( - training_path - ) - logger.info( - "Training set has {} articles, limit set to roughly {} articles per epoch".format( - len(train_indices), train_articles if train_articles else "all" - ) - ) - logger.info( - "Dev set has {} articles, limit set to rougly {} articles for evaluation".format( - len(dev_indices), dev_articles if dev_articles else "all" - ) - ) - if dev_articles: - dev_indices = dev_indices[0:dev_articles] - - # STEP 3: create and train an entity linking pipe - logger.info( - "STEP 3: Creating and training an Entity Linking pipe for {} epochs".format( - epochs - ) - ) - if labels_discard: - labels_discard = [x.strip() for x in labels_discard.split(",")] - logger.info( - "Discarding {} NER types: {}".format(len(labels_discard), labels_discard) - ) - else: - labels_discard = [] - - el_pipe = nlp.create_pipe( - name="entity_linker", - config={ 
- "pretrained_vectors": nlp.vocab.vectors, - "labels_discard": labels_discard, - }, - ) - el_pipe.set_kb(kb) - nlp.add_pipe(el_pipe, last=True) - - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] - with nlp.disable_pipes(*other_pipes): # only train Entity Linking - optimizer = nlp.begin_training() - optimizer.learn_rate = lr - optimizer.L2 = l2 - - logger.info("Dev Baseline Accuracies:") - dev_data = wikipedia_processor.read_el_docs_golds( - nlp=nlp, - entity_file_path=training_path, - dev=True, - line_ids=dev_indices, - kb=kb, - labels_discard=labels_discard, - ) - - measure_performance( - dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices) - ) - - for itn in range(epochs): - random.shuffle(train_indices) - losses = {} - batches = minibatch(train_indices, size=compounding(8.0, 128.0, 1.001)) - batchnr = 0 - articles_processed = 0 - - # we either process the whole training file, or just a part each epoch - bar_total = len(train_indices) - if train_articles: - bar_total = train_articles - - with tqdm(total=bar_total, leave=False, desc=f"Epoch {itn}") as pbar: - for batch in batches: - if not train_articles or articles_processed < train_articles: - with nlp.disable_pipes("entity_linker"): - train_batch = wikipedia_processor.read_el_docs_golds( - nlp=nlp, - entity_file_path=training_path, - dev=False, - line_ids=batch, - kb=kb, - labels_discard=labels_discard, - ) - try: - with nlp.disable_pipes(*other_pipes): - nlp.update( - examples=train_batch, - sgd=optimizer, - drop=dropout, - losses=losses, - ) - batchnr += 1 - articles_processed += len(docs) - pbar.update(len(docs)) - except Exception as e: - logger.error("Error updating batch:" + str(e)) - if batchnr > 0: - logging.info( - "Epoch {} trained on {} articles, train loss {}".format( - itn, articles_processed, round(losses["entity_linker"] / batchnr, 2) - ) - ) - # re-read the dev_data (data is returned as a generator) - dev_data = 
wikipedia_processor.read_el_docs_golds( - nlp=nlp, - entity_file_path=training_path, - dev=True, - line_ids=dev_indices, - kb=kb, - labels_discard=labels_discard, - ) - measure_performance( - dev_data, - kb, - el_pipe, - baseline=False, - context=True, - dev_limit=len(dev_indices), - ) - - if output_dir: - # STEP 4: write the NLP pipeline (now including an EL model) to file - logger.info( - "Final NLP pipeline has following pipeline components: {}".format( - nlp.pipe_names - ) - ) - logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir)) - nlp.to_disk(nlp_output_dir) - - logger.info("Done!") - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) - plac.call(main) diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py deleted file mode 100644 index 315b1e916..000000000 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ /dev/null @@ -1,565 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import bz2 -import logging -import random -import json - -from spacy.gold import GoldParse -from bin.wiki_entity_linking import wiki_io as io -from bin.wiki_entity_linking.wiki_namespaces import ( - WP_META_NAMESPACE, - WP_FILE_NAMESPACE, - WP_CATEGORY_NAMESPACE, -) - -""" -Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions. -Write these results to file for downstream KB and training data generation. - -Process Wikipedia interlinks to generate a training dataset for the EL algorithm. 
-""" - -ENTITY_FILE = "gold_entities.csv" - -map_alias_to_link = dict() - -logger = logging.getLogger(__name__) - -title_regex = re.compile(r"(?<=).*(?=)") -id_regex = re.compile(r"(?<=)\d*(?=)") -text_regex = re.compile(r"(?<=).*(?= 0: - logger.info("processed {} lines of Wikipedia XML dump".format(cnt)) - clean_line = line.strip().decode("utf-8") - - # we attempt at reading the article's ID (but not the revision or contributor ID) - if "" in clean_line or "" in clean_line: - read_id = False - if "" in clean_line: - read_id = True - - if read_id: - ids = id_regex.search(clean_line) - if ids: - current_article_id = ids[0] - - # only processing prior probabilities from true training (non-dev) articles - if not is_dev(current_article_id): - aliases, entities, normalizations = get_wp_links(clean_line) - for alias, entity, norm in zip(aliases, entities, normalizations): - _store_alias( - alias, entity, normalize_alias=norm, normalize_entity=True - ) - - line = file.readline() - cnt += 1 - logger.info("processed {} lines of Wikipedia XML dump".format(cnt)) - logger.info("Finished. 
processed {} lines of Wikipedia XML dump".format(cnt)) - - # write all aliases and their entities and count occurrences to file - with prior_prob_output.open("w", encoding="utf8") as outputfile: - outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") - for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): - s_dict = sorted(alias_dict.items(), key=lambda x: x[1], reverse=True) - for entity, count in s_dict: - outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") - - -def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): - alias = alias.strip() - entity = entity.strip() - - # remove everything after # as this is not part of the title but refers to a specific paragraph - if normalize_entity: - # wikipedia titles are always capitalized - entity = _capitalize_first(entity.split("#")[0]) - if normalize_alias: - alias = alias.split("#")[0] - - if alias and entity: - alias_dict = map_alias_to_link.get(alias, dict()) - entity_count = alias_dict.get(entity, 0) - alias_dict[entity] = entity_count + 1 - map_alias_to_link[alias] = alias_dict - - -def get_wp_links(text): - aliases = [] - entities = [] - normalizations = [] - - matches = link_regex.findall(text) - for match in matches: - match = match[2:][:-2].replace("_", " ").strip() - - if ns_regex.match(match): - pass # ignore the entity if it points to a "meta" page - - # this is a simple [[link]], with the alias the same as the mention - elif "|" not in match: - aliases.append(match) - entities.append(match) - normalizations.append(True) - - # in wiki format, the link is written as [[entity|alias]] - else: - splits = match.split("|") - entity = splits[0].strip() - alias = splits[1].strip() - # specific wiki format [[alias (specification)|]] - if len(alias) == 0 and "(" in entity: - alias = entity.split("(")[0] - aliases.append(alias) - entities.append(entity) - normalizations.append(False) - else: - aliases.append(alias) - entities.append(entity) - 
normalizations.append(False) - - return aliases, entities, normalizations - - -def _capitalize_first(text): - if not text: - return None - result = text[0].capitalize() - if len(result) > 0: - result += text[1:] - return result - - -def create_training_and_desc( - wp_input, def_input, desc_output, training_output, parse_desc, limit=None -): - wp_to_id = io.read_title_to_id(def_input) - _process_wikipedia_texts( - wp_input, wp_to_id, desc_output, training_output, parse_desc, limit - ) - - -def _process_wikipedia_texts( - wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None -): - """ - Read the XML wikipedia data to parse out training data: - raw text data + positive instances - """ - - read_ids = set() - - with output.open("a", encoding="utf8") as descr_file, training_output.open( - "w", encoding="utf8" - ) as entity_file: - if parse_descriptions: - _write_training_description(descr_file, "WD_id", "description") - with bz2.open(wikipedia_input, mode="rb") as file: - article_count = 0 - article_text = "" - article_title = None - article_id = None - reading_text = False - reading_revision = False - - for line in file: - clean_line = line.strip().decode("utf-8") - - if clean_line == "": - reading_revision = True - elif clean_line == "": - reading_revision = False - - # Start reading new page - if clean_line == "": - article_text = "" - article_title = None - article_id = None - # finished reading this page - elif clean_line == "": - if article_id: - clean_text, entities = _process_wp_text( - article_title, article_text, wp_to_id - ) - if clean_text is not None and entities is not None: - _write_training_entities( - entity_file, article_id, clean_text, entities - ) - - if article_title in wp_to_id and parse_descriptions: - description = " ".join( - clean_text[:1000].split(" ")[:-1] - ) - _write_training_description( - descr_file, wp_to_id[article_title], description - ) - article_count += 1 - if article_count % 10000 == 0 and article_count 
> 0: - logger.info( - "Processed {} articles".format(article_count) - ) - if limit and article_count >= limit: - break - article_text = "" - article_title = None - article_id = None - reading_text = False - reading_revision = False - - # start reading text within a page - if "") - clean_text = clean_text.replace(r""", '"') - clean_text = clean_text.replace(r"&nbsp;", " ") - clean_text = clean_text.replace(r"&", "&") - - # remove multiple spaces - while " " in clean_text: - clean_text = clean_text.replace(" ", " ") - - return clean_text.strip() - - -def _remove_links(clean_text, wp_to_id): - # read the text char by char to get the right offsets for the interwiki links - entities = [] - final_text = "" - open_read = 0 - reading_text = True - reading_entity = False - reading_mention = False - reading_special_case = False - entity_buffer = "" - mention_buffer = "" - for index, letter in enumerate(clean_text): - if letter == "[": - open_read += 1 - elif letter == "]": - open_read -= 1 - elif letter == "|": - if reading_text: - final_text += letter - # switch from reading entity to mention in the [[entity|mention]] pattern - elif reading_entity: - reading_text = False - reading_entity = False - reading_mention = True - else: - reading_special_case = True - else: - if reading_entity: - entity_buffer += letter - elif reading_mention: - mention_buffer += letter - elif reading_text: - final_text += letter - else: - raise ValueError("Not sure at point", clean_text[index - 2 : index + 2]) - - if open_read > 2: - reading_special_case = True - - if open_read == 2 and reading_text: - reading_text = False - reading_entity = True - reading_mention = False - - # we just finished reading an entity - if open_read == 0 and not reading_text: - if "#" in entity_buffer or entity_buffer.startswith(":"): - reading_special_case = True - # Ignore cases with nested structures like File: handles etc - if not reading_special_case: - if not mention_buffer: - mention_buffer = entity_buffer - start 
= len(final_text) - end = start + len(mention_buffer) - qid = wp_to_id.get(entity_buffer, None) - if qid: - entities.append((mention_buffer, qid, start, end)) - final_text += mention_buffer - - entity_buffer = "" - mention_buffer = "" - - reading_text = True - reading_entity = False - reading_mention = False - reading_special_case = False - return final_text, entities - - -def _write_training_description(outputfile, qid, description): - if description is not None: - line = str(qid) + "|" + description + "\n" - outputfile.write(line) - - -def _write_training_entities(outputfile, article_id, clean_text, entities): - entities_data = [ - {"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]} - for ent in entities - ] - line = ( - json.dumps( - { - "article_id": article_id, - "clean_text": clean_text, - "entities": entities_data, - }, - ensure_ascii=False, - ) - + "\n" - ) - outputfile.write(line) - - -def read_training_indices(entity_file_path): - """ This method creates two lists of indices into the training file: one with indices for the - training examples, and one for the dev examples.""" - train_indices = [] - dev_indices = [] - - with entity_file_path.open("r", encoding="utf8") as file: - for i, line in enumerate(file): - example = json.loads(line) - article_id = example["article_id"] - clean_text = example["clean_text"] - - if is_valid_article(clean_text): - if is_dev(article_id): - dev_indices.append(i) - else: - train_indices.append(i) - - return train_indices, dev_indices - - -def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=None): - """ This method provides training/dev examples that correspond to the entity annotations found by the nlp object. - For training, it will include both positive and negative examples by using the candidate generator from the kb. 
- For testing (kb=None), it will include all positive examples only.""" - if not labels_discard: - labels_discard = [] - - texts = [] - entities_list = [] - - with entity_file_path.open("r", encoding="utf8") as file: - for i, line in enumerate(file): - if i in line_ids: - example = json.loads(line) - article_id = example["article_id"] - clean_text = example["clean_text"] - entities = example["entities"] - - if dev != is_dev(article_id) or not is_valid_article(clean_text): - continue - - texts.append(clean_text) - entities_list.append(entities) - - docs = nlp.pipe(texts, batch_size=50) - - for doc, entities in zip(docs, entities_list): - gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard) - if gold and len(gold.links) > 0: - yield doc, gold - - -def _get_gold_parse(doc, entities, dev, kb, labels_discard): - gold_entities = {} - tagged_ent_positions = { - (ent.start_char, ent.end_char): ent - for ent in doc.ents - if ent.label_ not in labels_discard - } - - for entity in entities: - entity_id = entity["entity"] - alias = entity["alias"] - start = entity["start"] - end = entity["end"] - - candidate_ids = [] - if kb and not dev: - candidates = kb.get_candidates(alias) - candidate_ids = [cand.entity_ for cand in candidates] - - tagged_ent = tagged_ent_positions.get((start, end), None) - if tagged_ent: - # TODO: check that alias == doc.text[start:end] - should_add_ent = (dev or entity_id in candidate_ids) and is_valid_sentence( - tagged_ent.sent.text - ) - - if should_add_ent: - value_by_id = {entity_id: 1.0} - if not dev: - random.shuffle(candidate_ids) - value_by_id.update( - {kb_id: 0.0 for kb_id in candidate_ids if kb_id != entity_id} - ) - gold_entities[(start, end)] = value_by_id - - return GoldParse(doc, links=gold_entities) - - -def is_dev(article_id): - if not article_id: - return False - return article_id.endswith("3") - - -def is_valid_article(doc_text): - # custom length cut-off - return 10 < len(doc_text) < 30000 - - -def 
is_valid_sentence(sent_text): - if not 10 < len(sent_text) < 3000: - # custom length cut-off - return False - - if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"): - # remove 'enumeration' sentences (occurs often on Wikipedia) - return False - - return True diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index 0aefec9ef..5c41c0e92 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -129,10 +129,7 @@ def train_textcat(nlp, n_texts, n_iter=10): ) train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats])) - # get names of other pipes to disable them during training - pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"] - other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train textcat + with nlp.select_pipes(enable="textcat"): # only train textcat optimizer = nlp.begin_training() textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights) print("Training the model...") diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index a0455c0a9..24fc67ebb 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -62,11 +62,8 @@ def main(model_name, unlabelled_loc): optimizer.b1 = 0.0 optimizer.b2 = 0.0 - # get names of other pipes to disable them during training - pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] - other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] sizes = compounding(1.0, 4.0, 1.001) - with nlp.disable_pipes(*other_pipes): + with nlp.select_pipes(enable="ner"): for itn in range(n_iter): random.shuffle(TRAIN_DATA) random.shuffle(raw_docs) diff --git a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py b/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py index 339ce39be..66d96ff68 100644 --- 
a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py +++ b/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py @@ -5,16 +5,17 @@ from spacy.gold import docs_to_json import srsly import sys + @plac.annotations( model=("Model name. Defaults to 'en'.", "option", "m", str), input_file=("Input file (jsonl)", "positional", None, Path), output_dir=("Output directory", "positional", None, Path), n_texts=("Number of texts to convert", "option", "t", int), ) -def convert(model='en', input_file=None, output_dir=None, n_texts=0): +def convert(model="en", input_file=None, output_dir=None, n_texts=0): # Load model with tokenizer + sentencizer only nlp = spacy.load(model) - nlp.disable_pipes(*nlp.pipe_names) + nlp.select_pipes(disable=nlp.pipe_names) sentencizer = nlp.create_pipe("sentencizer") nlp.add_pipe(sentencizer, first=True) @@ -49,5 +50,6 @@ def convert(model='en', input_file=None, output_dir=None, n_texts=0): srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)]) + if __name__ == "__main__": plac.call(convert) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index 9776ad351..a22f255e7 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -97,7 +97,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings() TRAIN_DOCS = [] for text, annotation in TRAIN_DATA: - with nlp.disable_pipes("entity_linker"): + with nlp.select_pipes(disable="entity_linker"): doc = nlp(text) annotation_clean = annotation for offset, kb_id_dict in annotation["links"].items(): @@ -112,10 +112,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): annotation_clean["links"][offset] = new_dict TRAIN_DOCS.append((doc, annotation_clean)) - # get names of other pipes to disable them during training - pipe_exceptions = ["entity_linker", "trf_wordpiecer", 
"trf_tok2vec"] - other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train entity linker + with nlp.select_pipes(enable="entity_linker"): # only train entity linker # reset and initialize the weights randomly optimizer = nlp.begin_training() diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index bfec23d09..c3d5a279b 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -124,9 +124,7 @@ def main(model=None, output_dir=None, n_iter=15): for dep in annotations.get("deps", []): parser.add_label(dep) - pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"] - other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train parser + with nlp.select_pipes(enable="parser"): # only train parser optimizer = nlp.begin_training() for itn in range(n_iter): random.shuffle(TRAIN_DATA) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index d4e0bf794..f0f3affe7 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -55,10 +55,7 @@ def main(model=None, output_dir=None, n_iter=100): print("Add label", ent[2]) ner.add_label(ent[2]) - # get names of other pipes to disable them during training - pipe_exceptions = ["simple_ner"] - other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train NER + with nlp.select_pipes(enable="ner"): # only train NER # reset and initialize the weights randomly – but only if we're # training a new model if model is None: diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 47420e524..445c3fc27 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -94,10 +94,8 @@ def main(model=None, 
new_model_name="animal", output_dir=None, n_iter=30): else: optimizer = nlp.resume_training() move_names = list(ner.move_names) - # get names of other pipes to disable them during training - pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] - other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train NER + + with nlp.select_pipes(enable="ner"): # only train NER sizes = compounding(1.0, 4.0, 1.001) # batch up the examples using spaCy's minibatch for itn in range(n_iter): diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index 7bb3e8586..4f4409e31 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -64,10 +64,7 @@ def main(model=None, output_dir=None, n_iter=15): for dep in annotations.get("deps", []): parser.add_label(dep) - # get names of other pipes to disable them during training - pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"] - other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train parser + with nlp.select_pipes(enable="parser"): # only train parser optimizer = nlp.begin_training() for itn in range(n_iter): random.shuffle(TRAIN_DATA) diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index dfb95b038..65acadb07 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -68,10 +68,7 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non ex = Example.from_gold(gold, doc=doc) train_examples.append(ex) - # get names of other pipes to disable them during training - pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"] - other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train textcat + with nlp.select_pipes(enable="textcat"): # only train textcat 
optimizer = nlp.begin_training() if init_tok2vec is not None: with init_tok2vec.open("rb") as file_: diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 5fa09da78..19e0a81e0 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -145,7 +145,7 @@ def train( msg.text(f"Loading vectors from model '{vectors}'") _load_vectors(nlp, vectors) - nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline]) + nlp.select_pipes(disable=[p for p in nlp.pipe_names if p not in pipeline]) for pipe in pipeline: # first, create the model. # Bit of a hack after the refactor to get the vectors into a default config @@ -201,8 +201,8 @@ def train( exits=1, ) msg.text(f"Extending component from base model '{pipe}'") - disabled_pipes = nlp.disable_pipes( - [p for p in nlp.pipe_names if p not in pipeline] + disabled_pipes = nlp.select_pipes( + disable=[p for p in nlp.pipe_names if p not in pipeline] ) else: msg.text(f"Starting with blank model '{lang}'") diff --git a/spacy/errors.py b/spacy/errors.py index 99a0081c0..7a7b44731 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -104,6 +104,8 @@ class Warnings(object): "string \"Field1=Value1,Value2|Field2=Value3\".") # TODO: fix numbering after merging develop into master + W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' " + "instead.") W097 = ("No Model config was provided to create the '{name}' component, " "and no default configuration could be found either.") W098 = ("No Model config was provided to create the '{name}' component, " @@ -132,7 +134,7 @@ class Errors(object): E007 = ("'{name}' already exists in pipeline. Existing names: {opts}") E008 = ("Some current components would be lost when restoring previous " "pipeline state. If you added components after calling " - "`nlp.disable_pipes()`, you should remove them explicitly with " + "`nlp.select_pipes()`, you should remove them explicitly with " "`nlp.remove_pipe()` before the pipeline is restored. 
Names of " "the new components: {names}") E009 = ("The `update` method expects same number of docs and golds, but " @@ -546,6 +548,13 @@ class Errors(object): "token itself.") # TODO: fix numbering after merging develop into master + E991 = ("The function 'select_pipes' should be called with either a " + "'disable' argument to list the names of the pipe components " + "that should be disabled, or with an 'enable' argument that " + "specifies which pipes should not be disabled.") + E992 = ("The function `select_pipes` was called with `enable`={enable} " + "and `disable`={disable} but that information is conflicting " + "for the `nlp` pipeline with components {names}.") E993 = ("The config for 'nlp' should include either a key 'name' to " "refer to an existing model by name or path, or a key 'lang' " "to create a new blank model.") diff --git a/spacy/language.py b/spacy/language.py index a7db5ef20..5f617b1f6 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -511,11 +511,37 @@ class Language(object): of the block. Otherwise, a DisabledPipes object is returned, that has a `.restore()` method you can use to undo your changes. - DOCS: https://spacy.io/api/language#disable_pipes + This method has been deprecated since 3.0 """ + warnings.warn(Warnings.W096, DeprecationWarning) if len(names) == 1 and isinstance(names[0], (list, tuple)): names = names[0] # support list of names instead of spread - return DisabledPipes(self, *names) + return DisabledPipes(self, names) + + def select_pipes(self, disable=None, enable=None): + """Disable one or more pipeline components. If used as a context + manager, the pipeline will be restored to the initial state at the end + of the block. Otherwise, a DisabledPipes object is returned, that has + a `.restore()` method you can use to undo your changes. 
+ + disable (str or iterable): The name(s) of the pipes to disable + enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled + + DOCS: https://spacy.io/api/language#select_pipes + """ + if enable is None and disable is None: + raise ValueError(Errors.E991) + if disable is not None and isinstance(disable, str): + disable = [disable] + if enable is not None: + if isinstance(enable, str): + enable = [enable] + to_disable = [pipe for pipe in self.pipe_names if pipe not in enable] + # raise an error if the enable and disable keywords are not consistent + if disable is not None and disable != to_disable: + raise ValueError(Errors.E992.format(enable=enable, disable=disable, names=self.pipe_names)) + disable = to_disable + return DisabledPipes(self, disable) def make_doc(self, text): return self.tokenizer(text) @@ -1117,7 +1143,7 @@ def _fix_pretrained_vectors_name(nlp): class DisabledPipes(list): """Manager for temporary pipeline disabling.""" - def __init__(self, nlp, *names): + def __init__(self, nlp, names): self.nlp = nlp self.names = names # Important! 
Not deep copy -- we just want the container (but we also diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 06c568ac9..58160c2e9 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -200,7 +200,7 @@ class EntityRuler(object): ] except ValueError: subsequent_pipes = [] - with self.nlp.disable_pipes(subsequent_pipes): + with self.nlp.select_pipes(disable=subsequent_pipes): token_patterns = [] phrase_pattern_labels = [] phrase_pattern_texts = [] diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index e2fb02a2a..d42216655 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -88,7 +88,16 @@ def test_remove_pipe(nlp, name): def test_disable_pipes_method(nlp, name): nlp.add_pipe(new_pipe, name=name) assert nlp.has_pipe(name) - disabled = nlp.disable_pipes(name) + disabled = nlp.select_pipes(disable=name) + assert not nlp.has_pipe(name) + disabled.restore() + + +@pytest.mark.parametrize("name", ["my_component"]) +def test_enable_pipes_method(nlp, name): + nlp.add_pipe(new_pipe, name=name) + assert nlp.has_pipe(name) + disabled = nlp.select_pipes(enable=[]) assert not nlp.has_pipe(name) disabled.restore() @@ -97,19 +106,57 @@ def test_disable_pipes_method(nlp, name): def test_disable_pipes_context(nlp, name): nlp.add_pipe(new_pipe, name=name) assert nlp.has_pipe(name) - with nlp.disable_pipes(name): + with nlp.select_pipes(disable=name): assert not nlp.has_pipe(name) assert nlp.has_pipe(name) -def test_disable_pipes_list_arg(nlp): +def test_select_pipes_list_arg(nlp): for name in ["c1", "c2", "c3"]: nlp.add_pipe(new_pipe, name=name) assert nlp.has_pipe(name) - with nlp.disable_pipes(["c1", "c2"]): + with nlp.select_pipes(disable=["c1", "c2"]): assert not nlp.has_pipe("c1") assert not nlp.has_pipe("c2") assert nlp.has_pipe("c3") + with nlp.select_pipes(enable="c3"): + assert not nlp.has_pipe("c1") + assert not 
nlp.has_pipe("c2") + assert nlp.has_pipe("c3") + with nlp.select_pipes(enable=["c1", "c2"], disable="c3"): + assert nlp.has_pipe("c1") + assert nlp.has_pipe("c2") + assert not nlp.has_pipe("c3") + with nlp.select_pipes(enable=[]): + assert not nlp.has_pipe("c1") + assert not nlp.has_pipe("c2") + assert not nlp.has_pipe("c3") + with nlp.select_pipes(enable=["c1", "c2", "c3"], disable=[]): + assert nlp.has_pipe("c1") + assert nlp.has_pipe("c2") + assert nlp.has_pipe("c3") + with nlp.select_pipes(disable=["c1", "c2", "c3"], enable=[]): + assert not nlp.has_pipe("c1") + assert not nlp.has_pipe("c2") + assert not nlp.has_pipe("c3") + + +def test_select_pipes_errors(nlp): + for name in ["c1", "c2", "c3"]: + nlp.add_pipe(new_pipe, name=name) + assert nlp.has_pipe(name) + + with pytest.raises(ValueError): + nlp.select_pipes() + + with pytest.raises(ValueError): + nlp.select_pipes(enable=["c1", "c2"], disable=["c1"]) + + with pytest.raises(ValueError): + nlp.select_pipes(enable=["c1", "c2"], disable=[]) + + with pytest.raises(ValueError): + nlp.select_pipes(enable=[], disable=["c3"]) @pytest.mark.parametrize("n_pipes", [100]) diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py index 120cea1d2..cab68793c 100644 --- a/spacy/tests/regression/test_issue3611.py +++ b/spacy/tests/regression/test_issue3611.py @@ -31,7 +31,7 @@ def test_issue3611(): nlp.add_pipe(textcat, last=True) # training the network - with nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]): + with nlp.select_pipes(enable="textcat"): optimizer = nlp.begin_training(X=x_train, Y=y_train) for i in range(3): losses = {} diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py index 7158d9b21..b641213ad 100644 --- a/spacy/tests/regression/test_issue4030.py +++ b/spacy/tests/regression/test_issue4030.py @@ -31,7 +31,7 @@ def test_issue4030(): nlp.add_pipe(textcat, last=True) # training the network - with 
nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]): + with nlp.select_pipes(enable="textcat"): optimizer = nlp.begin_training() for i in range(3): losses = {} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index d548a1f64..703a0f678 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -314,45 +314,47 @@ component function. | `name` | unicode | Name of the component to remove. | | **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | -## Language.disable_pipes {#disable_pipes tag="contextmanager, method" new="2"} +## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"} Disable one or more pipeline components. If used as a context manager, the pipeline will be restored to the initial state at the end of the block. Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method you can use to undo your changes. +You can specify either `disable` (as a list or string), or `enable`. In the +latter case, all components not in the `enable` list, will be disabled. + > #### Example > > ```python -> # New API as of v2.2.2 -> with nlp.disable_pipes(["tagger", "parser"]): +> # New API as of v3.0 +> with nlp.select_pipes(disable=["tagger", "parser"]): > nlp.begin_training() > -> with nlp.disable_pipes("tagger", "parser"): +> with nlp.select_pipes(enable="ner"): > nlp.begin_training() > -> disabled = nlp.disable_pipes("tagger", "parser") +> disabled = nlp.select_pipes(disable=["tagger", "parser"]) > nlp.begin_training() > disabled.restore() > ``` -| Name | Type | Description | -| ----------------------------------------- | --------------- | ------------------------------------------------------------------------------------ | -| `disabled` 2.2.2 | list | Names of pipeline components to disable. | -| `*disabled` | unicode | Names of pipeline components to disable. 
| -| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | +| Name | Type | Description | +| ----------- | --------------- | ------------------------------------------------------------------------------------ | +| `disable` | list | Names of pipeline components to disable. | +| `disable` | unicode | Name of pipeline component to disable. | +| `enable` | list | Names of pipeline components that will not be disabled. | +| `enable` | unicode | Name of pipeline component that will not be disabled. | +| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | - -As of spaCy v2.2.2, the `Language.disable_pipes` method can also take a list of -component names as its first argument (instead of a variable number of -arguments). This is especially useful if you're generating the component names -to disable programmatically. The new syntax will become the default in the -future. + + +As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: ```diff -- disabled = nlp.disable_pipes("tagger", "parser") -+ disabled = nlp.disable_pipes(["tagger", "parser"]) +- nlp.disable_pipes(["tagger", "parser"]) ++ nlp.select_pipes(disable=["tagger", "parser"]) ``` diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 7382f2b8c..696e11106 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -252,9 +252,9 @@ for doc in nlp.pipe(texts, disable=["tagger", "parser"]): If you need to **execute more code** with components disabled – e.g. to reset the weights or update only some components during training – you can use the -[`nlp.disable_pipes`](/api/language#disable_pipes) contextmanager. At the end of +[`nlp.select_pipes`](/api/language#select_pipes) contextmanager. 
At the end of the `with` block, the disabled pipeline components will be restored -automatically. Alternatively, `disable_pipes` returns an object that lets you +automatically. Alternatively, `select_pipes` returns an object that lets you call its `restore()` method to restore the disabled components when needed. This can be useful if you want to prevent unnecessary code indentation of large blocks. @@ -262,16 +262,26 @@ blocks. ```python ### Disable for block # 1. Use as a contextmanager -with nlp.disable_pipes("tagger", "parser"): +with nlp.select_pipes(disable=["tagger", "parser"]): doc = nlp("I won't be tagged and parsed") doc = nlp("I will be tagged and parsed") # 2. Restore manually -disabled = nlp.disable_pipes("ner") +disabled = nlp.select_pipes(disable="ner") doc = nlp("I won't have named entities") disabled.restore() ``` +If you want to disable all pipes except for one or a few, you can use the `enable` +keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string +defining just one pipe. +```python +# Enable only the parser +with nlp.select_pipes(enable="parser"): + doc = nlp("I will only be parsed") +``` + + Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method to remove pipeline components from an existing pipeline, the [`rename_pipe`](/api/language#rename_pipe) method to rename them, or the diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 1db2405d1..5f47bd2e3 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -906,7 +906,7 @@ pipeline component, **make sure that the pipeline component runs** when you create the pattern. For example, to match on `POS` or `LEMMA`, the pattern `Doc` objects need to have part-of-speech tags set by the `tagger`. 
You can either call the `nlp` object on your pattern texts instead of `nlp.make_doc`, or use -[`nlp.disable_pipes`](/api/language#disable_pipes) to disable components +[`nlp.select_pipes`](/api/language#select_pipes) to disable components selectively. @@ -1121,8 +1121,7 @@ while adding the phrase patterns. entityruler = EntityRuler(nlp) patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)] -other_pipes = [p for p in nlp.pipe_names if p != "tagger"] -with nlp.disable_pipes(*other_pipes): +with nlp.select_pipes(enable="tagger"): entityruler.add_patterns(patterns) ``` diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 479bdd264..39d732724 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -647,8 +647,7 @@ import random nlp = spacy.load("en_core_web_sm") train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})] -other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] -with nlp.disable_pipes(*other_pipes): +with nlp.select_pipes(enable="ner"): optimizer = nlp.begin_training() for i in range(10): random.shuffle(train_data) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 479441edf..a10c60357 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -362,7 +362,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py you're using a blank model, don't forget to add the entity recognizer to the pipeline. If you're using an existing model, make sure to disable all other pipeline components during training using - [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be + [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be training the entity recognizer. 2. **Shuffle and loop over** the examples. 
For each example, **update the model** by calling [`nlp.update`](/api/language#update), which steps through @@ -403,7 +403,7 @@ referred to as the "catastrophic forgetting" problem. you're using a blank model, don't forget to add the entity recognizer to the pipeline. If you're using an existing model, make sure to disable all other pipeline components during training using - [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be + [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be training the entity recognizer. 2. **Add the new entity label** to the entity recognizer using the [`add_label`](/api/entityrecognizer#add_label) method. You can access the @@ -436,7 +436,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py you're using a blank model, don't forget to add the parser to the pipeline. If you're using an existing model, make sure to disable all other pipeline components during training using - [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be + [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be training the parser. 2. **Add the dependency labels** to the parser using the [`add_label`](/api/dependencyparser#add_label) method. If you're starting off @@ -470,7 +470,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_tagger.py you're using a blank model, don't forget to add the tagger to the pipeline. If you're using an existing model, make sure to disable all other pipeline components during training using - [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be + [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be training the tagger. 2. **Add the tag map** to the tagger using the [`add_label`](/api/tagger#add_label) method. 
The first argument is the new @@ -544,7 +544,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_intent_pa you're using a blank model, don't forget to add the custom parser to the pipeline. If you're using an existing model, make sure to **remove the old parser** from the pipeline, and disable all other pipeline components during - training using [`nlp.disable_pipes`](/api/language#disable_pipes). This way, + training using [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be training the parser. 3. **Add the dependency labels** to the parser using the [`add_label`](/api/dependencyparser#add_label) method. @@ -576,7 +576,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.p [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. If you're using an existing model, make sure to disable all other pipeline components during training using - [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be + [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be training the text classifier. 2. **Add the text classifier** to the pipeline, and add the labels you want to train – for example, `POSITIVE`. @@ -653,7 +653,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_li pipeline including also a component for [named entity recognition](/usage/training#ner). If you're using a model with additional components, make sure to disable all other pipeline components - during training using [`nlp.disable_pipes`](/api/language#disable_pipes). + during training using [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be training the entity linker. 2. **Shuffle and loop over** the examples. 
For each example, **update the model** by calling [`nlp.update`](/api/language#update), which steps through From f00de445dd04b61bc55a0fe010c9cd3862d38aef Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 19 May 2020 16:20:03 +0200 Subject: [PATCH 111/187] default models defined in component decorator (#5452) * move defaults to pipeline and use in component decorator * black formatting * relative import --- spacy/language.py | 54 +++++++------------ .../models => pipeline}/defaults/__init__.py | 2 +- .../defaults/entity_linker_defaults.cfg | 0 .../defaults/morphologizer_defaults.cfg | 0 .../defaults/ner_defaults.cfg | 0 .../defaults/parser_defaults.cfg | 0 .../defaults/senter_defaults.cfg | 0 .../defaults/simple_ner_defaults.cfg | 0 .../defaults/tagger_defaults.cfg | 0 .../defaults/tensorizer_defaults.cfg | 0 .../defaults/textcat_bow_defaults.cfg | 0 .../defaults/textcat_cnn_defaults.cfg | 0 .../defaults/textcat_defaults.cfg | 0 .../defaults/tok2vec_defaults.cfg | 0 spacy/pipeline/morphologizer.pyx | 3 +- spacy/pipeline/pipes.pyx | 29 +++++++--- spacy/pipeline/simple_ner.py | 4 +- spacy/pipeline/tok2vec.py | 3 +- spacy/tests/doc/test_add_entities.py | 2 +- spacy/tests/parser/test_add_label.py | 2 +- spacy/tests/parser/test_arc_eager_oracle.py | 2 +- spacy/tests/parser/test_ner.py | 2 +- spacy/tests/parser/test_neural_parser.py | 2 +- spacy/tests/parser/test_nn_beam.py | 2 +- spacy/tests/parser/test_preset_sbd.py | 2 +- spacy/tests/pipeline/test_textcat.py | 2 +- spacy/tests/regression/test_issue1501-2000.py | 2 +- spacy/tests/regression/test_issue3001-3500.py | 2 +- spacy/tests/regression/test_issue3830.py | 2 +- spacy/tests/regression/test_issue4042.py | 2 +- spacy/tests/regression/test_issue4313.py | 2 +- .../serialize/test_serialize_pipeline.py | 4 +- 32 files changed, 64 insertions(+), 61 deletions(-) rename spacy/{ml/models => pipeline}/defaults/__init__.py (99%) rename spacy/{ml/models => pipeline}/defaults/entity_linker_defaults.cfg (100%) rename 
spacy/{ml/models => pipeline}/defaults/morphologizer_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/ner_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/parser_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/senter_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/simple_ner_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/tagger_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/tensorizer_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/textcat_bow_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/textcat_cnn_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/textcat_defaults.cfg (100%) rename spacy/{ml/models => pipeline}/defaults/tok2vec_defaults.cfg (100%) diff --git a/spacy/language.py b/spacy/language.py index 5f617b1f6..2b8fa129e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -184,33 +184,6 @@ class Language(object): self.max_length = max_length self._optimizer = None - # TODO: de-uglify (incorporating into component decorator didn't work because of circular imports) - from .ml.models.defaults import ( - default_tagger_config, - default_parser_config, - default_ner_config, - default_textcat_config, - default_nel_config, - default_morphologizer_config, - default_senter_config, - default_tensorizer_config, - default_tok2vec_config, - default_simple_ner_config - ) - - self.defaults = { - "tagger": default_tagger_config(), - "parser": default_parser_config(), - "ner": default_ner_config(), - "textcat": default_textcat_config(), - "entity_linker": default_nel_config(), - "morphologizer": default_morphologizer_config(), - "senter": default_senter_config(), - "simple_ner": default_simple_ner_config(), - "tensorizer": default_tensorizer_config(), - "tok2vec": default_tok2vec_config(), - } - @property def path(self): return self._path @@ -338,7 +311,6 @@ class Language(object): else: raise 
KeyError(Errors.E002.format(name=name)) factory = self.factories[name] - default_config = self.defaults.get(name, None) # transform the model's config to an actual Model factory_cfg = dict(config) @@ -349,11 +321,6 @@ class Language(object): warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name)) model_cfg = None del factory_cfg["model"] - if model_cfg is None and default_config is not None: - warnings.warn(Warnings.W098.format(name=name)) - model_cfg = default_config["model"] - if model_cfg is None: - warnings.warn(Warnings.W097.format(name=name)) model = None if model_cfg is not None: self.config[name] = {"model": model_cfg} @@ -539,7 +506,11 @@ class Language(object): to_disable = [pipe for pipe in self.pipe_names if pipe not in enable] # raise an error if the enable and disable keywords are not consistent if disable is not None and disable != to_disable: - raise ValueError(Errors.E992.format(enable=enable, disable=disable, names=self.pipe_names)) + raise ValueError( + Errors.E992.format( + enable=enable, disable=disable, names=self.pipe_names + ) + ) disable = to_disable return DisabledPipes(self, disable) @@ -1085,7 +1056,14 @@ class component(object): # NB: This decorator needs to live here, because it needs to write to # Language.factories. All other solutions would cause circular import. - def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False): + def __init__( + self, + name=None, + assigns=tuple(), + requires=tuple(), + retokenizes=False, + default_model=lambda: None, + ): """Decorate a pipeline component. name (unicode): Default component and factory name. 
@@ -1097,6 +1075,7 @@ class component(object): self.assigns = validate_attrs(assigns) self.requires = validate_attrs(requires) self.retokenizes = retokenizes + self.default_model = default_model def __call__(self, *args, **kwargs): obj = args[0] @@ -1109,6 +1088,11 @@ class component(object): obj.retokenizes = self.retokenizes def factory(nlp, model, **cfg): + if model is None: + model = self.default_model() + warnings.warn(Warnings.W098.format(name=self.name)) + if model is None: + warnings.warn(Warnings.W097.format(name=self.name)) if hasattr(obj, "from_nlp"): return obj.from_nlp(nlp, model, **cfg) elif isinstance(obj, type): diff --git a/spacy/ml/models/defaults/__init__.py b/spacy/pipeline/defaults/__init__.py similarity index 99% rename from spacy/ml/models/defaults/__init__.py rename to spacy/pipeline/defaults/__init__.py index 850d9fce0..e17e2d3b4 100644 --- a/spacy/ml/models/defaults/__init__.py +++ b/spacy/pipeline/defaults/__init__.py @@ -1,6 +1,6 @@ from pathlib import Path -from .... import util +from ... 
import util def default_nel_config(): diff --git a/spacy/ml/models/defaults/entity_linker_defaults.cfg b/spacy/pipeline/defaults/entity_linker_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/entity_linker_defaults.cfg rename to spacy/pipeline/defaults/entity_linker_defaults.cfg diff --git a/spacy/ml/models/defaults/morphologizer_defaults.cfg b/spacy/pipeline/defaults/morphologizer_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/morphologizer_defaults.cfg rename to spacy/pipeline/defaults/morphologizer_defaults.cfg diff --git a/spacy/ml/models/defaults/ner_defaults.cfg b/spacy/pipeline/defaults/ner_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/ner_defaults.cfg rename to spacy/pipeline/defaults/ner_defaults.cfg diff --git a/spacy/ml/models/defaults/parser_defaults.cfg b/spacy/pipeline/defaults/parser_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/parser_defaults.cfg rename to spacy/pipeline/defaults/parser_defaults.cfg diff --git a/spacy/ml/models/defaults/senter_defaults.cfg b/spacy/pipeline/defaults/senter_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/senter_defaults.cfg rename to spacy/pipeline/defaults/senter_defaults.cfg diff --git a/spacy/ml/models/defaults/simple_ner_defaults.cfg b/spacy/pipeline/defaults/simple_ner_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/simple_ner_defaults.cfg rename to spacy/pipeline/defaults/simple_ner_defaults.cfg diff --git a/spacy/ml/models/defaults/tagger_defaults.cfg b/spacy/pipeline/defaults/tagger_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/tagger_defaults.cfg rename to spacy/pipeline/defaults/tagger_defaults.cfg diff --git a/spacy/ml/models/defaults/tensorizer_defaults.cfg b/spacy/pipeline/defaults/tensorizer_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/tensorizer_defaults.cfg rename to 
spacy/pipeline/defaults/tensorizer_defaults.cfg diff --git a/spacy/ml/models/defaults/textcat_bow_defaults.cfg b/spacy/pipeline/defaults/textcat_bow_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/textcat_bow_defaults.cfg rename to spacy/pipeline/defaults/textcat_bow_defaults.cfg diff --git a/spacy/ml/models/defaults/textcat_cnn_defaults.cfg b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/textcat_cnn_defaults.cfg rename to spacy/pipeline/defaults/textcat_cnn_defaults.cfg diff --git a/spacy/ml/models/defaults/textcat_defaults.cfg b/spacy/pipeline/defaults/textcat_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/textcat_defaults.cfg rename to spacy/pipeline/defaults/textcat_defaults.cfg diff --git a/spacy/ml/models/defaults/tok2vec_defaults.cfg b/spacy/pipeline/defaults/tok2vec_defaults.cfg similarity index 100% rename from spacy/ml/models/defaults/tok2vec_defaults.cfg rename to spacy/pipeline/defaults/tok2vec_defaults.cfg diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 7a2bc3b17..c45a72b25 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -17,9 +17,10 @@ from ..util import link_vectors_to_models, create_default_optimizer from ..errors import Errors, TempErrors from .pipes import Tagger, _load_cfg from .. 
import util +from .defaults import default_morphologizer -@component("morphologizer", assigns=["token.morph", "token.pos"]) +@component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer) class Morphologizer(Tagger): def __init__(self, vocab, model, **cfg): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 61db11baa..4ff956e1d 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -2,6 +2,7 @@ import numpy import srsly import random + from thinc.api import CosineDistance, to_categorical, get_array_module from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy import warnings @@ -13,6 +14,8 @@ from ..syntax.arc_eager cimport ArcEager from ..morphology cimport Morphology from ..vocab cimport Vocab +from .defaults import default_tagger, default_parser, default_ner, default_textcat +from .defaults import default_nel, default_senter, default_tensorizer from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj @@ -234,7 +237,7 @@ class Pipe(object): return self -@component("tensorizer", assigns=["doc.tensor"]) +@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer) class Tensorizer(Pipe): """Pre-train position-sensitive vectors for tokens.""" @@ -366,7 +369,7 @@ class Tensorizer(Pipe): return sgd -@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"]) +@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. @@ -636,7 +639,7 @@ class Tagger(Pipe): return self -@component("senter", assigns=["token.is_sent_start"]) +@component("senter", assigns=["token.is_sent_start"], default_model=default_senter) class SentenceRecognizer(Tagger): """Pipeline component for sentence segmentation. 
@@ -976,7 +979,7 @@ class ClozeMultitask(Pipe): losses[self.name] += loss -@component("textcat", assigns=["doc.cats"]) +@component("textcat", assigns=["doc.cats"], default_model=default_textcat) class TextCategorizer(Pipe): """Pipeline component for text classification. @@ -1227,7 +1230,8 @@ cdef class EntityRecognizer(Parser): @component( "entity_linker", requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], - assigns=["token.ent_kb_id"] + assigns=["token.ent_kb_id"], + default_model=default_nel, ) class EntityLinker(Pipe): """Pipeline component for named entity linking. @@ -1673,8 +1677,19 @@ class Sentencizer(Pipe): # Cython classes can't be decorated, so we need to add the factories here -Language.factories["parser"] = lambda nlp, model, **cfg: DependencyParser.from_nlp(nlp, model, **cfg) -Language.factories["ner"] = lambda nlp, model, **cfg: EntityRecognizer.from_nlp(nlp, model, **cfg) +Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, model, **cfg) +Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg) +def parser_factory(nlp, model, **cfg): + if model is None: + model = default_parser() + warnings.warn(Warnings.W098.format(name="parser")) + return DependencyParser.from_nlp(nlp, model, **cfg) + +def ner_factory(nlp, model, **cfg): + if model is None: + model = default_ner() + warnings.warn(Warnings.W098.format(name="ner")) + return EntityRecognizer.from_nlp(nlp, model, **cfg) __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index 8d53152d8..c674046af 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -2,6 +2,8 @@ from typing import List from thinc.types import Floats2d from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate from thinc.util import to_numpy + +from .defaults 
import default_simple_ner from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob from ..tokens import Doc from ..language import component @@ -9,7 +11,7 @@ from ..util import link_vectors_to_models from .pipes import Pipe -@component("simple_ner", assigns=["doc.ents"]) +@component("simple_ner", assigns=["doc.ents"], default_model=default_simple_ner) class SimpleNER(Pipe): """Named entity recognition with a tagging model. The model should include validity constraints to ensure that only valid tag sequences are returned.""" diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 83a4454e3..5882fa266 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -6,9 +6,10 @@ from ..tokens import Doc from ..vocab import Vocab from ..language import component from ..util import link_vectors_to_models, minibatch, eg2doc +from .defaults import default_tok2vec -@component("tok2vec", assigns=["doc.tensor"]) +@component("tok2vec", assigns=["doc.tensor"], default_model=default_tok2vec) class Tok2Vec(Pipe): @classmethod def from_nlp(cls, nlp, model, **cfg): diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 3a466b24c..c92fc1ff9 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -3,7 +3,7 @@ from spacy.tokens import Span import pytest from ..util import get_doc -from ...ml.models.defaults import default_ner +from spacy.pipeline.defaults import default_ner def test_doc_add_entities_set_ents_iob(en_vocab): diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 39682ba3d..ee1bba886 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -4,7 +4,7 @@ from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab -from spacy.ml.models.defaults import default_parser, default_ner +from spacy.pipeline.defaults import default_parser, 
default_ner from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.util import fix_random_seed diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 2426805d2..30b4a6f6d 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,7 +1,7 @@ import pytest from spacy.vocab import Vocab -from spacy.ml.models.defaults import default_parser +from spacy.pipeline.defaults import default_parser from spacy.pipeline import DependencyParser from spacy.tokens import Doc from spacy.gold import GoldParse diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 2fd87ead3..9656d3a10 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -2,7 +2,7 @@ import pytest from spacy import util from spacy.lang.en import English -from spacy.ml.models.defaults import default_ner +from spacy.pipeline.defaults import default_ner from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index c985cf87a..b648e9a00 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,5 +1,5 @@ import pytest -from spacy.ml.models.defaults import default_parser, default_tok2vec +from spacy.pipeline.defaults import default_parser, default_tok2vec from spacy.vocab import Vocab from spacy.syntax.arc_eager import ArcEager from spacy.syntax.nn_parser import Parser diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index 619e0cc0b..db9eb5e6f 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -2,7 +2,7 @@ import pytest import numpy from spacy.vocab import Vocab from spacy.language import Language -from spacy.ml.models.defaults import default_parser +from 
spacy.pipeline.defaults import default_parser from spacy.pipeline import DependencyParser from spacy.syntax.arc_eager import ArcEager from spacy.tokens import Doc diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index af777aa6b..dc13fcdf1 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -4,7 +4,7 @@ from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab -from spacy.ml.models.defaults import default_parser +from spacy.pipeline.defaults import default_parser from spacy.tokens import Doc from spacy.pipeline import DependencyParser diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index b091ec0de..725a4fd69 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -11,7 +11,7 @@ from spacy.gold import GoldParse from spacy.util import fix_random_seed from ..util import make_tempdir -from ...ml.models.defaults import default_tok2vec +from spacy.pipeline.defaults import default_tok2vec TRAIN_DATA = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 5f5f0c9eb..5a76697bc 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -10,7 +10,7 @@ from spacy.lang.lex_attrs import is_stop from spacy.vectors import Vectors from spacy.vocab import Vocab from spacy.language import Language -from spacy.ml.models.defaults import default_ner, default_tagger +from spacy.pipeline.defaults import default_ner, default_tagger from spacy.tokens import Doc, Span, Token from spacy.pipeline import Tagger, EntityRecognizer from spacy.attrs import HEAD, DEP diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 06ba6c4ac..240163d6e 100644 --- 
a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -1,7 +1,7 @@ import pytest from spacy.lang.en import English from spacy.lang.de import German -from spacy.ml.models.defaults import default_ner +from spacy.pipeline.defaults import default_ner from spacy.pipeline import EntityRuler, EntityRecognizer from spacy.matcher import Matcher, PhraseMatcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py index 9752f70df..3d8e80847 100644 --- a/spacy/tests/regression/test_issue3830.py +++ b/spacy/tests/regression/test_issue3830.py @@ -1,7 +1,7 @@ from spacy.pipeline.pipes import DependencyParser from spacy.vocab import Vocab -from spacy.ml.models.defaults import default_parser +from spacy.pipeline.defaults import default_parser def test_issue3830_no_subtok(): diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py index 75a1c23b7..30081543b 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -3,7 +3,7 @@ from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.lang.en import English from spacy.tokens import Span from spacy.util import ensure_path -from spacy.ml.models.defaults import default_ner +from spacy.pipeline.defaults import default_ner from ..util import make_tempdir diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index 30688601f..ba4d2deab 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -1,6 +1,6 @@ from collections import defaultdict -from spacy.ml.models.defaults import default_ner +from spacy.pipeline.defaults import default_ner from spacy.pipeline import EntityRecognizer from spacy.lang.en import English diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 
475181c7b..4fc277c4f 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,8 +1,8 @@ import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer -from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger -from spacy.ml.models.defaults import default_textcat, default_senter +from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger +from spacy.pipeline.defaults import default_textcat, default_senter from ..util import make_tempdir From a2830c3ef52167e7e99cb44d5ebd21a75e461146 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 19 May 2020 16:23:11 +0200 Subject: [PATCH 112/187] Use thinc 8.0.0a9 --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 08b4c228a..e5f1ae10b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==8.0.0a8 +thinc==8.0.0a9 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 9fe02018b..df1658fd0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,7 +42,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==8.0.0a8 + thinc==8.0.0a9 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 From 664a3603b0313b650b3b43e2897f381f1e3598df Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 19 May 2020 17:15:39 +0200 Subject: [PATCH 113/187] Set version to v3.0.0.dev8 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 3f87c8dbc..3af1b77a0 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev7" +__version__ =
"3.0.0.dev8" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 7f5715a08159c06c249c3efe4d8934df2c98544d Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 20 May 2020 11:41:12 +0200 Subject: [PATCH 114/187] Various fixes to NEL functionality, Example class etc (#5460) * setting KB in the EL constructor, similar to how the model is passed on * removing wikipedia example files - moved to projects * throw an error when nlp.update is called with 2 positional arguments * rewriting the config logic in create pipe to accommodate other objects (e.g. KB) in the config * update config files with new parameters * avoid training pipeline components that don't have a model (like sentencizer) * various small fixes + UX improvements * small fixes * set thinc to 8.0.0a9 everywhere * remove outdated comment --- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 2 ++ .../ptb-joint-pos-dep/defaults.cfg | 2 ++ .../tok2vec-ner/charembed_tok2vec.cfg | 2 ++ .../tok2vec-ner/multihashembed_tok2vec.cfg | 2 ++ examples/training/train_entity_linker.py | 8 ++--- pyproject.toml | 2 +- setup.cfg | 2 +- spacy/cli/train_from_config.py | 14 ++++++--- spacy/errors.py | 21 ++++++++++--- spacy/gold.pyx | 25 ++++++++++++--- spacy/language.py | 31 +++++++++++-------- spacy/ml/models/entity_linker.py | 12 +++++++ spacy/pipeline/pipes.pyx | 22 ++++++++----- spacy/tests/pipeline/test_entity_linker.py | 7 ++--- spacy/util.py | 2 ++ 15 files changed, 108 insertions(+), 46 deletions(-) diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index b6b4e82b6..e152fa5e0 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -12,6 +12,8 @@ use_gpu = 0 scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2} limit = 0 +seed = 0 +accumulate_gradient = 2 [training.batch_size] @schedules = "compounding.v1" diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 2ceaab0be..9a10c45f0 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -12,6 +12,8 @@ use_gpu = -1 scores = ["tags_acc", "uas", "las"] score_weights = {"las": 0.8, "tags_acc": 0.2} limit = 0 +seed = 0 +accumulate_gradient = 2 [training.batch_size] @schedules = "compounding.v1" diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg index b8219ad10..796c8670f 100644 --- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg @@ -10,6 +10,8 @@ orth_variant_level = 0.0 gold_preproc = true max_length = 0 batch_size = 25 +seed = 0 +accumulate_gradient = 2 [optimizer] @optimizers = "Adam.v1" diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg index dc25a1c3b..3ac70675b 100644 --- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -9,6 +9,8 @@ score_weights = {"ents_f": 1} orth_variant_level = 0.0 gold_preproc = true max_length = 0 +seed = 0 +accumulate_gradient = 2 [training.batch_size] @schedules = "compounding.v1" diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index a22f255e7..2da1db26d 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -83,13 +83,13 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): # Create the Entity Linker component and add it to the pipeline. 
if "entity_linker" not in nlp.pipe_names: - # use only the predicted EL score and not the prior probability (for demo purposes) - cfg = {"incl_prior": False} - entity_linker = nlp.create_pipe("entity_linker", cfg) kb = KnowledgeBase(vocab=nlp.vocab) kb.load_bulk(kb_path) print("Loaded Knowledge Base from '%s'" % kb_path) - entity_linker.set_kb(kb) + + # use only the predicted EL score and not the prior probability (for demo purposes) + cfg = {"kb": kb, "incl_prior": False} + entity_linker = nlp.create_pipe("entity_linker", cfg) nlp.add_pipe(entity_linker, last=True) # Convert the texts to docs to make sure we have doc.ents set for the training examples. diff --git a/pyproject.toml b/pyproject.toml index 548664e89..66a06c1d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==8.0.0a8", + "thinc==8.0.0a9", "blis>=0.4.0,<0.5.0" ] build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg index df1658fd0..1cd088279 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,7 +36,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==8.0.0a8 + thinc==8.0.0a9 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index bd83deb04..96c5b676e 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -12,7 +12,7 @@ import random from ..gold import GoldCorpus from .. 
import util - +from ..errors import Errors registry = util.registry @@ -233,6 +233,8 @@ def create_train_batches(nlp, corpus, cfg): max_length=cfg["max_length"], ignore_misaligned=True, )) + if len(train_examples) == 0: + raise ValueError(Errors.E988) random.shuffle(train_examples) batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"]) for batch in batches: @@ -313,12 +315,14 @@ def train_while_improving( dropouts = dropout results = [] losses = {} + to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")] + for step, batch in enumerate(train_data): dropout = next(dropouts) - for subbatch in subdivide_batch(batch, accumulate_gradient): - nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) - for name, proc in nlp.pipeline: - if hasattr(proc, "model"): + with nlp.select_pipes(enable=to_enable): + for subbatch in subdivide_batch(batch, accumulate_gradient): + nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) + for name, proc in nlp.pipeline: proc.model.finish_update(optimizer) optimizer.step_schedules() if not (step % eval_frequency): diff --git a/spacy/errors.py b/spacy/errors.py index 7a7b44731..4d38ab586 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -195,7 +195,7 @@ class Errors(object): "the documentation:\nhttps://spacy.io/usage/models") E030 = ("Sentence boundaries unset. You can add the 'sentencizer' " "component to the pipeline with: " - "nlp.add_pipe(nlp.create_pipe('sentencizer')) " + "nlp.add_pipe(nlp.create_pipe('sentencizer')). " "Alternatively, add the dependency parser, or set sentence " "boundaries by setting doc[i].is_sent_start.") E031 = ("Invalid token: empty string ('') at position {i}.") @@ -430,8 +430,7 @@ class Errors(object): E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input " "includes either the `text` or `tokens` key. 
For more info, see " "the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl") - E139 = ("Knowledge Base for component '{name}' not initialized. Did you " - "forget to call set_kb()?") + E139 = ("Knowledge Base for component '{name}' is empty.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " @@ -548,6 +547,18 @@ class Errors(object): "token itself.") # TODO: fix numbering after merging develop into master + + E987 = ("The text of an example training instance is either a Doc or " + "a string, but found {type} instead.") + E988 = ("Could not parse any training examples. Ensure the data is " + "formatted correctly.") + E989 = ("'nlp.update()' was called with two positional arguments. This " + "may be due to a backwards-incompatible change to the format " + "of the training data in spaCy 3.0 onwards. The 'update' " + "function should now be called with a batch of 'Example' " + "objects, instead of (text, annotation) tuples. ") + E990 = ("An entity linking component needs to be initialized with a " + "KnowledgeBase object, but found {type} instead.") E991 = ("The function 'select_pipes' should be called with either a " "'disable' argument to list the names of the pipe components " "that should be disabled, or with an 'enable' argument that " @@ -562,8 +573,8 @@ class Errors(object): E997 = ("Tokenizer special cases are not allowed to modify the text. 
" "This would map '{chunk}' to '{orth}' given token attributes " "'{token_attrs}'.") - E998 = ("Can only create GoldParse objects from Example objects without a " - "Doc if get_gold_parses() is called with a Vocab object.") + E998 = ("To create GoldParse objects from Example objects without a " + "Doc, get_gold_parses() should be called with a Vocab object.") E999 = ("Encountered an unexpected format for the dictionary holding " "gold annotations: {gold_dict}") diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 6647e41b4..46a6ae583 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -212,6 +212,8 @@ class GoldCorpus(object): doc = ex_dict.get("doc", None) if doc is None: doc = ex_dict.get("text", None) + if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)): + raise ValueError(Errors.E987.format(type=type(doc))) examples.append(Example.from_dict(ex_dict, doc=doc)) elif file_name.endswith("msg"): @@ -288,7 +290,6 @@ class GoldCorpus(object): """ Setting gold_preproc will result in creating a doc per sentence """ for example in examples: if gold_preproc: - example.doc = None split_examples = example.split_sents() example_golds = [] for split_example in split_examples: @@ -716,6 +717,12 @@ cdef class TokenAnnotation: def get_sent_start(self, i): return self.sent_starts[i] if i < len(self.sent_starts) else None + def __str__(self): + return str(self.to_dict()) + + def __repr__(self): + return self.__str__() + cdef class DocAnnotation: def __init__(self, cats=None, links=None): @@ -729,6 +736,12 @@ cdef class DocAnnotation: def to_dict(self): return {"cats": self.cats, "links": self.links} + def __str__(self): + return str(self.to_dict()) + + def __repr__(self): + return self.__str__() + cdef class Example: def __init__(self, doc_annotation=None, token_annotation=None, doc=None, @@ -747,9 +760,9 @@ cdef class Example: @classmethod def from_dict(cls, example_dict, doc=None): - token_dict = example_dict["token_annotation"] + token_dict = 
example_dict.get("token_annotation", {}) token_annotation = TokenAnnotation.from_dict(token_dict) - doc_dict = example_dict["doc_annotation"] + doc_dict = example_dict.get("doc_annotation", {}) doc_annotation = DocAnnotation.from_dict(doc_dict) return cls(doc_annotation, token_annotation, doc) @@ -791,6 +804,8 @@ cdef class Example: def split_sents(self): """ Split the token annotations into multiple Examples based on sent_starts and return a list of the new Examples""" + if not self.token_annotation.words: + return [self] s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] @@ -842,7 +857,7 @@ cdef class Example: if merge: t = self.token_annotation doc = self.doc - if self.doc is None: + if doc is None or not isinstance(doc, Doc): if not vocab: raise ValueError(Errors.E998) doc = Doc(vocab, words=t.words) @@ -1052,7 +1067,7 @@ cdef class GoldParse: self.sent_starts = [None] * len(doc) # This needs to be done before we align the words - if make_projective and heads is not None and deps is not None: + if make_projective and any(heads) and any(deps) : heads, deps = nonproj.projectivize(heads, deps) # Do many-to-one alignment for misaligned tokens. 
diff --git a/spacy/language.py b/spacy/language.py index 2b8fa129e..d71c27406 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -314,19 +314,20 @@ class Language(object): # transform the model's config to an actual Model factory_cfg = dict(config) - model_cfg = None + + # check whether we have a proper model config, or load a default one + if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict): + warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name)) + + # refer to the model configuration in the cfg settings for this component if "model" in factory_cfg: - model_cfg = factory_cfg["model"] - if not isinstance(model_cfg, dict): - warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name)) - model_cfg = None + self.config[name] = {"model": factory_cfg["model"]} + + # create all objects in the config + factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"] + model = factory_cfg.get("model", None) + if model is not None: del factory_cfg["model"] - model = None - if model_cfg is not None: - self.config[name] = {"model": model_cfg} - model = registry.make_from_config({"model": model_cfg}, validate=True)[ - "model" - ] return factory(self, model, **factory_cfg) def add_pipe( @@ -517,10 +518,11 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None): + def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None): """Update the models in the pipeline. examples (iterable): A batch of `Example` or `Doc` objects. + dummy: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. sgd (callable): An optimizer. losses (dict): Dictionary to update with the loss, keyed by component. 
@@ -529,6 +531,9 @@ class Language(object): DOCS: https://spacy.io/api/language#update """ + if dummy is not None: + raise ValueError(Errors.E989) + if len(examples) == 0: return examples = Example.to_example_objects(examples, make_doc=self.make_doc) @@ -735,7 +740,7 @@ class Language(object): contexts = [ pipe.use_params(params) for name, pipe in self.pipeline - if hasattr(pipe, "use_params") + if hasattr(pipe, "use_params") and hasattr(pipe, "model") ] # TODO: Having trouble with contextlib # Workaround: these aren't actually context managers atm. diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 9cbaba984..00689e85b 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,7 +1,11 @@ +from pathlib import Path + from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear from ...util import registry +from ...kb import KnowledgeBase +from ...vocab import Vocab @registry.architectures.register("spacy.EntityLinker.v1") @@ -19,3 +23,11 @@ def build_nel_encoder(tok2vec, nO=None): model.set_ref("output_layer", output_layer) model.set_ref("tok2vec", tok2vec) return model + + +@registry.assets.register("spacy.KBFromFile.v1") +def load_kb(nlp_path, kb_path) -> KnowledgeBase: + vocab = Vocab().from_disk(Path(nlp_path) / "vocab") + kb = KnowledgeBase(vocab=vocab) + kb.load_bulk(kb_path) + return kb diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 4ff956e1d..56fe54664 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -2,6 +2,7 @@ import numpy import srsly import random +from ast import literal_eval from thinc.api import CosineDistance, to_categorical, get_array_module from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy @@ -1244,15 +1245,20 @@ class EntityLinker(Pipe): self.vocab = vocab self.model = model self.kb = None + self.kb = cfg.get("kb", None) + if self.kb is None: + # create an 
empty KB that should be filled by calling from_disk + self.kb = KnowledgeBase(vocab=vocab) + else: + del cfg["kb"] # we don't want to duplicate its serialization + if not isinstance(self.kb, KnowledgeBase): + raise ValueError(Errors.E990.format(type=type(self.kb))) self.cfg = dict(cfg) self.distance = CosineDistance(normalize=False) - def set_kb(self, kb): - self.kb = kb - def require_kb(self): # Raise an error if the knowledge base is not initialized. - if getattr(self, "kb", None) in (None, True, False): + if len(self.kb) == 0: raise ValueError(Errors.E139.format(name=self.name)) def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): @@ -1285,6 +1291,8 @@ class EntityLinker(Pipe): ents_by_offset[(ent.start_char, ent.end_char)] = ent for entity, kb_dict in gold.links.items(): + if isinstance(entity, str): + entity = literal_eval(entity) start, end = entity mention = doc.text[start:end] @@ -1375,7 +1383,6 @@ class EntityLinker(Pipe): def predict(self, docs): """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ self.require_kb() - entity_count = 0 final_kb_ids = [] final_tensors = [] @@ -1486,9 +1493,8 @@ class EntityLinker(Pipe): raise ValueError(Errors.E149) def load_kb(p): - kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"]) - kb.load_bulk(p) - self.set_kb(kb) + self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"]) + self.kb.load_bulk(p) deserialize = {} deserialize["vocab"] = lambda p: self.vocab.from_disk(p) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index cdd8451fd..32b434e04 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -203,8 +203,8 @@ def test_preserving_links_asdoc(nlp): ruler.add_patterns(patterns) nlp.add_pipe(ruler) - el_pipe = nlp.create_pipe(name="entity_linker") - el_pipe.set_kb(mykb) + cfg = 
{"kb": mykb, "incl_prior": False} + el_pipe = nlp.create_pipe(name="entity_linker", config=cfg) el_pipe.begin_training() el_pipe.incl_context = False el_pipe.incl_prior = True @@ -288,8 +288,7 @@ def test_overfitting_IO(): mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5]) # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.create_pipe("entity_linker") - entity_linker.set_kb(mykb) + entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb}) nlp.add_pipe(entity_linker, last=True) # train the NEL pipe diff --git a/spacy/util.py b/spacy/util.py index 048d923ee..f39813694 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -34,6 +34,7 @@ class registry(thinc.registry): lookups = catalogue.create("spacy", "lookups", entry_points=True) factories = catalogue.create("spacy", "factories", entry_points=True) displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) + assets = catalogue.create("spacy", "assets", entry_points=True) def set_env_log(value): @@ -160,6 +161,7 @@ def load_model_from_path(model_path, meta=False, **overrides): for name in pipeline: if name not in disable: config = meta.get("pipeline_args", {}).get(name, {}) + config.update(overrides) factory = factories.get(name, name) if nlp_config.get(name, None): model_config = nlp_config[name]["model"] From 24efd54a42e7e5f22b040018f222d24867e83a87 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 20 May 2020 12:27:31 +0200 Subject: [PATCH 115/187] Merge from develop --- spacy/cli/train_from_config.py | 5 ++++- spacy/syntax/_parser_model.pyx | 6 +++++- spacy/util.py | 13 +++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 96c5b676e..54eedf69e 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -224,8 +224,9 @@ def train_from_config( def create_train_batches(nlp, 
corpus, cfg): + is_first = True while True: - train_examples = list(corpus.train_dataset( + train_examples = corpus.train_dataset( nlp, noise_level=0.0, orth_variant_level=cfg["orth_variant_level"], @@ -323,6 +324,8 @@ def train_while_improving( for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) for name, proc in nlp.pipeline: + for name, proc in nlp.pipeline: + if hasattr(proc, "model"): proc.model.finish_update(optimizer) optimizer.step_schedules() if not (step % eval_frequency): diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 69f5bd6f6..60d22a1ab 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -474,7 +474,11 @@ cdef class precompute_hiddens: # This will usually be on GPU d_best = ops.asarray(d_best) # Fix nans (which can occur from unseen classes.) - d_best[ops.xp.isnan(d_best)] = 0. + try: + d_best[ops.xp.isnan(d_best)] = 0. + except: + print(ops.xp.isnan(d_best)) + raise if self.activation == "maxout": mask_ = ops.asarray(mask) return ops.backprop_maxout(d_best, mask_, self.nP) diff --git a/spacy/util.py b/spacy/util.py index f39813694..7f35c2f7c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -598,16 +598,9 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0 try: example = next(examples) except StopIteration: - if oversize: - examples = iter(oversize) - oversize = [] - if batch: - yield batch - break - else: - if batch: - yield batch - return + if batch: + yield batch + return n_words = count_words(example.doc) if n_words < (batch_size + tol_size): batch_size -= n_words From fda7355508cdb246f0ca12da1fd76b9c35cd8fa2 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 20 May 2020 12:30:21 +0200 Subject: [PATCH 116/187] Fix train-from-config --- spacy/cli/train_from_config.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/cli/train_from_config.py 
b/spacy/cli/train_from_config.py index 54eedf69e..429a3cf49 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -226,7 +226,7 @@ def train_from_config( def create_train_batches(nlp, corpus, cfg): is_first = True while True: - train_examples = corpus.train_dataset( + train_examples = list(corpus.train_dataset( nlp, noise_level=0.0, orth_variant_level=cfg["orth_variant_level"], @@ -324,9 +324,8 @@ def train_while_improving( for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) for name, proc in nlp.pipeline: - for name, proc in nlp.pipeline: - if hasattr(proc, "model"): - proc.model.finish_update(optimizer) + if hasattr(proc, "model"): + proc.model.finish_update(optimizer) optimizer.step_schedules() if not (step % eval_frequency): score, other_scores = evaluate() From 60e8da481300da3540138d2689f73324a07b071b Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 20 May 2020 12:56:27 +0200 Subject: [PATCH 117/187] Tidy up train-from-config a bit --- spacy/cli/train_from_config.py | 38 +++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 429a3cf49..c75c861cc 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -193,10 +193,11 @@ def train_from_config( optimizer, train_batches, evaluate, - training["dropout"], - training["patience"], - training["eval_frequency"], - training["accumulate_gradient"] + dropout=training["dropout"], + accumulate_gradient=training["accumulate_gradient"], + patience=training.get("patience", 0), + max_steps=training.get("max_steps", 0), + eval_frequency=training["eval_frequency"], ) msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") @@ -214,17 +215,17 @@ def train_from_config( progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) finally: if output_path is not None: - with nlp.use_params(optimizer.averages): - final_model_path = output_path / "model-final" + final_model_path = output_path / "model-final" + if optimizer.averages: + with nlp.use_params(optimizer.averages): + nlp.to_disk(final_model_path) + else: nlp.to_disk(final_model_path) msg.good("Saved model to output directory", final_model_path) - # with msg.loading("Creating best model..."): - # best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) - # msg.good("Created best model", best_model_path) def create_train_batches(nlp, corpus, cfg): - is_first = True + epochs_todo = cfg.get("max_epochs", 0) while True: train_examples = list(corpus.train_dataset( nlp, @@ -240,6 +241,11 @@ def create_train_batches(nlp, corpus, cfg): batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"]) for batch in batches: yield batch + epochs_todo -= 1 + # We intentionally compare exactly to 0 here, so that max_epochs < 1 + # will not break. + if epochs_todo == 0: + break def create_evaluation_callback(nlp, optimizer, corpus, cfg): @@ -270,8 +276,8 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): def train_while_improving( - nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency, - accumulate_gradient + nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency, + accumulate_gradient=1, patience=0, max_steps=0 ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -281,6 +287,7 @@ def train_while_improving( Positional arguments: nlp: The spaCy pipeline to evaluate. + optimizer: The optimizer callable. train_data (Iterable[Batch]): A generator of batches, with the training data. Each batch should be a Sized[Tuple[Input, Annot]]. 
The training data iterable needs to take care of iterating over the epochs and @@ -344,9 +351,12 @@ def train_while_improving( yield batch, info, is_best_checkpoint if is_best_checkpoint is not None: losses = {} - # Stop if no improvement in `patience` updates + # Stop if no improvement in `patience` updates (if specified) best_score, best_step = max(results) - if (step - best_step) >= patience: + if patience and (step - best_step) >= patience: + break + # Stop if we've exhausted our max steps (if specified) + if max_steps and (step * accumulate_gradient) >= max_steps: break From 609c0ba557964f4b3111c4c253571b6d57377d18 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 20 May 2020 18:48:18 +0200 Subject: [PATCH 118/187] Fix accidentally quadratic runtime in Example.split_sents (#5464) * Tidy up train-from-config a bit * Fix accidentally quadratic perf in TokenAnnotation.brackets When we're reading in the gold data, we had a nested loop where we looped over the brackets for each token, looking for brackets that start on that word. This is accidentally quadratic, because we have one bracket per word (for the POS tags). So we had an O(N**2) behaviour here that ended up being pretty slow. To solve this I'm indexing the brackets by their starting word on the TokenAnnotations object, and having a property to provide the previous view. 
* Fixes --- spacy/cli/train_from_config.py | 38 +++++++++++++++++++++------------- spacy/gold.pxd | 2 +- spacy/gold.pyx | 28 +++++++++++++++++++------ 3 files changed, 47 insertions(+), 21 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 429a3cf49..c75c861cc 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -193,10 +193,11 @@ def train_from_config( optimizer, train_batches, evaluate, - training["dropout"], - training["patience"], - training["eval_frequency"], - training["accumulate_gradient"] + dropout=training["dropout"], + accumulate_gradient=training["accumulate_gradient"], + patience=training.get("patience", 0), + max_steps=training.get("max_steps", 0), + eval_frequency=training["eval_frequency"], ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") @@ -214,17 +215,17 @@ def train_from_config( progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) finally: if output_path is not None: - with nlp.use_params(optimizer.averages): - final_model_path = output_path / "model-final" + final_model_path = output_path / "model-final" + if optimizer.averages: + with nlp.use_params(optimizer.averages): + nlp.to_disk(final_model_path) + else: nlp.to_disk(final_model_path) msg.good("Saved model to output directory", final_model_path) - # with msg.loading("Creating best model..."): - # best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) - # msg.good("Created best model", best_model_path) def create_train_batches(nlp, corpus, cfg): - is_first = True + epochs_todo = cfg.get("max_epochs", 0) while True: train_examples = list(corpus.train_dataset( nlp, @@ -240,6 +241,11 @@ def create_train_batches(nlp, corpus, cfg): batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"]) for batch in batches: yield batch + epochs_todo -= 1 + # We intentionally compare exactly to 0 here, so that max_epochs < 1 + # will not break. 
+ if epochs_todo == 0: + break def create_evaluation_callback(nlp, optimizer, corpus, cfg): @@ -270,8 +276,8 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): def train_while_improving( - nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency, - accumulate_gradient + nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency, + accumulate_gradient=1, patience=0, max_steps=0 ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -281,6 +287,7 @@ def train_while_improving( Positional arguments: nlp: The spaCy pipeline to evaluate. + optimizer: The optimizer callable. train_data (Iterable[Batch]): A generator of batches, with the training data. Each batch should be a Sized[Tuple[Input, Annot]]. The training data iterable needs to take care of iterating over the epochs and @@ -344,9 +351,12 @@ def train_while_improving( yield batch, info, is_best_checkpoint if is_best_checkpoint is not None: losses = {} - # Stop if no improvement in `patience` updates + # Stop if no improvement in `patience` updates (if specified) best_score, best_step = max(results) - if (step - best_step) >= patience: + if patience and (step - best_step) >= patience: + break + # Stop if we've exhausted our max steps (if specified) + if max_steps and (step * accumulate_gradient) >= max_steps: break diff --git a/spacy/gold.pxd b/spacy/gold.pxd index c5ab6ebbe..bf724868f 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -53,7 +53,7 @@ cdef class TokenAnnotation: cdef public list deps cdef public list entities cdef public list sent_starts - cdef public list brackets + cdef public dict brackets_by_start cdef class DocAnnotation: diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 46a6ae583..1864b7a04 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -658,7 +658,18 @@ cdef class TokenAnnotation: self.deps = deps if deps else [] self.entities = entities if entities else [] 
self.sent_starts = sent_starts if sent_starts else [] - self.brackets = brackets if brackets else [] + self.brackets_by_start = {} + if brackets: + for b_start, b_end, b_label in brackets: + self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) + + @property + def brackets(self): + brackets = [] + for start, ends_labels in self.brackets_by_start.items(): + for end, label in ends_labels: + brackets.append((start, end, label)) + return brackets @classmethod def from_dict(cls, token_dict): @@ -811,8 +822,10 @@ cdef class Example: s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] s_brackets = [] sent_start_i = 0 - t = self.token_annotation + cdef TokenAnnotation t = self.token_annotation split_examples = [] + cdef int b_start, b_end + cdef unicode b_label for i in range(len(t.words)): if i > 0 and t.sent_starts[i] == 1: s_example.set_token_annotation(ids=s_ids, @@ -836,9 +849,10 @@ cdef class Example: s_deps.append(t.get_dep(i)) s_ents.append(t.get_entity(i)) s_sent_starts.append(t.get_sent_start(i)) - s_brackets.extend((b[0] - sent_start_i, - b[1] - sent_start_i, b[2]) - for b in t.brackets if b[0] == i) + for b_end, b_label in t.brackets_by_start.get(i, []): + s_brackets.append( + (i - sent_start_i, b_end - sent_start_i, b_label) + ) i += 1 s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, @@ -904,8 +918,10 @@ cdef class Example: examples = [examples] converted_examples = [] for ex in examples: + if isinstance(ex, Example): + converted_examples.append(ex) # convert string to Doc to Example - if isinstance(ex, str): + elif isinstance(ex, str): if keep_raw_text: converted_examples.append(Example(doc=ex)) else: From 4b229bfc220f1c8ab63ac2fa9b17365689d4c5a2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 20 May 2020 18:48:51 +0200 Subject: [PATCH 119/187] Improve handling of NER in CoNLL-U MISC --- spacy/cli/converters/conllu2json.py | 45 
+++++++++++++++-------------- spacy/tests/test_cli.py | 28 +++++++++++++----- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index ecdc2ae66..0b2920802 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -26,7 +26,7 @@ def conllu2json( Extract NER tags if available and convert them so that they follow BILUO and the Wikipedia scheme """ - MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?" + MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$" msg = Printer(no_print=no_print) n_sents_info(msg, n_sents) docs = [] @@ -39,7 +39,7 @@ def conllu2json( ner_map=ner_map, merge_subtokens=merge_subtokens, ) - has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN) + has_ner_tags = has_ner(input_data, MISC_NER_PATTERN) for i, example in enumerate(conll_data): raw += example.text sentences.append( @@ -65,21 +65,20 @@ def conllu2json( def has_ner(input_data, ner_tag_pattern): """ - Check the 10th column of the first token to determine if the file contains - NER tags + Check the MISC column for NER tags. 
""" for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: while lines[0].startswith("#"): lines.pop(0) - if lines: - parts = lines[0].split("\t") + for line in lines: + parts = line.split("\t") id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts - if re.search(ner_tag_pattern, misc): - return True - else: - return False + for misc_part in misc.split("|"): + if re.match(ner_tag_pattern, misc_part): + return True + return False def read_conllx( @@ -127,19 +126,21 @@ def get_entities(lines, tag_pattern, ner_map=None): iob = [] for misc in miscs: - tag_match = re.search(tag_pattern, misc) iob_tag = "O" - if tag_match: - prefix = tag_match.group(2) - suffix = tag_match.group(3) - if prefix and suffix: - iob_tag = prefix + "-" + suffix - if ner_map: - suffix = ner_map.get(suffix, suffix) - if suffix == "": - iob_tag = "O" - else: - iob_tag = prefix + "-" + suffix + for misc_part in misc.split("|"): + tag_match = re.match(tag_pattern, misc_part) + if tag_match: + prefix = tag_match.group(2) + suffix = tag_match.group(3) + if prefix and suffix: + iob_tag = prefix + "-" + suffix + if ner_map: + suffix = ner_map.get(suffix, suffix) + if suffix == "": + iob_tag = "O" + else: + iob_tag = prefix + "-" + suffix + break iob.append(iob_tag) return iob_to_biluo(iob) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 306adc881..132f7ac9f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -29,14 +29,26 @@ def test_cli_converters_conllu2json(): assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] -def test_cli_converters_conllu2json_name_ner_map(): - lines = [ - "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", - "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", - "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", - "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", 
- "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", - ] +@pytest.mark.parametrize( + "lines", + [ + ( + "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", + "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", + "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", + "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", + ), + ( + "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\t_", + "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|NE=B-PER", + "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tNE=L-PER", + "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tNE=B-BAD", + ), + ], +) +def test_cli_converters_conllu2json_name_ner_map(lines): input_data = "\n".join(lines) converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) assert len(converted) == 1 From f44897e4c67e26228c143afd5edcf716b1e4912f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 18:39:11 +0200 Subject: [PATCH 120/187] Update warning IDs --- spacy/morphology.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 0b53b124c..f7e38bbea 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -42,7 +42,7 @@ def _normalize_props(props): elif isinstance(key, (int, str)) and isinstance(value, (int, str)): out[key] = value else: - warnings.warn(Warnings.W029.format(feature={key: value})) + warnings.warn(Warnings.W095.format(feature={key: value})) return out @@ -112,7 +112,7 @@ cdef class Morphology: return tag_ptr.key features = self.feats_to_dict(features) if not isinstance(features, dict): - warnings.warn(Warnings.W029.format(feature=features)) + 
warnings.warn(Warnings.W095.format(feature=features)) features = {} features = _normalize_props(features) string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} From d34fc0915eda68114d088394a7fee304039d0486 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 18:48:21 +0200 Subject: [PATCH 121/187] Remove serialization getter --- spacy/vocab.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ab240df90..19896f07b 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -496,7 +496,6 @@ cdef class Vocab: getters = { "strings": lambda: self.strings.to_bytes(), - "lexemes": lambda: self.lexemes_to_bytes(), "vectors": deserialize_vectors, "lookups": lambda: self.lookups.to_bytes(), "lookups_extra": lambda: self.lookups_extra.to_bytes() From 631e20d0c64635dc8d2512ddd068293325ef5ebe Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 19:01:02 +0200 Subject: [PATCH 122/187] Fix test and schemas --- spacy/schemas.py | 1 + spacy/tests/parser/test_ner.py | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index 3b6313db8..3024326dd 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -62,6 +62,7 @@ class TokenPatternNumber(BaseModel): IN: Optional[List[StrictInt]] = None NOT_IN: Optional[List[StrictInt]] = None EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==") + NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=") GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=") LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=") GT: Union[StrictInt, StrictFloat] = Field(None, alias=">") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index e78cac757..8e41a16c0 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -198,11 +198,7 @@ def test_train_empty(): batches = util.minibatch(train_data) for batch in 
batches: texts, annotations = zip(*batch) - nlp.update( - texts, # batch of texts - annotations, # batch of annotations - losses=losses, - ) + nlp.update(train_data, losses=losses) def test_overwrite_token(): From f075655debdd35e2cd648bd845b8b966edb5c733 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 19:26:29 +0200 Subject: [PATCH 123/187] Fix shape inference in begin_training --- spacy/ml/models/parser.py | 3 +-- spacy/ml/models/tagger.py | 3 +-- spacy/ml/tb_framework.py | 4 ++-- spacy/pipeline/pipes.pyx | 7 ++++++- spacy/syntax/nn_parser.pyx | 4 ++++ 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 710d36a1d..0e0857ca8 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -15,10 +15,9 @@ def build_tb_parser_model( use_upper=True, nO=None, ): - token_vector_width = tok2vec.get_dim("nO") tok2vec = chain( tok2vec, - with_array(Linear(hidden_width, token_vector_width)), + with_array(Linear(hidden_width)), list2array(), ) tok2vec.set_dim("nO", hidden_width) diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 683c8b518..87256cb5c 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -6,9 +6,8 @@ from ...util import registry @registry.architectures.register("spacy.Tagger.v1") def build_tagger_model(tok2vec, nO=None) -> Model: - token_vector_width = tok2vec.get_dim("nO") # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! 
- output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init) + output_layer = Softmax(nO, init_W=zero_init) softmax = with_array(output_layer) model = chain(tok2vec, softmax) model.set_ref("tok2vec", tok2vec) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index e4301a644..251189389 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -38,8 +38,8 @@ def forward(model, X, is_train): def init(model, X=None, Y=None): - tok2vec = model.get_ref("tok2vec").initialize() - lower = model.get_ref("lower").initialize(X=X) + tok2vec = model.get_ref("tok2vec").initialize(X=X) + lower = model.get_ref("lower").initialize() if model.attrs["has_upper"]: statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) model.get_ref("upper").initialize(X=statevecs) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 56fe54664..00c8894fd 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -531,7 +531,12 @@ class Tagger(Pipe): vocab.morphology.lemmatizer, exc=vocab.morphology.exc) self.set_output(len(self.labels)) - self.model.initialize() + doc_sample = [Doc(self.vocab, words=["hello", "world"])] + for name, component in pipeline: + if component is self: + break + doc_sample = list(component.pipe(doc_sample)) + self.model.initialize(X=doc_sample) # Get batch of example docs, example outputs to call begin_training(). # This lets the model infer shapes. 
link_vectors_to_models(self.vocab) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 31aa4d413..94369a828 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -629,6 +629,10 @@ cdef class Parser: for doc, gold in parses: doc_sample.append(doc) gold_sample.append(gold) + for name, component in pipeline: + if component is self: + break + doc_sample = list(component.pipe(doc_sample)) self.model.initialize(doc_sample, gold_sample) if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) From 3b5cfec1fcf34e45d86fd2b133120be13141488a Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 19:32:04 +0200 Subject: [PATCH 124/187] Tweak memory management in train_from_config --- spacy/cli/train_from_config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index c75c861cc..eeb21c10c 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -213,6 +213,12 @@ def train_from_config( if is_best_checkpoint and output_path is not None: nlp.to_disk(output_path) progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) + # Clean up the objects to faciliate garbage collection. 
+ for eg in batch: + eg.doc = None + eg.goldparse = None + eg.doc_annotation = None + eg.token_annotation = None finally: if output_path is not None: final_model_path = output_path / "model-final" From 245f91df78e2fd3977ec5b937bac67d3689dd41c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 19:42:13 +0200 Subject: [PATCH 125/187] Fix merge issues --- spacy/gold.pyx | 6 ++++++ spacy/tests/regression/test_issue5137.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 13e448342..5aa7da456 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1050,6 +1050,12 @@ cdef class GoldParse: # avoid allocating memory if the doc does not contain any tokens if self.length == 0: + self.words = [] + self.tags = [] + self.heads = [] + self.labels = [] + self.ner = [] + self.morphs = [] # set a minimal orig so that the scorer can score an empty doc self.orig = TokenAnnotation(ids=[]) else: diff --git a/spacy/tests/regression/test_issue5137.py b/spacy/tests/regression/test_issue5137.py index 4b4e597d3..e9fd268c8 100644 --- a/spacy/tests/regression/test_issue5137.py +++ b/spacy/tests/regression/test_issue5137.py @@ -21,7 +21,8 @@ def test_issue5137(): def from_disk(self, path, **cfg): pass - Language.factories["my_component"] = lambda nlp, **cfg: MyComponent(nlp, **cfg) + factory = lambda nlp, model, **cfg: MyComponent(nlp, **cfg) + Language.factories["my_component"] = factory nlp = English() nlp.add_pipe(nlp.create_pipe("my_component")) From 17ee9ab53acd5f39a2684e3442490201b66d2be4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 21 May 2020 19:49:08 +0200 Subject: [PATCH 126/187] Fix _SP/POS=SPACE in strings serialization tests --- .../serialize/test_serialize_vocab_strings.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index f44426a1a..d3e82296e 100644 
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -8,6 +8,7 @@ from ..util import make_tempdir test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] test_strings_attrs = [(["rats", "are", "cute"], "Hello")] +default_strings = ("_SP", "POS=SPACE") @pytest.mark.xfail @@ -34,8 +35,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP - assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"]) + assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE + assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings)) @pytest.mark.parametrize("strings1,strings2", test_strings) @@ -50,15 +51,15 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): vocab1_d = Vocab().from_disk(file_path1) vocab2_d = Vocab().from_disk(file_path2) # check strings rather than lexemes, which are only reloaded on demand - assert strings1 == [s for s in vocab1_d.strings if s != "_SP"] - assert strings2 == [s for s in vocab2_d.strings if s != "_SP"] + assert strings1 == [s for s in vocab1_d.strings if s not in default_strings] + assert strings2 == [s for s in vocab2_d.strings if s not in default_strings] if strings1 == strings2: - assert [s for s in vocab1_d.strings if s != "_SP"] == [ - s for s in vocab2_d.strings if s != "_SP" + assert [s for s in vocab1_d.strings if s not in default_strings] == [ + s for s in vocab2_d.strings if s not in default_strings ] else: - assert [s for s in vocab1_d.strings if s != "_SP"] != [ - s for s in vocab2_d.strings if s != "_SP" + assert [s for s in vocab1_d.strings if s not in default_strings] != [ + s for s in vocab2_d.strings if s not in default_strings ] @@ -78,7 +79,7 @@ def 
test_deserialize_vocab_seen_entries(strings, lex_attr): # Reported in #2153 vocab = Vocab(strings=strings) vocab.from_bytes(vocab.to_bytes()) - assert len(vocab.strings) == len(strings) + 1 # adds _SP + assert len(vocab.strings) == len(strings) + 2 # adds _SP and POS=SPACE @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) From 581bda9f985eba04e01c69c2c2f0a978ae6e6684 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 20:17:14 +0200 Subject: [PATCH 127/187] Update senter test and auto-format --- spacy/tests/pipeline/test_senter.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 197fdca6e..041da2c9f 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -12,14 +12,21 @@ def test_label_types(): with pytest.raises(NotImplementedError): nlp.get_pipe("senter").add_label("A") + SENT_STARTS = [0] * 14 SENT_STARTS[0] = 1 SENT_STARTS[5] = 1 SENT_STARTS[9] = 1 TRAIN_DATA = [ - ("I like green eggs. Eat blue ham. I like purple eggs.", {"sent_starts": SENT_STARTS}), - ("She likes purple eggs. They hate ham. You like yellow eggs.", {"sent_starts": SENT_STARTS}), + ( + "I like green eggs. Eat blue ham. I like purple eggs.", + {"sent_starts": SENT_STARTS}, + ), + ( + "She likes purple eggs. They hate ham. You like yellow eggs.", + {"sent_starts": SENT_STARTS}, + ), ] @@ -36,7 +43,7 @@ def test_overfitting_IO(): assert losses["senter"] < 0.001 # test the trained model - test_text = "I like purple eggs. They eat ham. You like yellow eggs." 
+ test_text = TRAIN_DATA[0][0] doc = nlp(test_text) gold_sent_starts = [0] * 14 gold_sent_starts[0] = 1 From df87c32a4068484471f5ce53b1f4eb7e4f9e4c43 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 20:17:24 +0200 Subject: [PATCH 128/187] Pass smaller doc sample into model initialize --- spacy/syntax/nn_parser.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 94369a828..ed4697302 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -624,11 +624,12 @@ cdef class Parser: sgd = self.create_optimizer() doc_sample = [] gold_sample = [] - for example in islice(get_examples(), 1000): + for example in islice(get_examples(), 10): parses = example.get_gold_parses(merge=False, vocab=self.vocab) for doc, gold in parses: - doc_sample.append(doc) - gold_sample.append(gold) + if len(doc): + doc_sample.append(doc) + gold_sample.append(gold) for name, component in pipeline: if component is self: break From d507ac28d8db197e8eac6b8c420ef3502af0a006 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 20:46:10 +0200 Subject: [PATCH 129/187] Fix shape inference --- spacy/ml/models/parser.py | 3 ++- spacy/ml/models/tagger.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 0e0857ca8..bdcd709b1 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -15,9 +15,10 @@ def build_tb_parser_model( use_upper=True, nO=None, ): + t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None tok2vec = chain( tok2vec, - with_array(Linear(hidden_width)), + with_array(Linear(hidden_width, t2v_width)), list2array(), ) tok2vec.set_dim("nO", hidden_width) diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 87256cb5c..00e268ede 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -7,7 +7,8 @@ from ...util import 
registry @registry.architectures.register("spacy.Tagger.v1") def build_tagger_model(tok2vec, nO=None) -> Model: # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! - output_layer = Softmax(nO, init_W=zero_init) + t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None + output_layer = Softmax(nO, t2v_width, init_W=zero_init) softmax = with_array(output_layer) model = chain(tok2vec, softmax) model.set_ref("tok2vec", tok2vec) From bc94fdabd0ec7362a68f38aa8cbb0b80f818f243 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 20:46:21 +0200 Subject: [PATCH 130/187] Fix begin_training --- spacy/pipeline/pipes.pyx | 12 ++++++++---- spacy/syntax/nn_parser.pyx | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 00c8894fd..f75ed1659 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -532,10 +532,14 @@ class Tagger(Pipe): exc=vocab.morphology.exc) self.set_output(len(self.labels)) doc_sample = [Doc(self.vocab, words=["hello", "world"])] - for name, component in pipeline: - if component is self: - break - doc_sample = list(component.pipe(doc_sample)) + if pipeline is not None: + for name, component in pipeline: + if component is self: + break + if hasattr(component, "pipe"): + doc_sample = list(component.pipe(doc_sample)) + else: + doc_sample = [component(doc) for doc in doc_sample] self.model.initialize(X=doc_sample) # Get batch of example docs, example outputs to call begin_training(). # This lets the model infer shapes. 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index ed4697302..f8e819268 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -630,11 +630,19 @@ cdef class Parser: if len(doc): doc_sample.append(doc) gold_sample.append(gold) - for name, component in pipeline: - if component is self: - break - doc_sample = list(component.pipe(doc_sample)) - self.model.initialize(doc_sample, gold_sample) + + if pipeline is not None: + for name, component in pipeline: + if component is self: + break + if hasattr(component, "pipe"): + doc_sample = list(component.pipe(doc_sample)) + else: + doc_sample = [component(doc) for doc in doc_sample] + if doc_sample: + self.model.initialize(doc_sample) + else: + self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) link_vectors_to_models(self.vocab) From 25b51f4fc8a102fd1c83d62d078f071823f222eb Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 May 2020 20:47:52 +0200 Subject: [PATCH 131/187] Set version to v3.0.0.dev9 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 3af1b77a0..04a660ad1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev8" +__version__ = "3.0.0.dev9" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 6e6db6afb62a0377bcd5f0c64220ad05f512c073 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 15:42:46 +0200 Subject: [PATCH 132/187] Better model compatibility and validation --- requirements.txt | 1 + setup.cfg | 1 + spacy/cli/info.py | 4 ++- spacy/cli/package.py | 5 ++-- spacy/cli/train.py | 2 +- spacy/cli/validate.py | 39 +++++++++++++++++---------- spacy/language.py | 2 +- spacy/util.py | 61 
+++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 96 insertions(+), 19 deletions(-) diff --git a/requirements.txt b/requirements.txt index e5f1ae10b..c43ffa7bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 +importlib_metadata>=0.20; python_version < "3.8" # Optional dependencies jsonschema>=2.6.0,<3.1.0 pydantic>=1.3.0,<2.0.0 diff --git a/setup.cfg b/setup.cfg index 1cd088279..eb7608c4e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,6 +56,7 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 tqdm>=4.38.0,<5.0.0 + importlib_metadata>=0.20; python_version < "3.8" [options.extras_require] lookups = diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 23f766368..d779eb2b3 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -48,7 +48,9 @@ def info( "Location": str(Path(__file__).parent.parent), "Platform": platform.platform(), "Python version": platform.python_version(), - "Models": ", ".join(model["name"] for model in all_models.values()), + "Models": ", ".join( + f"{m['name']} ({m['version']})" for m in all_models.values() + ), } if not silent: title = "Info about spaCy" diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8e27e44d0..cf93c872f 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg): ("lang", "Model language", meta.get("lang", "en")), ("name", "Model name", meta.get("name", "model")), ("version", "Model version", meta.get("version", "0.0.0")), - ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"), ("description", "Model description", meta.get("description", False)), ("author", "Author", meta.get("author", False)), ("email", "Author email", meta.get("email", False)), ("url", "Author website", meta.get("url", False)), - ("license", "License", meta.get("license", "CC BY-SA 3.0")), + ("license", "License", 
meta.get("license", "MIT")), ] nlp = util.load_model_from_path(Path(model_path)) + meta["spacy_version"] = about.__version__ meta["pipeline"] = nlp.pipe_names meta["vectors"] = { "width": nlp.vocab.vectors_length, @@ -168,6 +168,7 @@ def setup_package(): package_data={model_name: list_files(model_dir)}, install_requires=list_requirements(meta), zip_safe=False, + entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]} ) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 19e0a81e0..c205fa5b2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -467,7 +467,7 @@ def train( # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names - meta["spacy_version"] = f">={about.__version__}" + meta["spacy_version"] = about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index a23ce3453..c39cadc7b 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -4,6 +4,8 @@ import requests from wasabi import msg from .. 
import about +from ..util import get_package_version, get_installed_models, split_version +from ..util import get_package_path, get_model_meta, is_compatible_model def validate(): @@ -25,7 +27,7 @@ def validate(): msg.info(f"spaCy installation: {spacy_dir}") if model_pkgs: - header = ("NAME", "VERSION", "") + header = ("NAME", "SPACY", "VERSION", "") rows = [] for name, data in model_pkgs.items(): if data["compat"]: @@ -34,7 +36,7 @@ def validate(): else: version = msg.text(data["version"], color="red", no_print=True) comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" - rows.append((data["name"], version, comp)) + rows.append((data["name"], data["spacy"], version, comp)) msg.table(rows, header=header) else: msg.text("No models found in your current environment.", exits=0) @@ -44,8 +46,9 @@ def validate(): cmd = "python -m spacy download {}" print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") if na_models: - msg.warn( - f"The following models are not available for spaCy v{about.__version__}:", + msg.info( + f"The following models are custom spaCy models or not " + f"available for spaCy v{about.__version__}:", ", ".join(na_models), ) if incompat_models: @@ -53,8 +56,6 @@ def validate(): def get_model_pkgs(): - import pkg_resources - with msg.loading("Loading compatibility table..."): r = requests.get(about.__compatibility__) if r.status_code != 200: @@ -66,20 +67,30 @@ def get_model_pkgs(): msg.good("Loaded compatibility table") compat = r.json()["spacy"] all_models = set() + installed_models = get_installed_models() for spacy_v, models in dict(compat).items(): all_models.update(models.keys()) for model, model_vs in models.items(): compat[spacy_v][model] = [reformat_version(v) for v in model_vs] pkgs = {} - for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): + for pkg_name in installed_models: package = pkg_name.replace("-", "_") - if package in all_models: - version = pkg_data.version - pkgs[pkg_name] = { - "name": package, - 
"version": version, - "compat": package in compat and version in compat[package], - } + version = get_package_version(pkg_name) + if package in compat: + is_compat = version in compat[package] + v_maj, v_min = split_version(about.__version__) + spacy_version = f"{v_maj}.{v_min}" + else: + model_path = get_package_path(package) + model_meta = get_model_meta(model_path) + is_compat = is_compatible_model(model_meta) + spacy_version = model_meta.get("spacy_version", "n/a") + pkgs[pkg_name] = { + "name": package, + "version": version, + "spacy": spacy_version, + "compat": is_compat, + } return pkgs, compat diff --git a/spacy/language.py b/spacy/language.py index d71c27406..f770cda2c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -196,7 +196,7 @@ class Language(object): self._meta.setdefault("lang", self.lang) self._meta.setdefault("name", "model") self._meta.setdefault("version", "0.0.0") - self._meta.setdefault("spacy_version", f">={about.__version__}") + self._meta.setdefault("spacy_version", about.__version__) self._meta.setdefault("description", "") self._meta.setdefault("author", "") self._meta.setdefault("email", "") diff --git a/spacy/util.py b/spacy/util.py index 7f35c2f7c..5a7c633fa 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -21,9 +21,16 @@ try: except ImportError: cupy = None +try: # Python 3.8 + import importlib.metadata as importlib_metadata +except ImportError: + import importlib_metadata + from .symbols import ORTH from .compat import cupy, CudaStream from .errors import Errors, Warnings +from . import about + _PRINT_ENV = False @@ -35,6 +42,10 @@ class registry(thinc.registry): factories = catalogue.create("spacy", "factories", entry_points=True) displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) assets = catalogue.create("spacy", "assets", entry_points=True) + # This is mostly used to get a list of all installed models in the current + # environment. 
spaCy models packaged with `spacy package` will "advertise" + # themselves via entry points. + models = catalogue.create("spacy", "models", entry_points=True) def set_env_log(value): @@ -204,6 +215,56 @@ def load_model_from_init_py(init_file, **overrides): return load_model_from_path(data_path, meta, **overrides) +def get_installed_models(): + """List all model packages currently installed in the environment. + + RETURNS (list): The string names of the models. + """ + return list(registry.models.get_all().keys()) + + +def get_package_version(name): + """Get the version of an installed package. Typically used to get model + package versions. + + name (unicode): The name of the installed Python package. + RETURNS (unicode / None): The version or None if package not installed. + """ + try: + return importlib_metadata.version(name) + except importlib_metadata.PackageNotFoundError: + return None + + +def split_version(version): + """RETURNS (tuple): Two integers, the major and minor spaCy version.""" + pieces = version.split(".", 3) + return int(pieces[0]), int(pieces[1]) + + +def is_compatible_model(meta): + """Check if a model is compatible with the current version of spaCy, based + on its meta.json. We compare the version of spaCy the model was created with + with the current version. If the minor version is different, it's considered + incompatible. + + meta (dict): The model's meta. + RETURNS (bool / None): Whether the model is compatible with the current + spaCy or None if we don't have enough info. + """ + cur_v = about.__version__ + pkg_v = meta.get("spacy_version") + if not pkg_v or not isinstance(pkg_v, str): + return None + # Handle spacy_version values like >=x, Date: Fri, 22 May 2020 15:55:45 +0200 Subject: [PATCH 133/187] Guess set_annotations=True in nlp.update During `nlp.update`, components can be passed a boolean set_annotations to indicate whether they should assign annotations to the `Doc`. 
This needs to be called if downstream components expect to use the annotations during training, e.g. if we wanted to use tagger features in the parser. Components can specify their assignments and requirements, so we can figure out which components have these inter-dependencies. After figuring this out, we can guess whether to pass set_annotations=True. We could also call set_annotations=True always, or even just have this as the only behaviour. The downside of this is that it would require the `Doc` objects to be created afresh to avoid problematic modifications. One approach would be to make a fresh copy of the `Doc` objects within `nlp.update()`, so that we can write to the objects without any problems. If we do that, we can drop this logic and also drop the `set_annotations` mechanism. I would be fine with that approach, although it runs the risk of introducing some performance overhead, and we'll have to take care to copy all extension attributes etc. --- spacy/language.py | 24 +++++++++++++++++++++-- spacy/tests/pipeline/test_pipe_methods.py | 18 ++++++++++++++++- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index d71c27406..afc988583 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -545,13 +545,14 @@ class Language(object): if component_cfg is None: component_cfg = {} + component_deps = _count_pipeline_inter_dependencies(self.pipeline) # Determine whether component should set annotations. In theory I guess # we should do this by inspecting the meta? 
Or we could just always # say "yes" - for name, proc in self.pipeline: + for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) component_cfg[name].setdefault("drop", drop) - component_cfg[name].setdefault("set_annotations", False) + component_cfg[name]["set_annotations"] = bool(component_deps[i]) for name, proc in self.pipeline: if not hasattr(proc, "update"): continue @@ -1159,6 +1160,25 @@ class DisabledPipes(list): self[:] = [] +def _count_pipeline_inter_dependencies(pipeline): + """Count how many subsequent components require an annotation set by each + component in the pipeline. + """ + pipe_assigns = [] + pipe_requires = [] + for name, pipe in pipeline: + pipe_assigns.append(set(getattr(pipe, "assigns", []))) + pipe_requires.append(set(getattr(pipe, "requires", []))) + counts = [] + for i, assigns in enumerate(pipe_assigns): + count = 0 + for requires in pipe_requires[i+1:]: + if assigns.intersection(requires): + count += 1 + counts.append(count) + return counts + + def _pipe(examples, proc, kwargs): # We added some args for pipe that __call__ doesn't expect. 
kwargs = dict(kwargs) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index d42216655..0397d490d 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,5 +1,5 @@ import pytest -from spacy.language import Language +from spacy.language import Language, _count_pipeline_inter_dependencies @pytest.fixture @@ -198,3 +198,19 @@ def test_pipe_labels(nlp): assert len(nlp.pipe_labels) == len(input_labels) for name, labels in nlp.pipe_labels.items(): assert sorted(input_labels[name]) == sorted(labels) + + +def test_pipe_inter_dependencies(): + class Fancifier: + name = "fancifier" + assigns = ("doc._.fancy",) + requires = tuple() + + class FancyNeeder: + name = "needer" + assigns = tuple() + requires = ("doc._.fancy",) + + pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())] + counts = _count_pipeline_inter_dependencies(pipeline) + assert counts == [1, 0] From f7f6df7275ea2884fc47fa7823c6bcba1caa5cb4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 22 May 2020 16:43:18 +0200 Subject: [PATCH 134/187] Move to spacy.analysis --- spacy/analysis.py | 21 +++++++++++++++++++++ spacy/language.py | 22 ++-------------------- spacy/tests/pipeline/test_analysis.py | 17 +++++++++++++++++ spacy/tests/pipeline/test_pipe_methods.py | 18 +----------------- 4 files changed, 41 insertions(+), 37 deletions(-) diff --git a/spacy/analysis.py b/spacy/analysis.py index c2600048f..41591661c 100644 --- a/spacy/analysis.py +++ b/spacy/analysis.py @@ -173,3 +173,24 @@ def print_summary(nlp, pretty=True, no_print=False): msg.good("No problems found.") if no_print: return {"overview": overview, "problems": problems} + + +def count_pipeline_interdependencies(pipeline): + """Count how many subsequent components require an annotation set by each + component in the pipeline. 
+ """ + pipe_assigns = [] + pipe_requires = [] + for name, pipe in pipeline: + pipe_assigns.append(set(getattr(pipe, "assigns", []))) + pipe_requires.append(set(getattr(pipe, "requires", []))) + counts = [] + for i, assigns in enumerate(pipe_assigns): + count = 0 + for requires in pipe_requires[i+1:]: + if assigns.intersection(requires): + count += 1 + counts.append(count) + return counts + + diff --git a/spacy/language.py b/spacy/language.py index afc988583..b228c2155 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -18,6 +18,7 @@ from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs +from .analysis import count_pipeline_interdependencies from .gold import Example from .scorer import Scorer from .util import link_vectors_to_models, create_default_optimizer, registry @@ -545,7 +546,7 @@ class Language(object): if component_cfg is None: component_cfg = {} - component_deps = _count_pipeline_inter_dependencies(self.pipeline) + component_deps = count_pipeline_interdependencies(self.pipeline) # Determine whether component should set annotations. In theory I guess # we should do this by inspecting the meta? Or we could just always # say "yes" @@ -1160,25 +1161,6 @@ class DisabledPipes(list): self[:] = [] -def _count_pipeline_inter_dependencies(pipeline): - """Count how many subsequent components require an annotation set by each - component in the pipeline. - """ - pipe_assigns = [] - pipe_requires = [] - for name, pipe in pipeline: - pipe_assigns.append(set(getattr(pipe, "assigns", []))) - pipe_requires.append(set(getattr(pipe, "requires", []))) - counts = [] - for i, assigns in enumerate(pipe_assigns): - count = 0 - for requires in pipe_requires[i+1:]: - if assigns.intersection(requires): - count += 1 - counts.append(count) - return counts - - def _pipe(examples, proc, kwargs): # We added some args for pipe that __call__ doesn't expect. 
kwargs = dict(kwargs) diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index cda39f6ee..e608f2c34 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -2,6 +2,7 @@ import spacy.language from spacy.language import Language, component from spacy.analysis import print_summary, validate_attrs from spacy.analysis import get_assigns_for_attr, get_requires_for_attr +from spacy.analysis import count_pipeline_interdependencies from mock import Mock, ANY import pytest @@ -161,3 +162,19 @@ def test_analysis_validate_attrs_remove_pipe(): with pytest.warns(None) as record: nlp.remove_pipe("c2") assert not record.list + + +def test_pipe_interdependencies(): + class Fancifier: + name = "fancifier" + assigns = ("doc._.fancy",) + requires = tuple() + + class FancyNeeder: + name = "needer" + assigns = tuple() + requires = ("doc._.fancy",) + + pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())] + counts = count_pipeline_interdependencies(pipeline) + assert counts == [1, 0] diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 0397d490d..d42216655 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,5 +1,5 @@ import pytest -from spacy.language import Language, _count_pipeline_inter_dependencies +from spacy.language import Language @pytest.fixture @@ -198,19 +198,3 @@ def test_pipe_labels(nlp): assert len(nlp.pipe_labels) == len(input_labels) for name, labels in nlp.pipe_labels.items(): assert sorted(input_labels[name]) == sorted(labels) - - -def test_pipe_inter_dependencies(): - class Fancifier: - name = "fancifier" - assigns = ("doc._.fancy",) - requires = tuple() - - class FancyNeeder: - name = "needer" - assigns = tuple() - requires = ("doc._.fancy",) - - pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())] - counts = _count_pipeline_inter_dependencies(pipeline) 
- assert counts == [1, 0] From 12b7be1d9874048c1f3f20dffb833a88308544c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 16:49:26 +0200 Subject: [PATCH 135/187] Remove jsonschema from dependencies --- Makefile | 4 ++-- requirements.txt | 2 -- spacy/tests/package/test_requirements.py | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index cf96d6294..9916e3cf5 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data chmod a+rx $@ dist/pytest.pex : wheelhouse/pytest-*.whl @@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . 
-w ./wheelhouse - $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse + $(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex diff --git a/requirements.txt b/requirements.txt index c43ffa7bb..add083a05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,8 +14,6 @@ requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 importlib_metadata>=0.20; python_version < "3.8" -# Optional dependencies -jsonschema>=2.6.0,<3.1.0 pydantic>=1.3.0,<2.0.0 # Development dependencies cython>=0.25 diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index 59a8569ee..0dc0f9d6c 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -9,7 +9,6 @@ def test_build_dependencies(): "pytest-timeout", "mock", "flake8", - "jsonschema", ] libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] From d844528c5f62f27904d6925f16cc7d1ee3e16949 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 16:55:15 +0200 Subject: [PATCH 136/187] Add test for is_compatible_model --- spacy/tests/test_misc.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index c320b19c0..0a0f4c7be 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -2,6 +2,7 @@ import pytest import os import ctypes from pathlib import Path +from spacy.about import __version__ as spacy_version from spacy import util from spacy import prefer_gpu, require_gpu from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding @@ -87,3 +88,11 @@ def test_ascii_filenames(): root = Path(__file__).parent.parent for path in root.glob("**/*"): assert all(ord(c) < 128 for c in path.name), path.name + + +@pytest.mark.parametrize( + "version,compatible", + [(spacy_version, True), ("2.0.0", False), (">=1.2.3,<4.5.6", False)], +) +def 
test_is_compatible_model(version, compatible): + assert util.is_compatible_model({"spacy_version": version}) is compatible From 569a65b60e4205311817d1a8add57fa16b407de7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 16:55:42 +0200 Subject: [PATCH 137/187] Auto-format --- spacy/tests/test_misc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 0a0f4c7be..ddf1bb332 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -5,7 +5,8 @@ from pathlib import Path from spacy.about import __version__ as spacy_version from spacy import util from spacy import prefer_gpu, require_gpu -from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding +from spacy.ml._precomputable_affine import PrecomputableAffine +from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding @pytest.fixture From 4465cad6c5bc188f628dc92183e2e855e26bcfc4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 17:42:06 +0200 Subject: [PATCH 138/187] Rename spacy.analysis to spacy.pipe_analysis --- spacy/language.py | 23 ++++++++++++++++++----- spacy/{analysis.py => pipe_analysis.py} | 4 +--- spacy/tests/pipeline/test_analysis.py | 8 ++++---- 3 files changed, 23 insertions(+), 12 deletions(-) rename spacy/{analysis.py => pipe_analysis.py} (99%) diff --git a/spacy/language.py b/spacy/language.py index 8c44cf26b..5286bd3b9 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -17,8 +17,8 @@ from .tokens.underscore import Underscore from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups -from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs -from .analysis import count_pipeline_interdependencies +from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs +from .pipe_analysis import count_pipeline_interdependencies from .gold import Example from .scorer 
import Scorer from .util import link_vectors_to_models, create_default_optimizer, registry @@ -318,14 +318,18 @@ class Language(object): # check whether we have a proper model config, or load a default one if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict): - warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name)) + warnings.warn( + Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name) + ) # refer to the model configuration in the cfg settings for this component if "model" in factory_cfg: self.config[name] = {"model": factory_cfg["model"]} # create all objects in the config - factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"] + factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)[ + "config" + ] model = factory_cfg.get("model", None) if model is not None: del factory_cfg["model"] @@ -519,7 +523,16 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None): + def update( + self, + examples, + dummy=None, + *, + drop=0.0, + sgd=None, + losses=None, + component_cfg=None, + ): """Update the models in the pipeline. examples (iterable): A batch of `Example` or `Doc` objects. 
diff --git a/spacy/analysis.py b/spacy/pipe_analysis.py similarity index 99% rename from spacy/analysis.py rename to spacy/pipe_analysis.py index 41591661c..4c0950453 100644 --- a/spacy/analysis.py +++ b/spacy/pipe_analysis.py @@ -187,10 +187,8 @@ def count_pipeline_interdependencies(pipeline): counts = [] for i, assigns in enumerate(pipe_assigns): count = 0 - for requires in pipe_requires[i+1:]: + for requires in pipe_requires[i + 1 :]: if assigns.intersection(requires): count += 1 counts.append(count) return counts - - diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index e608f2c34..b826438f5 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -1,8 +1,8 @@ import spacy.language from spacy.language import Language, component -from spacy.analysis import print_summary, validate_attrs -from spacy.analysis import get_assigns_for_attr, get_requires_for_attr -from spacy.analysis import count_pipeline_interdependencies +from spacy.pipe_analysis import print_summary, validate_attrs +from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr +from spacy.pipe_analysis import count_pipeline_interdependencies from mock import Mock, ANY import pytest @@ -169,7 +169,7 @@ def test_pipe_interdependencies(): name = "fancifier" assigns = ("doc._.fancy",) requires = tuple() - + class FancyNeeder: name = "needer" assigns = tuple() From 2d9de8684df7d28477986eb497e13b403c03d9d9 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Fri, 22 May 2020 23:10:40 +0200 Subject: [PATCH 139/187] Support use_pytorch_for_gpu_memory config --- spacy/cli/train_from_config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index eeb21c10c..c0e3bd169 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -7,7 +7,7 @@ from pathlib import Path from wasabi import msg import thinc import 
thinc.schedules -from thinc.api import Model +from thinc.api import Model, use_pytorch_for_gpu_memory import random from ..gold import GoldCorpus @@ -171,6 +171,8 @@ def train_from_config( msg.info(f"Loading config from: {config_path}") config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) + if config["training"]["use_pytorch_for_gpu_memory"]: + use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) msg.info("Creating nlp from config") From f9786d765edf16afa092cf378a0a45fb321efe22 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 14:48:56 +0200 Subject: [PATCH 140/187] Simplify is_package check --- spacy/cli/download.py | 18 ++---------------- spacy/util.py | 13 +++++-------- 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 0230e272d..af132bbbe 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -5,6 +5,7 @@ import sys from wasabi import msg from .. import about +from ..util import is_package def download( @@ -17,7 +18,7 @@ def download( flag is set, the command expects the full model name with version. For direct downloads, the compatibility check will be skipped. """ - if not require_package("spacy") and "--no-deps" not in pip_args: + if not is_package("spacy") and "--no-deps" not in pip_args: msg.warn( "Skipping model package dependencies and setting `--no-deps`. " "You don't seem to have the spaCy package itself installed " @@ -45,21 +46,6 @@ def download( "Download and installation successful", f"You can now load the model via spacy.load('{model_name}')", ) - # If a model is downloaded and then loaded within the same process, our - # is_package check currently fails, because pkg_resources.working_set - # is not refreshed automatically (see #3923). We're trying to work - # around this here be requiring the package explicitly. 
- require_package(model_name) - - -def require_package(name): - try: - import pkg_resources - - pkg_resources.working_set.require(name) - return True - except: # noqa: E722 - return False def get_json(url, desc): diff --git a/spacy/util.py b/spacy/util.py index 5a7c633fa..41af881c9 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -341,14 +341,11 @@ def is_package(name): name (unicode): Name of package. RETURNS (bool): True if installed package, False if not. """ - import pkg_resources - - name = name.lower() # compare package name against lowercase name - packages = pkg_resources.working_set.by_key.keys() - for package in packages: - if package.lower().replace("-", "_") == name: - return True - return False + try: + importlib_metadata.distribution(name) + return True + except: # noqa: E722 + return False def get_package_path(name): From 387c7aba15228557cdbbfae0ee3ab90009769584 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 14:55:16 +0200 Subject: [PATCH 141/187] Update test --- spacy/tests/test_misc.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index ddf1bb332..9e67ae83b 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -26,10 +26,12 @@ def test_util_ensure_path_succeeds(text): assert isinstance(path, Path) -@pytest.mark.parametrize("package", ["numpy"]) -def test_util_is_package(package): +@pytest.mark.parametrize( + "package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)] +) +def test_util_is_package(package, result): """Test that an installed package via pip is recognised by util.is_package.""" - assert util.is_package(package) + assert util.is_package(package) is result @pytest.mark.parametrize("package", ["thinc"]) From 5d3806e059178c9516fb6cf57064cb10cfbf0f29 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 17:20:58 +0200 Subject: [PATCH 142/187] unicode -> str consistency --- 
spacy/cli/converters/conllu2json.py | 8 ++-- spacy/displacy/__init__.py | 8 ++-- spacy/displacy/render.py | 26 +++++------ spacy/errors.py | 2 +- spacy/glossary.py | 4 +- spacy/kb.pyx | 4 +- spacy/language.py | 26 +++++------ spacy/lemmatizer.py | 10 ++--- spacy/lexeme.pyx | 20 ++++----- spacy/lookups.py | 28 ++++++------ spacy/matcher/dependencymatcher.pyx | 2 +- spacy/matcher/matcher.pyx | 6 +-- spacy/matcher/phrasematcher.pyx | 6 +-- spacy/morphology.pyx | 4 +- spacy/pipe_analysis.py | 6 +-- spacy/pipeline/entityruler.py | 4 +- spacy/pipeline/functions.py | 2 +- spacy/strings.pyx | 6 +-- spacy/tokenizer.pyx | 16 +++---- spacy/tokens/doc.pyx | 14 +++--- spacy/tokens/span.pyx | 22 +++++----- spacy/tokens/token.pyx | 44 +++++++++---------- spacy/util.py | 36 +++++++-------- spacy/vectors.pyx | 8 ++-- spacy/vocab.pyx | 6 +-- website/docs/api/lexeme.md | 16 +++---- website/docs/api/vocab.md | 30 ++++++------- website/docs/usage/rule-based-matching.md | 53 +++++++++++++---------- 28 files changed, 212 insertions(+), 205 deletions(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 0b2920802..1ece755b8 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None): final entity type with `ner_map` if mapping present. Entity tag is 'O' if the pattern is not matched. - lines (unicode): CONLL-U lines for one sentences - tag_pattern (unicode): Regex pattern for entity tag + lines (str): CONLL-U lines for one sentences + tag_pattern (str): Regex pattern for entity tag ner_map (dict): Map old NER tag names to new ones, '' maps to O. RETURNS (list): List of BILUO entity tags """ @@ -187,8 +187,8 @@ def example_from_conllu_sentence( """Create an Example from the lines for one CoNLL-U sentence, merging subtokens and appending morphology to tags if required. 
- lines (unicode): The non-comment lines for a CoNLL-U sentence - ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col + lines (str): The non-comment lines for a CoNLL-U sentence + ner_tag_pattern (str): The regex pattern for matching NER in MISC col RETURNS (Example): An example containing the annotation """ # create a Doc with each subtoken as its own token diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 3f84dabce..2c377a043 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -22,13 +22,13 @@ def render( """Render displaCy visualisation. docs (list or Doc): Document(s) to visualise. - style (unicode): Visualisation style, 'dep' or 'ent'. + style (str): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. jupyter (bool): Override Jupyter auto-detection. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. - RETURNS (unicode): Rendered HTML markup. + RETURNS (str): Rendered HTML markup. DOCS: https://spacy.io/api/top-level#displacy.render USAGE: https://spacy.io/usage/visualizers @@ -73,13 +73,13 @@ def serve( """Serve displaCy visualisation. docs (list or Doc): Document(s) to visualise. - style (unicode): Visualisation style, 'dep' or 'ent'. + style (str): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. port (int): Port to serve visualisation. - host (unicode): Host to serve visualisation. + host (str): Host to serve visualisation. 
DOCS: https://spacy.io/api/top-level#displacy.serve USAGE: https://spacy.io/usage/visualizers diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 0d4cdb77f..d3572ce78 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -47,7 +47,7 @@ class DependencyRenderer(object): parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (unicode): Rendered SVG or HTML markup. + RETURNS (str): Rendered SVG or HTML markup. """ # Create a random ID prefix to make sure parses don't receive the # same ID, even if they're identical @@ -78,7 +78,7 @@ class DependencyRenderer(object): render_id (int): Unique ID, typically index of document. words (list): Individual words and their tags. arcs (list): Individual arcs and their start, end, direction and label. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ self.levels = self.get_levels(arcs) self.highest_level = len(self.levels) @@ -112,10 +112,10 @@ class DependencyRenderer(object): ): """Render individual word. - text (unicode): Word text. - tag (unicode): Part-of-speech tag. + text (str): Word text. + tag (str): Part-of-speech tag. i (int): Unique ID, typically word index. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. """ y = self.offset_y + self.word_spacing x = self.offset_x + i * self.distance @@ -131,12 +131,12 @@ class DependencyRenderer(object): def render_arrow(self, label, start, end, direction, i): """Render individual arrow. - label (unicode): Dependency label. + label (str): Dependency label. start (int): Index of start word. end (int): Index of end word. - direction (unicode): Arrow direction, 'left' or 'right'. + direction (str): Arrow direction, 'left' or 'right'. i (int): Unique ID, typically arrow index. - RETURNS (unicode): Rendered SVG markup. + RETURNS (str): Rendered SVG markup. 
""" if start < 0 or end < 0: error_args = dict(start=start, end=end, label=label, dir=direction) @@ -179,7 +179,7 @@ class DependencyRenderer(object): y (int): Y-coordinate of arrow start and end point. y_curve (int): Y-corrdinate of Cubic Bézier y_curve point. x_end (int): X-coordinate of arrow end point. - RETURNS (unicode): Definition of the arc path ('d' attribute). + RETURNS (str): Definition of the arc path ('d' attribute). """ template = "M{x},{y} C{x},{c} {e},{c} {e},{y}" if self.compact: @@ -189,11 +189,11 @@ class DependencyRenderer(object): def get_arrowhead(self, direction, x, y, end): """Render individual arrow head. - direction (unicode): Arrow direction, 'left' or 'right'. + direction (str): Arrow direction, 'left' or 'right'. x (int): X-coordinate of arrow start point. y (int): Y-coordinate of arrow start and end point. end (int): X-coordinate of arrow end point. - RETURNS (unicode): Definition of the arrow head path ('d' attribute). + RETURNS (str): Definition of the arrow head path ('d' attribute). """ if direction == "left": pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2) @@ -279,7 +279,7 @@ class EntityRenderer(object): parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (unicode): Rendered HTML markup. + RETURNS (str): Rendered HTML markup. """ rendered = [] for i, p in enumerate(parsed): @@ -300,7 +300,7 @@ class EntityRenderer(object): def render_ents(self, text, spans, title): """Render entities in text. - text (unicode): Original text. + text (str): Original text. spans (list): Individual entity spans and their start, end and label. title (unicode or None): Document title set in Doc.user_data['title']. 
""" diff --git a/spacy/errors.py b/spacy/errors.py index 4d38ab586..932bb1eff 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -598,7 +598,7 @@ class MatchPatternError(ValueError): def __init__(self, key, errors): """Custom error for validating match patterns. - key (unicode): The name of the matcher rule. + key (str): The name of the matcher rule. errors (dict): Validation errors (sequence of strings) mapped to pattern ID, i.e. the index of the added pattern. """ diff --git a/spacy/glossary.py b/spacy/glossary.py index 938a575cd..c4a6a5c45 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -1,8 +1,8 @@ def explain(term): """Get a description for a given POS tag, dependency label or entity type. - term (unicode): The term to explain. - RETURNS (unicode): The explanation, or `None` if not found in the glossary. + term (str): The term to explain. + RETURNS (str): The explanation, or `None` if not found in the glossary. EXAMPLE: >>> spacy.explain(u'NORP') diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 86a8d49b8..8d8464f3c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -38,7 +38,7 @@ cdef class Candidate: @property def entity_(self): - """RETURNS (unicode): ID/name of this entity in the KB""" + """RETURNS (str): ID/name of this entity in the KB""" return self.kb.vocab.strings[self.entity_hash] @property @@ -48,7 +48,7 @@ cdef class Candidate: @property def alias_(self): - """RETURNS (unicode): ID of the original alias""" + """RETURNS (str): ID of the original alias""" return self.kb.vocab.strings[self.alias_hash] @property diff --git a/spacy/language.py b/spacy/language.py index 5286bd3b9..e3b770723 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -122,7 +122,7 @@ class Language(object): Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. - lang (unicode): Two-letter language ID, i.e. ISO code. + lang (str): Two-letter language ID, i.e. ISO code. 
DOCS: https://spacy.io/api/language """ @@ -287,7 +287,7 @@ class Language(object): def get_pipe(self, name): """Get a pipeline component for a given component name. - name (unicode): Name of pipeline component to get. + name (str): Name of pipeline component to get. RETURNS (callable): The pipeline component. DOCS: https://spacy.io/api/language#get_pipe @@ -300,7 +300,7 @@ class Language(object): def create_pipe(self, name, config=dict()): """Create a pipeline component from a factory. - name (unicode): Factory name to look up in `Language.factories`. + name (str): Factory name to look up in `Language.factories`. config (dict): Configuration parameters to initialise component. RETURNS (callable): Pipeline component. @@ -343,12 +343,12 @@ class Language(object): of before/after/first/last can be set. Default behaviour is "last". component (callable): The pipeline component. - name (unicode): Name of pipeline component. Overwrites existing + name (str): Name of pipeline component. Overwrites existing component.name attribute if available. If no name is set and the component exposes no name attribute, component.__name__ is used. An error is raised if a name already exists in the pipeline. - before (unicode): Component name to insert component directly before. - after (unicode): Component name to insert component directly after. + before (str): Component name to insert component directly before. + after (str): Component name to insert component directly after. first (bool): Insert component first / not first in the pipeline. last (bool): Insert component last / not last in the pipeline. @@ -389,7 +389,7 @@ class Language(object): """Check if a component name is present in the pipeline. Equivalent to `name in nlp.pipe_names`. - name (unicode): Name of the component. + name (str): Name of the component. RETURNS (bool): Whether a component of the name exists in the pipeline. 
DOCS: https://spacy.io/api/language#has_pipe @@ -399,7 +399,7 @@ class Language(object): def replace_pipe(self, name, component): """Replace a component in the pipeline. - name (unicode): Name of the component to replace. + name (str): Name of the component to replace. component (callable): Pipeline component. DOCS: https://spacy.io/api/language#replace_pipe @@ -418,8 +418,8 @@ class Language(object): def rename_pipe(self, old_name, new_name): """Rename a pipeline component. - old_name (unicode): Name of the component to rename. - new_name (unicode): New name of the component. + old_name (str): Name of the component to rename. + new_name (str): New name of the component. DOCS: https://spacy.io/api/language#rename_pipe """ @@ -433,7 +433,7 @@ class Language(object): def remove_pipe(self, name): """Remove a component from the pipeline. - name (unicode): Name of the component to remove. + name (str): Name of the component to remove. RETURNS (tuple): A `(name, component)` tuple of the removed component. DOCS: https://spacy.io/api/language#remove_pipe @@ -450,7 +450,7 @@ class Language(object): and can contain arbitrary whitespace. Alignment into the original string is preserved. - text (unicode): The text to be processed. + text (str): The text to be processed. disable (list): Names of the pipeline components to disable. component_cfg (dict): An optional dictionary with extra keyword arguments for specific components. @@ -1086,7 +1086,7 @@ class component(object): ): """Decorate a pipeline component. - name (unicode): Default component and factory name. + name (str): Default component and factory name. assigns (list): Attributes assigned by component, e.g. `["token.pos"]`. requires (list): Attributes required by component, e.g. `["token.dep"]`. retokenizes (bool): Whether the component changes the tokenization. 
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 3ba86c169..aeedbde84 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -29,8 +29,8 @@ class Lemmatizer(object): def __call__(self, string, univ_pos, morphology=None): """Lemmatize a string. - string (unicode): The string to lemmatize, e.g. the token text. - univ_pos (unicode / int): The token's universal part-of-speech tag. + string (str): The string to lemmatize, e.g. the token text. + univ_pos (str / int): The token's universal part-of-speech tag. morphology (dict): The token's morphological features following the Universal Dependencies scheme. RETURNS (list): The available lemmas for the string. @@ -69,7 +69,7 @@ class Lemmatizer(object): Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. - univ_pos (unicode / int): The token's universal part-of-speech tag. + univ_pos (str / int): The token's universal part-of-speech tag. morphology (dict): The token's morphological features following the Universal Dependencies scheme. """ @@ -128,10 +128,10 @@ class Lemmatizer(object): """Look up a lemma in the table, if available. If no lemma is found, the original string is returned. - string (unicode): The original string. + string (str): The original string. orth (int): Optional hash of the string to look up. If not set, the string will be used and hashed. - RETURNS (unicode): The lemma if the string was found, otherwise the + RETURNS (str): The lemma if the string was found, otherwise the original string. """ lookup_table = self.lookups.get_table("lemma_lookup", {}) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 20e175f03..911112d50 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -190,7 +190,7 @@ cdef class Lexeme: self.vocab.set_vector(self.c.orth, vector) property rank: - """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used + """RETURNS (str): Sequential ID of the lexemes's lexical type, used to index into tables, e.g. 
for word vectors.""" def __get__(self): return self.c.id @@ -209,18 +209,18 @@ cdef class Lexeme: @property def orth_(self): - """RETURNS (unicode): The original verbatim text of the lexeme + """RETURNS (str): The original verbatim text of the lexeme (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes.""" return self.vocab.strings[self.c.orth] @property def text(self): - """RETURNS (unicode): The original verbatim text of the lexeme.""" + """RETURNS (str): The original verbatim text of the lexeme.""" return self.orth_ property lower: - """RETURNS (unicode): Lowercase form of the lexeme.""" + """RETURNS (str): Lowercase form of the lexeme.""" def __get__(self): return self.c.lower @@ -293,7 +293,7 @@ cdef class Lexeme: self.c.prob = x property lower_: - """RETURNS (unicode): Lowercase form of the word.""" + """RETURNS (str): Lowercase form of the word.""" def __get__(self): return self.vocab.strings[self.c.lower] @@ -301,7 +301,7 @@ cdef class Lexeme: self.c.lower = self.vocab.strings.add(x) property norm_: - """RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the + """RETURNS (str): The lexemes's norm, i.e. a normalised form of the lexeme text. """ def __get__(self): @@ -311,7 +311,7 @@ cdef class Lexeme: self.c.norm = self.vocab.strings.add(x) property shape_: - """RETURNS (unicode): Transform of the word's string, to show + """RETURNS (str): Transform of the word's string, to show orthographic features. """ def __get__(self): @@ -321,7 +321,7 @@ cdef class Lexeme: self.c.shape = self.vocab.strings.add(x) property prefix_: - """RETURNS (unicode): Length-N substring from the start of the word. + """RETURNS (str): Length-N substring from the start of the word. Defaults to `N=1`. """ def __get__(self): @@ -331,7 +331,7 @@ cdef class Lexeme: self.c.prefix = self.vocab.strings.add(x) property suffix_: - """RETURNS (unicode): Length-N substring from the end of the word. 
+ """RETURNS (str): Length-N substring from the end of the word. Defaults to `N=3`. """ def __get__(self): @@ -341,7 +341,7 @@ cdef class Lexeme: self.c.suffix = self.vocab.strings.add(x) property lang_: - """RETURNS (unicode): Language of the parent vocabulary.""" + """RETURNS (str): Language of the parent vocabulary.""" def __get__(self): return self.vocab.strings[self.c.lang] diff --git a/spacy/lookups.py b/spacy/lookups.py index a9d371b79..5661897e1 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -31,7 +31,7 @@ class Lookups(object): """Check if the lookups contain a table of a given name. Delegates to Lookups.has_table. - name (unicode): Name of the table. + name (str): Name of the table. RETURNS (bool): Whether a table of that name is in the lookups. """ return self.has_table(name) @@ -48,7 +48,7 @@ class Lookups(object): def add_table(self, name, data=SimpleFrozenDict()): """Add a new table to the lookups. Raises an error if the table exists. - name (unicode): Unique name of table. + name (str): Unique name of table. data (dict): Optional data to add to the table. RETURNS (Table): The newly added table. @@ -64,7 +64,7 @@ class Lookups(object): """Get a table. Raises an error if the table doesn't exist and no default value is provided. - name (unicode): Name of the table. + name (str): Name of the table. default: Optional default value to return if table doesn't exist. RETURNS (Table): The table. @@ -79,7 +79,7 @@ class Lookups(object): def remove_table(self, name): """Remove a table. Raises an error if the table doesn't exist. - name (unicode): Name of the table to remove. + name (str): Name of the table to remove. RETURNS (Table): The removed table. DOCS: https://spacy.io/api/lookups#remove_table @@ -91,7 +91,7 @@ class Lookups(object): def has_table(self, name): """Check if the lookups contain a table of a given name. - name (unicode): Name of the table. + name (str): Name of the table. RETURNS (bool): Whether a table of that name exists. 
DOCS: https://spacy.io/api/lookups#has_table @@ -125,7 +125,7 @@ class Lookups(object): """Save the lookups to a directory as lookups.bin. Expects a path to a directory, which will be created if it doesn't exist. - path (unicode / Path): The file path. + path (str / Path): The file path. DOCS: https://spacy.io/api/lookups#to_disk """ @@ -141,7 +141,7 @@ class Lookups(object): """Load lookups from a directory containing a lookups.bin. Will skip loading if the file doesn't exist. - path (unicode / Path): The directory path. + path (str / Path): The directory path. RETURNS (Lookups): The loaded lookups. DOCS: https://spacy.io/api/lookups#from_disk @@ -167,7 +167,7 @@ class Table(OrderedDict): """Initialize a new table from a dict. data (dict): The dictionary. - name (unicode): Optional table name for reference. + name (str): Optional table name for reference. RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.from_dict @@ -179,7 +179,7 @@ class Table(OrderedDict): def __init__(self, name=None, data=None): """Initialize a new table. - name (unicode): Optional table name for reference. + name (str): Optional table name for reference. data (dict): Initial data, used to hint Bloom Filter. RETURNS (Table): The newly created object. @@ -197,7 +197,7 @@ class Table(OrderedDict): def __setitem__(self, key, value): """Set new key/value pair. String keys will be hashed. - key (unicode / int): The key to set. + key (str / int): The key to set. value: The value to set. """ key = get_string_id(key) @@ -208,7 +208,7 @@ class Table(OrderedDict): """Set new key/value pair. String keys will be hashed. Same as table[key] = value. - key (unicode / int): The key to set. + key (str / int): The key to set. value: The value to set. """ self[key] = value @@ -216,7 +216,7 @@ class Table(OrderedDict): def __getitem__(self, key): """Get the value for a given key. String keys will be hashed. - key (unicode / int): The key to get. 
+ key (str / int): The key to get. RETURNS: The value. """ key = get_string_id(key) @@ -225,7 +225,7 @@ class Table(OrderedDict): def get(self, key, default=None): """Get the value for a given key. String keys will be hashed. - key (unicode / int): The key to get. + key (str / int): The key to get. default: The default value to return. RETURNS: The value. """ @@ -235,7 +235,7 @@ class Table(OrderedDict): def __contains__(self, key): """Check whether a key is in the table. String keys will be hashed. - key (unicode / int): The key to check. + key (str / int): The key to check. RETURNS (bool): Whether the key is in the table. """ key = get_string_id(key) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index ff707a71c..732931380 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -66,7 +66,7 @@ cdef class DependencyMatcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ return self._normalize_key(key) in self._patterns diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 2bcb82a2a..225eba9a9 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -63,7 +63,7 @@ cdef class Matcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ return self._normalize_key(key) in self._patterns @@ -97,7 +97,7 @@ cdef class Matcher: number of arguments). The on_match callback becomes an optional keyword argument. - key (unicode): The match ID. + key (str): The match ID. patterns (list): The patterns to add for the given key. on_match (callable): Optional callback executed on match. 
*_patterns (list): For backwards compatibility: list of patterns to add @@ -138,7 +138,7 @@ cdef class Matcher: """Remove a rule from the matcher. A KeyError is raised if the key does not exist. - key (unicode): The ID of the match rule. + key (str): The ID of the match rule. """ norm_key = self._normalize_key(key) if not norm_key in self._patterns: diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 14cc39787..f7ce44ece 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -70,7 +70,7 @@ cdef class PhraseMatcher: def __contains__(self, key): """Check whether the matcher contains rules for a match ID. - key (unicode): The match ID. + key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. DOCS: https://spacy.io/api/phrasematcher#contains @@ -85,7 +85,7 @@ cdef class PhraseMatcher: """Remove a rule from the matcher by match ID. A KeyError is raised if the key does not exist. - key (unicode): The match ID. + key (str): The match ID. DOCS: https://spacy.io/api/phrasematcher#remove """ @@ -159,7 +159,7 @@ cdef class PhraseMatcher: number of arguments). The on_match callback becomes an optional keyword argument. - key (unicode): The match ID. + key (str): The match ID. docs (list): List of `Doc` objects representing match patterns. on_match (callable): Callback executed on match. *_docs (Doc): For backwards compatibility: list of patterns to add diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 0b53b124c..5dcf81ea7 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -198,8 +198,8 @@ cdef class Morphology: """Add a special-case rule to the morphological analyser. Tokens whose tag and orth match the rule will receive the specified properties. - tag (unicode): The part-of-speech tag to key the exception. - orth (unicode): The word-form to key the exception. + tag (str): The part-of-speech tag to key the exception. 
+ orth (str): The word-form to key the exception. """ attrs = dict(attrs) attrs = _normalize_props(attrs) diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py index 4c0950453..971ebe518 100644 --- a/spacy/pipe_analysis.py +++ b/spacy/pipe_analysis.py @@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True): fulfilled (e.g. if previous components assign the attributes). pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - name (unicode): The name of the pipeline component to analyze. + name (str): The name of the pipeline component to analyze. pipe (callable): The pipeline component function to analyze. index (int): The index of the component in the pipeline. warn (bool): Show user warning if problem is found. @@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr): """Get all pipeline components that assign an attr, e.g. "doc.tensor". pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - attr (unicode): The attribute to check. + attr (str): The attribute to check. RETURNS (list): (name, pipeline) tuples of components that assign the attr. """ return _get_feature_for_attr(pipeline, attr, "assigns") @@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr): """Get all pipeline components that require an attr, e.g. "doc.tensor". pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. - attr (unicode): The attribute to check. + attr (str): The attribute to check. RETURNS (list): (name, pipeline) tuples of components that require the attr. """ return _get_feature_for_attr(pipeline, attr, "requires") diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 58160c2e9..cdacc82f6 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -315,7 +315,7 @@ class EntityRuler(object): """Load the entity ruler from a file. Expects a file containing newline-delimited JSON (JSONL) with one entry per line. - path (unicode / Path): The JSONL file to load. 
+ path (str / Path): The JSONL file to load. **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. @@ -351,7 +351,7 @@ class EntityRuler(object): """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). - path (unicode / Path): The JSONL file to save. + path (str / Path): The JSONL file to save. **kwargs: Other config paramters, mostly for consistency. DOCS: https://spacy.io/api/entityruler#to_disk diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 6e9d4197c..622791512 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"): """Merge subtokens into a single token. doc (Doc): The Doc object. - label (unicode): The subtoken dependency label. + label (str): The subtoken dependency label. RETURNS (Doc): The Doc object with merged subtokens. DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens diff --git a/spacy/strings.pyx b/spacy/strings.pyx index a30f11729..9fe5af154 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -152,7 +152,7 @@ cdef class StringStore: def add(self, string): """Add a string to the StringStore. - string (unicode): The string to add. + string (str): The string to add. RETURNS (uint64): The string's hash value. """ if isinstance(string, unicode): @@ -179,7 +179,7 @@ cdef class StringStore: def __contains__(self, string not None): """Check whether a string is in the store. - string (unicode): The string to check. + string (str): The string to check. RETURNS (bool): Whether the store contains the string. """ cdef hash_t key @@ -205,7 +205,7 @@ cdef class StringStore: def __iter__(self): """Iterate over the strings in the store, in order. - YIELDS (unicode): A string in the store. + YIELDS (str): A string in the store. 
""" cdef int i cdef hash_t key diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 7e75052f7..b628b1171 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -134,7 +134,7 @@ cdef class Tokenizer: def __call__(self, unicode string): """Tokenize a string. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. DOCS: https://spacy.io/api/tokenizer#call @@ -147,7 +147,7 @@ cdef class Tokenizer: cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases): """Tokenize according to affix and token_match settings. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. """ if len(string) >= (2 ** 30): @@ -527,7 +527,7 @@ cdef class Tokenizer: def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. - string (unicode): The string to segment. + string (str): The string to segment. RETURNS (list): A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. @@ -542,7 +542,7 @@ cdef class Tokenizer: """Find the length of a prefix that should be segmented from the string, or None if no prefix rules match. - string (unicode): The string to segment. + string (str): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`. DOCS: https://spacy.io/api/tokenizer#find_prefix @@ -556,7 +556,7 @@ cdef class Tokenizer: """Find the length of a suffix that should be segmented from the string, or None if no suffix rules match. - string (unicode): The string to segment. + string (str): The string to segment. Returns (int): The length of the suffix if present, otherwise `None`. 
DOCS: https://spacy.io/api/tokenizer#find_suffix @@ -576,7 +576,7 @@ cdef class Tokenizer: def _validate_special_case(self, chunk, substrings): """Check whether the `ORTH` fields match the string. - string (unicode): The string to specially tokenize. + string (str): The string to specially tokenize. substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. """ @@ -588,7 +588,7 @@ cdef class Tokenizer: def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. - string (unicode): The string to specially tokenize. + string (str): The string to specially tokenize. substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. @@ -629,7 +629,7 @@ cdef class Tokenizer: produced are identical to `nlp.tokenizer()` except for whitespace tokens. - string (unicode): The string to tokenize. + string (str): The string to tokenize. RETURNS (list): A list of (pattern_string, token_string) tuples DOCS: https://spacy.io/api/tokenizer#explain diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0716b2b3d..f6d0dbf4a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -107,7 +107,7 @@ cdef class Doc: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Doc._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -125,7 +125,7 @@ cdef class Doc: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. 
DOCS: https://spacy.io/api/doc#get_extension @@ -136,7 +136,7 @@ cdef class Doc: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/doc#has_extension @@ -147,7 +147,7 @@ cdef class Doc: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -473,7 +473,7 @@ cdef class Doc: def text(self): """A unicode representation of the document text. - RETURNS (unicode): The original verbatim text of the document. + RETURNS (str): The original verbatim text of the document. """ return "".join(t.text_with_ws for t in self) @@ -482,7 +482,7 @@ cdef class Doc: """An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. - RETURNS (unicode): The original verbatim text of the document. + RETURNS (str): The original verbatim text of the document. """ return self.text @@ -628,7 +628,7 @@ cdef class Doc: @property def lang_(self): - """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'.""" + """RETURNS (str): Language of the doc's vocabulary, e.g. 'en'.""" return self.vocab.lang cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 66e8d8c3e..59323c393 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -33,7 +33,7 @@ cdef class Span: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Span._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. 
@@ -51,7 +51,7 @@ cdef class Span: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/span#get_extension @@ -62,7 +62,7 @@ cdef class Span: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. DOCS: https://spacy.io/api/span#has_extension @@ -73,7 +73,7 @@ cdef class Span: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -501,7 +501,7 @@ cdef class Span: @property def text(self): - """RETURNS (unicode): The original verbatim text of the span.""" + """RETURNS (str): The original verbatim text of the span.""" text = self.text_with_ws if self[-1].whitespace_: text = text[:-1] @@ -512,7 +512,7 @@ cdef class Span: """The text content of the span with a trailing whitespace character if the last token has one. - RETURNS (unicode): The text content of the span (with trailing + RETURNS (str): The text content of the span (with trailing whitespace). """ return "".join([t.text_with_ws for t in self]) @@ -688,7 +688,7 @@ cdef class Span: raise NotImplementedError(TempErrors.T007.format(attr="ent_id")) property ent_id_: - """RETURNS (unicode): The (string) entity ID.""" + """RETURNS (str): The (string) entity ID.""" def __get__(self): return self.root.ent_id_ @@ -700,12 +700,12 @@ cdef class Span: """Verbatim text content (identical to `Span.text`). Exists mostly for consistency with other attributes. 
- RETURNS (unicode): The span's text.""" + RETURNS (str): The span's text.""" return self.text @property def lemma_(self): - """RETURNS (unicode): The span's lemma.""" + """RETURNS (str): The span's lemma.""" return " ".join([t.lemma_ for t in self]).strip() @property @@ -724,7 +724,7 @@ cdef class Span: return "".join([t.text_with_ws for t in self]) property label_: - """RETURNS (unicode): The span's label.""" + """RETURNS (str): The span's label.""" def __get__(self): return self.doc.vocab.strings[self.label] @@ -734,7 +734,7 @@ cdef class Span: raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_)) property kb_id_: - """RETURNS (unicode): The named entity's KB ID.""" + """RETURNS (str): The named entity's KB ID.""" def __get__(self): return self.doc.vocab.strings[self.kb_id] diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 2486ed991..0d1e82322 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -36,7 +36,7 @@ cdef class Token: def set_extension(cls, name, **kwargs): """Define a custom attribute which becomes available as `Token._`. - name (unicode): Name of the attribute to set. + name (str): Name of the attribute to set. default: Optional default value of the attribute. getter (callable): Optional getter function. setter (callable): Optional setter function. @@ -54,7 +54,7 @@ cdef class Token: def get_extension(cls, name): """Look up a previously registered extension by name. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple. DOCS: https://spacy.io/api/token#get_extension @@ -65,7 +65,7 @@ cdef class Token: def has_extension(cls, name): """Check whether an extension has been registered. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (bool): Whether the extension has been registered. 
DOCS: https://spacy.io/api/token#has_extension @@ -76,7 +76,7 @@ cdef class Token: def remove_extension(cls, name): """Remove a previously registered extension. - name (unicode): Name of the extension. + name (str): Name of the extension. RETURNS (tuple): A `(default, method, getter, setter)` tuple of the removed extension. @@ -244,12 +244,12 @@ cdef class Token: @property def text(self): - """RETURNS (unicode): The original verbatim text of the token.""" + """RETURNS (str): The original verbatim text of the token.""" return self.orth_ @property def text_with_ws(self): - """RETURNS (unicode): The text content of the span (with trailing + """RETURNS (str): The text content of the span (with trailing whitespace). """ cdef unicode orth = self.vocab.strings[self.c.lex.orth] @@ -740,7 +740,7 @@ cdef class Token: self.c.ent_type = ent_type property ent_type_: - """RETURNS (unicode): Named entity type.""" + """RETURNS (str): Named entity type.""" def __get__(self): return self.vocab.strings[self.c.ent_type] @@ -763,7 +763,7 @@ cdef class Token: and "" means no entity tag is set. "B" with an empty ent_type means that the token is blocked from further processing by NER. - RETURNS (unicode): IOB code of named entity tag. + RETURNS (str): IOB code of named entity tag. """ iob_strings = ("", "I", "O", "B") return iob_strings[self.c.ent_iob] @@ -779,7 +779,7 @@ cdef class Token: self.c.ent_id = key property ent_id_: - """RETURNS (unicode): ID of the entity the token is an instance of, + """RETURNS (str): ID of the entity the token is an instance of, if any. 
""" def __get__(self): @@ -797,7 +797,7 @@ cdef class Token: self.c.ent_kb_id = ent_kb_id property ent_kb_id_: - """RETURNS (unicode): Named entity KB ID.""" + """RETURNS (str): Named entity KB ID.""" def __get__(self): return self.vocab.strings[self.c.ent_kb_id] @@ -806,12 +806,12 @@ cdef class Token: @property def whitespace_(self): - """RETURNS (unicode): The trailing whitespace character, if present.""" + """RETURNS (str): The trailing whitespace character, if present.""" return " " if self.c.spacy else "" @property def orth_(self): - """RETURNS (unicode): Verbatim text content (identical to + """RETURNS (str): Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. """ @@ -819,13 +819,13 @@ cdef class Token: @property def lower_(self): - """RETURNS (unicode): The lowercase token text. Equivalent to + """RETURNS (str): The lowercase token text. Equivalent to `Token.text.lower()`. """ return self.vocab.strings[self.c.lex.lower] property norm_: - """RETURNS (unicode): The token's norm, i.e. a normalised form of the + """RETURNS (str): The token's norm, i.e. a normalised form of the token text. Usually set in the language's tokenizer exceptions or norm exceptions. """ @@ -837,34 +837,34 @@ cdef class Token: @property def shape_(self): - """RETURNS (unicode): Transform of the tokens's string, to show + """RETURNS (str): Transform of the tokens's string, to show orthographic features. For example, "Xxxx" or "dd". """ return self.vocab.strings[self.c.lex.shape] @property def prefix_(self): - """RETURNS (unicode): A length-N substring from the start of the token. + """RETURNS (str): A length-N substring from the start of the token. Defaults to `N=1`. """ return self.vocab.strings[self.c.lex.prefix] @property def suffix_(self): - """RETURNS (unicode): A length-N substring from the end of the token. + """RETURNS (str): A length-N substring from the end of the token. Defaults to `N=3`. 
""" return self.vocab.strings[self.c.lex.suffix] @property def lang_(self): - """RETURNS (unicode): Language of the parent document's vocabulary, + """RETURNS (str): Language of the parent document's vocabulary, e.g. 'en'. """ return self.vocab.strings[self.c.lex.lang] property lemma_: - """RETURNS (unicode): The token lemma, i.e. the base form of the word, + """RETURNS (str): The token lemma, i.e. the base form of the word, with no inflectional suffixes. """ def __get__(self): @@ -877,7 +877,7 @@ cdef class Token: self.c.lemma = self.vocab.strings.add(lemma_) property pos_: - """RETURNS (unicode): Coarse-grained part-of-speech tag.""" + """RETURNS (str): Coarse-grained part-of-speech tag.""" def __get__(self): return parts_of_speech.NAMES[self.c.pos] @@ -885,7 +885,7 @@ cdef class Token: self.c.pos = parts_of_speech.IDS[pos_name] property tag_: - """RETURNS (unicode): Fine-grained part-of-speech tag.""" + """RETURNS (str): Fine-grained part-of-speech tag.""" def __get__(self): return self.vocab.strings[self.c.tag] @@ -893,7 +893,7 @@ cdef class Token: self.tag = self.vocab.strings.add(tag) property dep_: - """RETURNS (unicode): The syntactic dependency label.""" + """RETURNS (str): The syntactic dependency label.""" def __get__(self): return self.vocab.strings[self.c.dep] diff --git a/spacy/util.py b/spacy/util.py index 41af881c9..fc5837755 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -58,7 +58,7 @@ def lang_class_is_loaded(lang): loaded lazily, to avoid expensive setup code associated with the language data. - lang (unicode): Two-letter language code, e.g. 'en'. + lang (str): Two-letter language code, e.g. 'en'. RETURNS (bool): Whether a Language class has been loaded. """ return lang in registry.languages @@ -67,7 +67,7 @@ def lang_class_is_loaded(lang): def get_lang_class(lang): """Import and load a Language class. - lang (unicode): Two-letter language code, e.g. 'en'. + lang (str): Two-letter language code, e.g. 'en'. RETURNS (Language): Language class. 
""" # Check if language is registered / entry point is available @@ -85,7 +85,7 @@ def get_lang_class(lang): def set_lang_class(name, cls): """Set a custom Language class name that can be loaded via get_lang_class. - name (unicode): Name of Language class. + name (str): Name of Language class. cls (Language): Language class. """ registry.languages.register(name, func=cls) @@ -107,7 +107,7 @@ def load_language_data(path): """Load JSON language data using the given path as a base. If the provided path isn't present, will attempt to load a gzipped version before giving up. - path (unicode / Path): The data to load. + path (str / Path): The data to load. RETURNS: The loaded data. """ path = ensure_path(path) @@ -128,7 +128,7 @@ def get_module_path(module): def load_model(name, **overrides): """Load a model from a package or data path. - name (unicode): Package name or model path. + name (str): Package name or model path. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with the loaded model. """ @@ -202,7 +202,7 @@ def load_model_from_init_py(init_file, **overrides): """Helper function to use in the `load()` method of a model package's __init__.py. - init_file (unicode): Path to model's __init__.py, i.e. `__file__`. + init_file (str): Path to model's __init__.py, i.e. `__file__`. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with loaded model. """ @@ -227,8 +227,8 @@ def get_package_version(name): """Get the version of an installed package. Typically used to get model package versions. - name (unicode): The name of the installed Python package. - RETURNS (unicode / None): The version or None if package not installed. + name (str): The name of the installed Python package. + RETURNS (str / None): The version or None if package not installed. 
""" try: return importlib_metadata.version(name) @@ -338,7 +338,7 @@ def get_model_config(path): def is_package(name): """Check if string maps to a package installed via pip. - name (unicode): Name of package. + name (str): Name of package. RETURNS (bool): True if installed package, False if not. """ try: @@ -351,7 +351,7 @@ def is_package(name): def get_package_path(name): """Get the path to an installed package. - name (unicode): Package name. + name (str): Package name. RETURNS (Path): Path to installed package. """ name = name.lower() # use lowercase version to be safe @@ -526,8 +526,8 @@ def expand_exc(excs, search, replace): For example, to add additional versions with typographic apostrophes. excs (dict): Tokenizer exceptions. - search (unicode): String to find and replace. - replace (unicode): Replacement. + search (str): String to find and replace. + replace (str): Replacement. RETURNS (dict): Combined tokenizer exceptions. """ @@ -761,8 +761,8 @@ def from_disk(path, readers, exclude): def import_file(name, loc): """Import module from a file. Used to load models from a directory. - name (unicode): Name of module to load. - loc (unicode / Path): Path to the file. + name (str): Name of module to load. + loc (str / Path): Path to the file. RETURNS: The loaded module. """ loc = str(loc) @@ -777,8 +777,8 @@ def minify_html(html): Disclaimer: NOT a general-purpose solution, only removes indentation and newlines. - html (unicode): Markup to minify. - RETURNS (unicode): "Minified" HTML. + html (str): Markup to minify. + RETURNS (str): "Minified" HTML. """ return html.strip().replace(" ", "").replace("\n", "") @@ -787,8 +787,8 @@ def escape_html(text): """Replace <, >, &, " with their HTML encoded representation. Intended to prevent HTML errors in rendered displaCy markup. - text (unicode): The original text. - RETURNS (unicode): Equivalent text to be safely used within HTML. + text (str): The original text. 
+ RETURNS (str): Equivalent text to be safely used within HTML. """ text = text.replace("&", "&amp;") text = text.replace("<", "&lt;") diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index e100ae915..0ed2462c6 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -57,7 +57,7 @@ cdef class Vectors: shape (tuple): Size of the table, as (# entries, # columns) data (numpy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. - name (unicode): A name to identify the vectors table. + name (str): A name to identify the vectors table. RETURNS (Vectors): The newly created object. DOCS: https://spacy.io/api/vectors#init @@ -237,7 +237,7 @@ cdef class Vectors: def find(self, *, key=None, keys=None, row=None, rows=None): """Look up one or more keys by row, or vice versa. - key (unicode / int): Find the row that the given key points to. + key (str / int): Find the row that the given key points to. Returns int, -1 if missing. keys (iterable): Find rows that the keys point to. Returns ndarray. @@ -352,7 +352,7 @@ cdef class Vectors: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode / Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exists. DOCS: https://spacy.io/api/vectors#to_disk @@ -372,7 +372,7 @@ cdef class Vectors: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode / Path): Directory path, string or Path-like object. + path (str / Path): Directory path, string or Path-like object. RETURNS (Vectors): The modified object. DOCS: https://spacy.io/api/vectors#from_disk diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a1929559f..ed37f6e98 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -41,7 +41,7 @@ cdef class Vocab: strings (StringStore): StringStore that maps strings to integers, and vice versa. lookups (Lookups): Container for large lookup tables and dictionaries.
- name (unicode): Optional name to identify the vectors table. + name (str): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} @@ -97,7 +97,7 @@ cdef class Vocab: See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`, `Token.check_flag`. - flag_getter (callable): A function `f(unicode) -> bool`, to get the + flag_getter (callable): A function `f(str) -> bool`, to get the flag value. flag_id (int): An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If -1, the lowest @@ -187,7 +187,7 @@ cdef class Vocab: def __contains__(self, key): """Check whether the string or int key has an entry in the vocabulary. - string (unicode): The ID string. + string (str): The ID string. RETURNS (bool) Whether the string has an entry in the vocabulary. DOCS: https://spacy.io/api/vocab#contains diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index feb167a9d..39148e476 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation. | Name | Type | Description | | -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `vocab` | `Vocab` | The lexeme's vocabulary. | -| `text` | unicode | Verbatim text content. | +| `text` | str | Verbatim text content. | | `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | +| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. 
| | `rank` | int | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. | | `flags` | int | Container of the lexeme's binary flags. | | `norm` | int | The lexemes's norm, i.e. a normalized form of the lexeme text. | -| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. | +| `norm_` | str | The lexemes's norm, i.e. a normalized form of the lexeme text. | | `lower` | int | Lowercase form of the word. | -| `lower_` | unicode | Lowercase form of the word. | +| `lower_` | str | Lowercase form of the word. | | `shape` | int | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | | `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. | -| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. | +| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. | | `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. | +| `suffix_` | str | Length-N substring from the start of the word. Defaults to `N=3`. 
| | `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. | | `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. | | `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. | @@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation. | `is_oov` | bool | Is the lexeme out-of-vocabulary? | | `is_stop` | bool | Is the lexeme part of a "stop list"? | | `lang` | int | Language of the parent vocabulary. | -| `lang_` | unicode | Language of the parent vocabulary. | +| `lang_` | str | Language of the parent vocabulary. | | `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | | `cluster` | int | Brown cluster ID. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index e024ab54a..b851f6882 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -27,7 +27,7 @@ Create the vocabulary. | `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. | | `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | -| `vectors_name` 2.2 | unicode | A name to identify the vectors table. | +| `vectors_name` 2.2 | str | A name to identify the vectors table. | | **RETURNS** | `Vocab` | The newly constructed object. 
| ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -91,10 +91,10 @@ given string, you need to look it up in > assert oov not in nlp.vocab > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------- | -| `string` | unicode | The ID string. | -| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------- | +| `string` | str | The ID string. | +| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | ## Vocab.add_flag {#add_flag tag="method"} @@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`. | Name | Type | Description | | ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value. | +| `flag_getter` | dict | A function `f(str) -> bool`, to get the flag value. | | `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. | | **RETURNS** | int | The integer ID by which the flag value can be checked. | @@ -227,10 +227,10 @@ Save the current state to a directory. > nlp.vocab.to_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Vocab.from_disk {#from_disk tag="method" new="2"} @@ -243,11 +243,11 @@ Loads state from a directory. Modifies the object in place and returns it. > vocab = Vocab().from_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The modified `Vocab` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The modified `Vocab` object. | ## Vocab.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 5f47bd2e3..a84399312 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -157,19 +157,19 @@ The available token pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). 
The supported attributes for rule-based matching are: -| Attribute | Type |  Description | -| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ | -| `ORTH` | unicode | The exact verbatim text of a token. | -| `TEXT` 2.1 | unicode | The exact verbatim text of a token. | -| `LOWER` | unicode | The lowercase form of the token text. | -|  `LENGTH` | int | The length of the token text. | -|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | -|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | -|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | -|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | -|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | -| `ENT_TYPE` | unicode | The token's entity label. | -| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | +| Attribute | Type |  Description | +| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | +| `ORTH` | str | The exact verbatim text of a token. | +| `TEXT` 2.1 | str | The exact verbatim text of a token. | +| `LOWER` | str | The lowercase form of the token text. | +|  `LENGTH` | int | The length of the token text. | +|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | +|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | +|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. 
| +|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | +|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | +| `ENT_TYPE` | str | The token's entity label. | +| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | @@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included! ### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"} -When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**, -the EntityRuler calls the nlp object to construct a doc object. This happens in case you try -to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to -extract matches based on the pattern's POS signature. +When using a large amount of **phrase patterns** (roughly > 10000) it's useful +to understand how the `add_patterns` function of the EntityRuler works. For each +**phrase pattern**, the EntityRuler calls the nlp object to construct a doc +object. This happens in case you try to add the EntityRuler at the end of an +existing pipeline with, for example, a POS tagger and want to extract matches +based on the pattern's POS signature. -In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler. +In this case you would pass a config value of `phrase_matcher_attr="POS"` for +the EntityRuler. -Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns. +Running the full language pipeline across every pattern in a large list scales +linearly and can therefore take a long time on large amounts of phrase patterns. 
-As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively. +As of spaCy 2.2.4 the `add_patterns` function has been refactored to use +nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with +5,000-100,000 phrase patterns respectively. -Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time. +Even with this speedup (but especially if you're using an older version) the +`add_patterns` function can still take a long time. -An easy workaround to make this function run faster is disabling the other language pipes -while adding the phrase patterns. +An easy workaround to make this function run faster is disabling the other +language pipes while adding the phrase patterns. ```python entityruler = EntityRuler(nlp) From 262d306eaa5a8715ca5905c8fde341ba65771d09 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 17:23:00 +0200 Subject: [PATCH 143/187] unicode -> str consistency --- website/docs/api/cli.md | 8 +- website/docs/api/cython-classes.md | 2 +- website/docs/api/dependencyparser.md | 16 +- website/docs/api/doc.md | 50 +++--- website/docs/api/entitylinker.md | 18 +-- website/docs/api/entityrecognizer.md | 16 +- website/docs/api/entityruler.md | 27 ++-- website/docs/api/goldcorpus.md | 10 +- website/docs/api/goldparse.md | 5 +- website/docs/api/kb.md | 170 +++++++++++---------- website/docs/api/language.md | 79 +++++----- website/docs/api/lemmatizer.md | 24 +-- website/docs/api/lookups.md | 50 +++--- website/docs/api/matcher.md | 24 +-- website/docs/api/phrasematcher.md | 16 +- website/docs/api/pipeline-functions.md | 10 +- website/docs/api/sentencizer.md | 14 +- website/docs/api/span.md | 40 ++--- website/docs/api/stringstore.md | 46 +++--- website/docs/api/tagger.md | 26 ++-- website/docs/api/textcategorizer.md | 18 +-- 
website/docs/api/token.md | 160 +++++++++---------- website/docs/api/tokenizer.md | 88 +++++------ website/docs/api/top-level.md | 126 +++++++-------- website/docs/api/vectors.md | 20 +-- website/docs/usage/linguistic-features.md | 18 +-- website/docs/usage/processing-pipelines.md | 20 +-- website/docs/usage/saving-loading.md | 12 +- website/docs/usage/visualizers.md | 12 +- 29 files changed, 564 insertions(+), 561 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index f067ba5a7..d507e13ec 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -504,10 +504,10 @@ tokenization can be provided. > srsly.write_jsonl("/path/to/text.jsonl", data) > ``` -| Key | Type | Description | -| -------- | ------- | ---------------------------------------------------------- | -| `text` | unicode | The raw input text. Is not required if `tokens` available. | -| `tokens` | list | Optional tokenization, one string per token. | +| Key | Type | Description | +| -------- | ---- | ---------------------------------------------------------- | +| `text` | str | The raw input text. Is not required if `tokens` available. | +| `tokens` | list | Optional tokenization, one string per token. | ```json ### Example diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md index 77d6fdd10..9dea04284 100644 --- a/website/docs/api/cython-classes.md +++ b/website/docs/api/cython-classes.md @@ -170,7 +170,7 @@ vocabulary. | Name | Type | Description | | ----------- | ---------------- | ------------------------------------------------------------------------------------------- | | `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | -| `string` | unicode | The string of the word to look up. | +| `string` | str | The string of the word to look up. | | **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. 
| ### Vocab.get_by_orth {#vocab_get_by_orth tag="method"} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index df0df3e38..0980dc2e0 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -229,9 +229,9 @@ Add a new label to the pipe. > parser.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ------- | ------- | ----------------- | -| `label` | unicode | The label to add. | +| Name | Type | Description | +| ------- | ---- | ----------------- | +| `label` | str | The label to add. | ## DependencyParser.to_disk {#to_disk tag="method"} @@ -244,10 +244,10 @@ Serialize the pipe to disk. > parser.to_disk("/path/to/parser") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## DependencyParser.from_disk {#from_disk tag="method"} @@ -262,7 +262,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
| +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index ab85c1deb..75491358d 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -123,7 +123,7 @@ details, see the documentation on | Name | Type | Description | | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -145,10 +145,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. 
| ## Doc.has_extension {#has_extension tag="classmethod" new="2"} @@ -162,10 +162,10 @@ Check whether an extension has been registered on the `Doc` class. > assert Doc.has_extension('has_city') > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------ | -| `name` | unicode | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| `name` | str | Name of the extension to check. | +| **RETURNS** | bool | Whether the extension has been registered. | ## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} @@ -180,10 +180,10 @@ Remove a previously registered extension. > assert not Doc.has_extension('has_city') > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | ## Doc.char_span {#char_span tag="method" new="2"} @@ -368,10 +368,10 @@ Save the current state to a directory. > doc.to_disk("/path/to/doc") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Doc.from_disk {#from_disk tag="method" new="2"} @@ -385,11 +385,11 @@ Loads state from a directory. Modifies the object in place and returns it. > doc = Doc(Vocab()).from_disk("/path/to/doc") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | The modified `Doc` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The modified `Doc` object. | ## Doc.to_bytes {#to_bytes tag="method"} @@ -648,15 +648,15 @@ The L2 norm of the document's vector representation. | Name | Type | Description | | --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `text` | unicode | A unicode representation of the document text. 
| -| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | +| `text` | str | A unicode representation of the document text. | +| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | | `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `vocab` | `Vocab` | The store of lexical types. | | `tensor` 2 | `ndarray` | Container for dense vector representations. | | `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | | `user_data` | - | A generic storage area, for user custom data. | | `lang` 2.1 | int | Language of the document's vocabulary. | -| `lang_` 2.1 | unicode | Language of the document's vocabulary. | +| `lang_` 2.1 | str | Language of the document's vocabulary. | | `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. | | `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. | | `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. | diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index a9d6a31a5..d7f25ed56 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -258,10 +258,10 @@ Serialize the pipe to disk. 
> entity_linker.to_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityLinker.from_disk {#from_disk tag="method"} @@ -274,11 +274,11 @@ Load the pipe from disk. Modifies the object in place and returns it. > entity_linker.from_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | +| Name | Type | Description | +| ----------- | -------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. 
| ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 9a2766c07..1d0c1de3a 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -230,9 +230,9 @@ Add a new label to the pipe. > ner.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ------- | ------- | ----------------- | -| `label` | unicode | The label to add. | +| Name | Type | Description | +| ------- | ---- | ----------------- | +| `label` | str | The label to add. | ## EntityRecognizer.to_disk {#to_disk tag="method"} @@ -245,10 +245,10 @@ Serialize the pipe to disk. > ner.to_disk("/path/to/ner") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityRecognizer.from_disk {#from_disk tag="method"} @@ -263,7 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
| +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 0fd24897d..7bee3a77a 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -72,10 +72,10 @@ Whether a label is present in the patterns. > assert not "PERSON" in ruler > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------- | -| `label` | unicode | The label to check. | -| **RETURNS** | bool | Whether the entity ruler contains the label. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------- | +| `label` | str | The label to check. | +| **RETURNS** | bool | Whether the entity ruler contains the label. | ## EntityRuler.\_\_call\_\_ {#call tag="method"} @@ -83,8 +83,9 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this happens automatically after the component has been added to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized with `overwrite_ents=True`, existing entities will be replaced if they overlap -with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer -patterns over shorter, and if equal the match occuring first in the Doc is chosen. +with the matches. When matches overlap in a Doc, the entity ruler prioritizes +longer patterns over shorter, and if equal the match occurring first in the Doc +is chosen. > #### Example > @@ -139,9 +140,9 @@ only the patterns are saved as JSONL. 
If a directory name is provided, a > ruler.to_disk("/path/to/entity_ruler") # saves patterns and config > ``` -| Name | Type | Description | -| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## EntityRuler.from_disk {#from_disk tag="method"} @@ -158,10 +159,10 @@ configuration. > ruler.from_disk("/path/to/entity_ruler") # loads patterns and config > ``` -| Name | Type | Description | -| ----------- | ---------------- | ---------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | +| Name | Type | Description | +| ----------- | ------------- | ---------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | ## EntityRuler.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/goldcorpus.md b/website/docs/api/goldcorpus.md index a18ef4d32..7767b28bd 100644 --- a/website/docs/api/goldcorpus.md +++ b/website/docs/api/goldcorpus.md @@ -17,8 +17,8 @@ Create a `GoldCorpus`. 
IF the input data is an iterable, each item should be a [`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx) for further details. -| Name | Type | Description | -| ----------- | --------------------------- | ------------------------------------------------------------ | -| `train` | unicode / `Path` / iterable | Training data, as a path (file or directory) or iterable. | -| `dev` | unicode / `Path` / iterable | Development data, as a path (file or directory) or iterable. | -| **RETURNS** | `GoldCorpus` | The newly constructed object. | +| Name | Type | Description | +| ----------- | ----------------------- | ------------------------------------------------------------ | +| `train` | str / `Path` / iterable | Training data, as a path (file or directory) or iterable. | +| `dev` | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. | +| **RETURNS** | `GoldCorpus` | The newly constructed object. | diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 1ef6f0362..2f841eedd 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -62,7 +62,8 @@ Whether the provided syntactic annotations form a projective dependency tree. Convert a list of Doc objects into the [JSON-serializable format](/api/annotation#json-input) used by the -[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc. +[`spacy train`](/api/cli#train) command. Each input doc will be treated as a +'paragraph' in the output doc. > #### Example > @@ -160,7 +161,7 @@ single-token entity. | ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. 
| | `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | -| **RETURNS** | list | Unicode strings, describing the [BILUO](/api/annotation#biluo) tags. | +| **RETURNS** | list | Strings, describing the [BILUO](/api/annotation#biluo) tags. | ### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index eeba85e84..f088815fd 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -1,16 +1,19 @@ --- title: KnowledgeBase -teaser: A storage class for entities and aliases of a specific knowledge base (ontology) +teaser: + A storage class for entities and aliases of a specific knowledge base + (ontology) tag: class source: spacy/kb.pyx new: 2.2 --- -The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init) -objects, which are plausible external identifiers given a certain textual mention. -Each such `Candidate` holds information from the relevant KB entities, -such as its frequency in text and possible aliases. -Each entity in the knowledge base also has a pretrained entity vector of a fixed size. +The `KnowledgeBase` object provides a method to generate +[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external +identifiers given a certain textual mention. Each such `Candidate` holds +information from the relevant KB entities, such as its frequency in text and +possible aliases. Each entity in the knowledge base also has a pretrained entity +vector of a fixed size. ## KnowledgeBase.\_\_init\_\_ {#init tag="method"} @@ -24,25 +27,25 @@ Create the knowledge base. > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) > ``` -| Name | Type | Description | -| ----------------------- | ---------------- | ----------------------------------------- | -| `vocab` | `Vocab` | A `Vocab` object. 
| -| `entity_vector_length` | int | Length of the fixed-size entity vectors. | -| **RETURNS** | `KnowledgeBase` | The newly constructed object. | - +| Name | Type | Description | +| ---------------------- | --------------- | ---------------------------------------- | +| `vocab` | `Vocab` | A `Vocab` object. | +| `entity_vector_length` | int | Length of the fixed-size entity vectors. | +| **RETURNS** | `KnowledgeBase` | The newly constructed object. | ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} The length of the fixed-size entity vectors in the knowledge base. -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------- | -| **RETURNS** | int | Length of the fixed-size entity vectors. | +| Name | Type | Description | +| ----------- | ---- | ---------------------------------------- | +| **RETURNS** | int | Length of the fixed-size entity vectors. | ## KnowledgeBase.add_entity {#add_entity tag="method"} -Add an entity to the knowledge base, specifying its corpus frequency -and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length). +Add an entity to the knowledge base, specifying its corpus frequency and entity +vector, which should be of length +[`entity_vector_length`](/api/kb#entity_vector_length). 
> #### Example > @@ -51,16 +54,16 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en > kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2) > ``` -| Name | Type | Description | -| --------------- | ------------- | ------------------------------------------------- | -| `entity` | unicode | The unique entity identifier | -| `freq` | float | The frequency of the entity in a typical corpus | -| `entity_vector` | vector | The pretrained vector of the entity | +| Name | Type | Description | +| --------------- | ------ | ----------------------------------------------- | +| `entity` | str | The unique entity identifier | +| `freq` | float | The frequency of the entity in a typical corpus | +| `entity_vector` | vector | The pretrained vector of the entity | ## KnowledgeBase.set_entities {#set_entities tag="method"} -Define the full list of entities in the knowledge base, specifying the corpus frequency -and entity vector for each entity. +Define the full list of entities in the knowledge base, specifying the corpus +frequency and entity vector for each entity. > #### Example > @@ -68,18 +71,19 @@ and entity vector for each entity. 
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2]) > ``` -| Name | Type | Description | -| ------------- | ------------- | ------------------------------------------------- | -| `entity_list` | iterable | List of unique entity identifiers | -| `freq_list` | iterable | List of entity frequencies | -| `vector_list` | iterable | List of entity vectors | +| Name | Type | Description | +| ------------- | -------- | --------------------------------- | +| `entity_list` | iterable | List of unique entity identifiers | +| `freq_list` | iterable | List of entity frequencies | +| `vector_list` | iterable | List of entity vectors | ## KnowledgeBase.add_alias {#add_alias tag="method"} -Add an alias or mention to the knowledge base, specifying its potential KB identifiers -and their prior probabilities. The entity identifiers should refer to entities previously -added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities). -The sum of the prior probabilities should not exceed 1. +Add an alias or mention to the knowledge base, specifying its potential KB +identifiers and their prior probabilities. The entity identifiers should refer +to entities previously added with [`add_entity`](/api/kb#add_entity) or +[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities +should not exceed 1. > #### Example > @@ -87,11 +91,11 @@ The sum of the prior probabilities should not exceed 1. 
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3]) > ``` -| Name | Type | Description | -| -------------- | ------------- | -------------------------------------------------- | -| `alias` | unicode | The textual mention or alias | -| `entities` | iterable | The potential entities that the alias may refer to | -| `probabilities`| iterable | The prior probabilities of each entity | +| Name | Type | Description | +| --------------- | -------- | -------------------------------------------------- | +| `alias` | str | The textual mention or alias | +| `entities` | iterable | The potential entities that the alias may refer to | +| `probabilities` | iterable | The prior probabilities of each entity | ## KnowledgeBase.\_\_len\_\_ {#len tag="method"} @@ -117,9 +121,9 @@ Get a list of all entity IDs in the knowledge base. > all_entities = kb.get_entity_strings() > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | list | The list of entities in the knowledge base. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------- | +| **RETURNS** | list | The list of entities in the knowledge base. | ## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"} @@ -131,9 +135,9 @@ Get the total number of aliases in the knowledge base. > total_aliases = kb.get_size_aliases() > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | int | The number of aliases in the knowledge base. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------- | +| **RETURNS** | int | The number of aliases in the knowledge base. | ## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"} @@ -145,9 +149,9 @@ Get a list of all aliases in the knowledge base. 
> all_aliases = kb.get_alias_strings() > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | list | The list of aliases in the knowledge base. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| **RETURNS** | list | The list of aliases in the knowledge base. | ## KnowledgeBase.get_candidates {#get_candidates tag="method"} @@ -160,10 +164,10 @@ of type [`Candidate`](/api/kb/#candidate_init). > candidates = kb.get_candidates("Douglas") > ``` -| Name | Type | Description | -| ------------- | ------------- | -------------------------------------------------- | -| `alias` | unicode | The textual mention or alias | -| **RETURNS** | iterable | The list of relevant `Candidate` objects | +| Name | Type | Description | +| ----------- | -------- | ---------------------------------------- | +| `alias` | str | The textual mention or alias | +| **RETURNS** | iterable | The list of relevant `Candidate` objects | ## KnowledgeBase.get_vector {#get_vector tag="method"} @@ -175,15 +179,15 @@ Given a certain entity ID, retrieve its pretrained entity vector. > vector = kb.get_vector("Q42") > ``` -| Name | Type | Description | -| ------------- | ------------- | -------------------------------------------------- | -| `entity` | unicode | The entity ID | -| **RETURNS** | vector | The entity vector | +| Name | Type | Description | +| ----------- | ------ | ----------------- | +| `entity` | str | The entity ID | +| **RETURNS** | vector | The entity vector | ## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"} -Given a certain entity ID and a certain textual mention, retrieve -the prior probability of the fact that the mention links to the entity ID. +Given a certain entity ID and a certain textual mention, retrieve the prior +probability of the fact that the mention links to the entity ID. 
> #### Example > @@ -191,11 +195,11 @@ the prior probability of the fact that the mention links to the entity ID. > probability = kb.get_prior_prob("Q42", "Douglas") > ``` -| Name | Type | Description | -| ------------- | ------------- | --------------------------------------------------------------- | -| `entity` | unicode | The entity ID | -| `alias` | unicode | The textual mention or alias | -| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | +| Name | Type | Description | +| ----------- | ----- | -------------------------------------------------------------- | +| `entity` | str | The entity ID | +| `alias` | str | The textual mention or alias | +| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | ## KnowledgeBase.dump {#dump tag="method"} @@ -207,14 +211,14 @@ Save the current state of the knowledge base to a directory. > kb.dump(loc) > ``` -| Name | Type | Description | -| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ | -| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## KnowledgeBase.load_bulk {#load_bulk tag="method"} -Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab) -should also be the same as the one used to create the KB. +Restore the state of the knowledge base from a given directory. Note that the +[`Vocab`](/api/vocab) should also be the same as the one used to create the KB. 
> #### Example > @@ -226,18 +230,16 @@ should also be the same as the one used to create the KB. > kb.load_bulk("/path/to/kb") > ``` - -| Name | Type | Description | -| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | -| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. | - +| Name | Type | Description | +| ----------- | --------------- | -------------------------------------------------------------------------- | +| `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. | ## Candidate.\_\_init\_\_ {#candidate_init tag="method"} Construct a `Candidate` object. Usually this constructor is not called directly, -but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method -of a `KnowledgeBase`. +but instead these objects are returned by the +[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`. > #### Example > @@ -257,12 +259,12 @@ of a `KnowledgeBase`. 
## Candidate attributes {#candidate_attributes} -| Name | Type | Description | -| ---------------------- | ------------ | ------------------------------------------------------------------ | -| `entity` | int | The entity's unique KB identifier | -| `entity_` | unicode | The entity's unique KB identifier | -| `alias` | int | The alias or textual mention | -| `alias_` | unicode | The alias or textual mention | -| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | -| `entity_freq` | long | The frequency of the entity in a typical corpus | -| `entity_vector` | vector | The pretrained vector of the entity | +| Name | Type | Description | +| --------------- | ------ | -------------------------------------------------------------- | +| `entity` | int | The entity's unique KB identifier | +| `entity_` | str | The entity's unique KB identifier | +| `alias` | int | The alias or textual mention | +| `alias_` | str | The alias or textual mention | +| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | +| `entity_freq` | long | The frequency of the entity in a typical corpus | +| `entity_vector` | vector | The pretrained vector of the entity | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 703a0f678..496c89776 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -49,11 +49,11 @@ contain arbitrary whitespace. Alignment into the original string is preserved. > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------------------- | -| `text` | unicode | The text to be processed. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Doc` | A container for accessing the annotations. 
| +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------------------- | +| `text` | str | The text to be processed. | +| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| **RETURNS** | `Doc` | A container for accessing the annotations. | @@ -201,7 +201,7 @@ Create a pipeline component from a factory. | Name | Type | Description | | ----------- | -------- | ---------------------------------------------------------------------------------- | -| `name` | unicode | Factory name to look up in [`Language.factories`](/api/language#class-attributes). | +| `name` | str | Factory name to look up in [`Language.factories`](/api/language#class-attributes). | | `config` | dict | Configuration parameters to initialize component. | | **RETURNS** | callable | The pipeline component. | @@ -224,9 +224,9 @@ take a `Doc` object, modify it and return it. Only one of `before`, `after`, | Name | Type | Description | | ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `component` | callable | The pipeline component. | -| `name` | unicode | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. | -| `before` | unicode | Component name to insert component directly before. | -| `after` | unicode | Component name to insert component directly after: | +| `name` | str | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. 
An error is raised if the name already exists in the pipeline. | +| `before` | str | Component name to insert component directly before. | +| `after` | str | Component name to insert component directly after. | | `first` | bool | Insert component first / not first in the pipeline. | | `last` | bool | Insert component last / not last in the pipeline. | @@ -243,10 +243,10 @@ Check whether a component is present in the pipeline. Equivalent to > assert nlp.has_pipe("component") > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------------- | -| `name` | unicode | Name of the pipeline component to check. | -| **RETURNS** | bool | Whether a component of that name exists in the pipeline. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------------- | +| `name` | str | Name of the pipeline component to check. | +| **RETURNS** | bool | Whether a component of that name exists in the pipeline. | ## Language.get_pipe {#get_pipe tag="method" new="2"} @@ -261,7 +261,7 @@ Get a pipeline component for a given component name. | Name | Type | Description | | ----------- | -------- | -------------------------------------- | -| `name` | unicode | Name of the pipeline component to get. | +| `name` | str | Name of the pipeline component to get. | | **RETURNS** | callable | The pipeline component. | ## Language.replace_pipe {#replace_pipe tag="method" new="2"} @@ -276,7 +276,7 @@ Replace a component in the pipeline. | Name | Type | Description | | ----------- | -------- | --------------------------------- | -| `name` | unicode | Name of the component to replace. | +| `name` | str | Name of the component to replace. | | `component` | callable | The pipeline component to insert. 
| ## Language.rename_pipe {#rename_pipe tag="method" new="2"} @@ -292,10 +292,10 @@ added to the pipeline, you can also use the `name` argument on > nlp.rename_pipe("parser", "spacy_parser") > ``` -| Name | Type | Description | -| ---------- | ------- | -------------------------------- | -| `old_name` | unicode | Name of the component to rename. | -| `new_name` | unicode | New name of the component. | +| Name | Type | Description | +| ---------- | ---- | -------------------------------- | +| `old_name` | str | Name of the component to rename. | +| `new_name` | str | New name of the component. | ## Language.remove_pipe {#remove_pipe tag="method" new="2"} @@ -309,10 +309,10 @@ component function. > assert name == "parser" > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| `name` | unicode | Name of the component to remove. | -| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | +| Name | Type | Description | +| ----------- | ----- | ----------------------------------------------------- | +| `name` | str | Name of the component to remove. | +| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | ## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"} @@ -342,12 +342,11 @@ latter case, all components not in the `enable` list, will be disabled. | Name | Type | Description | | ----------- | --------------- | ------------------------------------------------------------------------------------ | | `disable` | list | Names of pipeline components to disable. | -| `disable` | unicode | Name of pipeline component to disable. | +| `disable` | str | Name of pipeline component to disable. | | `enable` | list | Names of pipeline components that will not be disabled. | -| `enable` | unicode | Name of pipeline component that will not be disabled. | +| `enable` | str | Name of pipeline component that will not be disabled. 
| | **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | - As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: @@ -370,10 +369,10 @@ the model**. > nlp.to_disk("/path/to/models") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | ## Language.from_disk {#from_disk tag="method" new="2"} @@ -395,11 +394,11 @@ loaded object. > nlp = English().from_disk("/path/to/en_model") > ``` -| Name | Type | Description | -| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The modified `Language` object. | +| Name | Type | Description | +| ----------- | ------------ | ----------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. 
Paths may be either strings or `Path`-like objects. | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The modified `Language` object. | @@ -480,11 +479,11 @@ per component. ## Class attributes {#class-attributes} -| Name | Type | Description | -| -------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | -| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | -| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. | +| Name | Type | Description | +| -------------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | +| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | +| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index f43e17fd3..16cd624f5 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -63,8 +63,8 @@ Lemmatize a string. 
| Name | Type | Description | | ------------ | ------------- | -------------------------------------------------------------------------------------------------------- | -| `string` | unicode | The string to lemmatize, e.g. the token text. | -| `univ_pos` | unicode / int | The token's universal part-of-speech tag. | +| `string` | str | The string to lemmatize, e.g. the token text. | +| `univ_pos` | str / int | The token's universal part-of-speech tag. | | `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. | | **RETURNS** | list | The available lemmas for the string. | @@ -82,11 +82,11 @@ original string is returned. Languages can provide a > assert lemmatizer.lookup("going") == "go" > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- | -| `string` | unicode | The string to look up. | -| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | -| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- | +| `string` | str | The string to look up. | +| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | +| **RETURNS** | str | The lemma if the string was found, otherwise the original string. | ## Lemmatizer.is_base_form {#is_base_form tag="method"} @@ -102,11 +102,11 @@ lemmatization entirely. 
> assert is_base_form == True > ``` -| Name | Type | Description | -| ------------ | ------------- | --------------------------------------------------------------------------------------- | -| `univ_pos` | unicode / int | The token's universal part-of-speech tag. | -| `morphology` | dict | The token's morphological features. | -| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. | +| Name | Type | Description | +| ------------ | --------- | --------------------------------------------------------------------------------------- | +| `univ_pos` | str / int | The token's universal part-of-speech tag. | +| `morphology` | dict | The token's morphological features. | +| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. | ## Attributes {#attributes} diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md index bd3b38303..b91d92646 100644 --- a/website/docs/api/lookups.md +++ b/website/docs/api/lookups.md @@ -56,10 +56,10 @@ Check if the lookups contain a table of a given name. Delegates to > assert "some_table" in lookups > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------- | -| `name` | unicode | Name of the table. | -| **RETURNS** | bool | Whether a table of that name is in the lookups. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------- | +| `name` | str | Name of the table. | +| **RETURNS** | bool | Whether a table of that name is in the lookups. | ## Lookups.tables {#tables tag="property"} @@ -91,7 +91,7 @@ exists. | Name | Type | Description | | ----------- | ----------------------------- | ---------------------------------- | -| `name` | unicode | Unique name of the table. | +| `name` | str | Unique name of the table. | | `data` | dict | Optional data to add to the table. 
| | **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. | @@ -110,7 +110,7 @@ Get a table from the lookups. Raises an error if the table doesn't exist. | Name | Type | Description | | ----------- | ----------------------------- | ------------------ | -| `name` | unicode | Name of the table. | +| `name` | str | Name of the table. | | **RETURNS** | [`Table`](/api/lookups#table) | The table. | ## Lookups.remove_table {#remove_table tag="method"} @@ -128,7 +128,7 @@ Remove a table from the lookups. Raises an error if the table doesn't exist. | Name | Type | Description | | ----------- | ----------------------------- | ---------------------------- | -| `name` | unicode | Name of the table to remove. | +| `name` | str | Name of the table to remove. | | **RETURNS** | [`Table`](/api/lookups#table) | The removed table. | ## Lookups.has_table {#has_table tag="method"} @@ -144,10 +144,10 @@ Check if the lookups contain a table of a given name. Equivalent to > assert lookups.has_table("some_table") > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------- | -| `name` | unicode | Name of the table. | -| **RETURNS** | bool | Whether a table of that name is in the lookups. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------- | +| `name` | str | Name of the table. | +| **RETURNS** | bool | Whether a table of that name is in the lookups. | ## Lookups.to_bytes {#to_bytes tag="method"} @@ -191,9 +191,9 @@ which will be created if it doesn't exist. > lookups.to_disk("/path/to/lookups") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| +| Name | Type | Description | +| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## Lookups.from_disk {#from_disk tag="method"} @@ -208,10 +208,10 @@ the file doesn't exist. > lookups.from_disk("/path/to/lookups") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Lookups` | The loaded lookups. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Lookups` | The loaded lookups. | ## Table {#table tag="class, ordererddict"} @@ -238,7 +238,7 @@ Initialize a new table. | Name | Type | Description | | ----------- | ------- | ---------------------------------- | -| `name` | unicode | Optional table name for reference. | +| `name` | str | Optional table name for reference. | | **RETURNS** | `Table` | The newly constructed object. | ### Table.from_dict {#table.from_dict tag="classmethod"} @@ -256,7 +256,7 @@ Initialize a new table from a dict. | Name | Type | Description | | ----------- | ------- | ---------------------------------- | | `data` | dict | The dictionary. | -| `name` | unicode | Optional table name for reference. | +| `name` | str | Optional table name for reference. | | **RETURNS** | `Table` | The newly constructed object. | ### Table.set {#table.set tag="method"} @@ -273,10 +273,10 @@ Set a new key / value pair. String keys will be hashed. 
Same as > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ------- | ------------- | ----------- | -| `key` | unicode / int | The key. | -| `value` | - | The value. | +| Name | Type | Description | +| ------- | --------- | ----------- | +| `key` | str / int | The key. | +| `value` | - | The value. | ### Table.to_bytes {#table.to_bytes tag="method"} @@ -313,6 +313,6 @@ Load a table from a bytestring. | Name | Type | Description | | -------------- | --------------------------- | ----------------------------------------------------- | -| `name` | unicode | Table name. | +| `name` | str | Table name. | | `default_size` | int | Default size of bloom filters if no data is provided. | | `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index bfd4fb0ec..8a872558c 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -125,10 +125,10 @@ Check whether the matcher contains rules for a match ID. > assert 'Rule' in matcher > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| `key` | unicode | The match ID. | -| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------------- | +| `key` | str | The match ID. | +| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | ## Matcher.add {#add tag="method" new="2"} @@ -153,7 +153,7 @@ overwritten. | Name | Type | Description | | ----------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | unicode | An ID for the thing you're matching. | +| `match_id` | str | An ID for the thing you're matching. | | `on_match` | callable or `None` | Callback function to act on matches. 
Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | @@ -188,9 +188,9 @@ exist. > assert "Rule" not in matcher > ``` -| Name | Type | Description | -| ----- | ------- | ------------------------- | -| `key` | unicode | The ID of the match rule. | +| Name | Type | Description | +| ----- | ---- | ------------------------- | +| `key` | str | The ID of the match rule. | ## Matcher.get {#get tag="method" new="2"} @@ -204,7 +204,7 @@ Retrieve the pattern stored for a key. Returns the rule as an > on_match, patterns = matcher.get("Rule") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------- | -| `key` | unicode | The ID of the match rule. | -| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------- | +| `key` | str | The ID of the match rule. | +| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index a72277420..fa6729f41 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -133,10 +133,10 @@ Check whether the matcher contains rules for a match ID. > assert "OBAMA" in matcher > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| `key` | unicode | The match ID. | -| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------------------- | +| `key` | str | The match ID. | +| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | ## PhraseMatcher.add {#add tag="method"} @@ -162,7 +162,7 @@ overwritten. 
| Name | Type | Description | | ---------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | unicode | An ID for the thing you're matching. | +| `match_id` | str | An ID for the thing you're matching. | | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `*docs` | `Doc` | `Doc` objects of the phrases to match. | @@ -198,6 +198,6 @@ does not exist. > assert "OBAMA" not in matcher > ``` -| Name | Type | Description | -| ----- | ------- | ------------------------- | -| `key` | unicode | The ID of the match rule. | +| Name | Type | Description | +| ----- | ---- | ------------------------- | +| `key` | str | The ID of the match rule. | diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 6e2b473b1..fc417845c 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -112,8 +112,8 @@ end of the pipeline and after all other components. -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | -| `label` | unicode | The subtoken dependency label. Defaults to `"subtok"`. | -| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------ | +| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | +| `label` | str | The subtoken dependency label. Defaults to `"subtok"`. | +| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. 
| diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index c9b935f22..03e843fcc 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -81,9 +81,9 @@ a file `sentencizer.json`. This also happens automatically when you save an > sentencizer.to_disk("/path/to/sentencizer.jsonl") > ``` -| Name | Type | Description | -| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## Sentencizer.from_disk {#from_disk tag="method"} @@ -98,10 +98,10 @@ added to its pipeline. > sentencizer.from_disk("/path/to/sentencizer.json") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | +| Name | Type | Description | +| ----------- | ------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. 
| ## Sentencizer.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 3833bbca9..c41d9aa03 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -110,7 +110,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -132,10 +132,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | ## Span.has_extension {#has_extension tag="classmethod" new="2"} @@ -149,10 +149,10 @@ Check whether an extension has been registered on the `Span` class. 
> assert Span.has_extension("is_city") > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------ | -| `name` | unicode | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| `name` | str | Name of the extension to check. | +| **RETURNS** | bool | Whether the extension has been registered. | ## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} @@ -167,10 +167,10 @@ Remove a previously registered extension. > assert not Span.has_extension("is_city") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | ## Span.char_span {#char_span tag="method" new="2.2.4"} @@ -497,16 +497,16 @@ The L2 norm of the span's vector representation. | `end` | int | The token offset for the end of the span. | | `start_char` | int | The character offset for the start of the span. | | `end_char` | int | The character offset for the end of the span. | -| `text` | unicode | A unicode representation of the span text. | -| `text_with_ws` | unicode | The text content of the span with a trailing whitespace character if the last token has one. | +| `text` | str | A unicode representation of the span text. | +| `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. | | `orth` | int | ID of the verbatim text content. 
| -| `orth_` | unicode | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | +| `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | | `label` | int | The hash value of the span's label. | -| `label_` | unicode | The span's label. | -| `lemma_` | unicode | The span's lemma. | +| `label_` | str | The span's label. | +| `lemma_` | str | The span's lemma. | | `kb_id` | int | The hash value of the knowledge base ID referred to by the span. | -| `kb_id_` | unicode | The knowledge base ID referred to by the span. | +| `kb_id_` | str | The knowledge base ID referred to by the span. | | `ent_id` | int | The hash value of the named entity the token is an instance of. | -| `ent_id_` | unicode | The string ID of the named entity the token is an instance of. | +| `ent_id_` | str | The string ID of the named entity the token is an instance of. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the span. | | `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index 268f19125..922174c78 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -55,7 +55,7 @@ Retrieve a string from a given hash, or vice versa. | Name | Type | Description | | -------------- | ------------------------ | -------------------------- | | `string_or_id` | bytes, unicode or uint64 | The value to encode. | -| **RETURNS** | unicode or int | The value to be retrieved. | +| **RETURNS** | str or int | The value to be retrieved. | ## StringStore.\_\_contains\_\_ {#contains tag="method"} @@ -69,10 +69,10 @@ Check whether a string is in the store. 
> assert not "cherry" in stringstore > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------- | -| `string` | unicode | The string to check. | -| **RETURNS** | bool | Whether the store contains the string. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------- | +| `string` | str | The string to check. | +| **RETURNS** | bool | Whether the store contains the string. | ## StringStore.\_\_iter\_\_ {#iter tag="method"} @@ -87,9 +87,9 @@ store will always include an empty string `''` at position `0`. > assert all_strings == ["apple", "orange"] > ``` -| Name | Type | Description | -| ---------- | ------- | ---------------------- | -| **YIELDS** | unicode | A string in the store. | +| Name | Type | Description | +| ---------- | ---- | ---------------------- | +| **YIELDS** | str | A string in the store. | ## StringStore.add {#add tag="method" new="2"} @@ -106,10 +106,10 @@ Add a string to the `StringStore`. > assert stringstore["banana"] == banana_hash > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------ | -| `string` | unicode | The string to add. | -| **RETURNS** | uint64 | The string's hash value. | +| Name | Type | Description | +| ----------- | ------ | ------------------------ | +| `string` | str | The string to add. | +| **RETURNS** | uint64 | The string's hash value. | ## StringStore.to_disk {#to_disk tag="method" new="2"} @@ -121,9 +121,9 @@ Save the current state to a directory. > stringstore.to_disk("/path/to/strings") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| +| Name | Type | Description | +| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## StringStore.from_disk {#from_disk tag="method" new="2"} @@ -136,10 +136,10 @@ Loads state from a directory. Modifies the object in place and returns it. > stringstore = StringStore().from_disk("/path/to/strings") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `StringStore` | The modified `StringStore` object. | +| Name | Type | Description | +| ----------- | ------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `StringStore` | The modified `StringStore` object. | ## StringStore.to_bytes {#to_bytes tag="method"} @@ -185,7 +185,7 @@ Get a 64-bit hash for a given string. > assert hash_string("apple") == 8566208034543834098 > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------- | -| `string` | unicode | The string to hash. | -| **RETURNS** | uint64 | The hash. | +| Name | Type | Description | +| ----------- | ------ | ------------------- | +| `string` | str | The string to hash. | +| **RETURNS** | uint64 | The hash. | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index bd3382f89..f14da3ac5 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -229,10 +229,10 @@ Add a new label to the pipe. 
> tagger.add_label("MY_LABEL", {POS: 'NOUN'}) > ``` -| Name | Type | Description | -| -------- | ------- | --------------------------------------------------------------- | -| `label` | unicode | The label to add. | -| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. | +| Name | Type | Description | +| -------- | ---- | --------------------------------------------------------------- | +| `label` | str | The label to add. | +| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. | ## Tagger.to_disk {#to_disk tag="method"} @@ -245,10 +245,10 @@ Serialize the pipe to disk. > tagger.to_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Tagger.from_disk {#from_disk tag="method"} @@ -261,11 +261,11 @@ Load the pipe from disk. Modifies the object in place and returns it. > tagger.from_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
| -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The modified `Tagger` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tagger` | The modified `Tagger` object. | ## Tagger.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 1a0280265..dc1c083ac 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -44,7 +44,7 @@ shortcut for this and instantiate the component using its string name and | `vocab` | `Vocab` | The shared vocabulary. | | `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | | `exclusive_classes` | bool | Make categories mutually exclusive. Defaults to `False`. | -| `architecture` | unicode | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. | +| `architecture` | str | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. | | **RETURNS** | `TextCategorizer` | The newly constructed object. | ### Architectures {#architectures new="2.1"} @@ -247,9 +247,9 @@ Add a new label to the pipe. > textcat.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ------- | ------- | ----------------- | -| `label` | unicode | The label to add. | +| Name | Type | Description | +| ------- | ---- | ----------------- | +| `label` | str | The label to add. 
| ## TextCategorizer.to_disk {#to_disk tag="method"} @@ -262,10 +262,10 @@ Serialize the pipe to disk. > textcat.to_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## TextCategorizer.from_disk {#from_disk tag="method"} @@ -280,7 +280,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ----------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. 
| diff --git a/website/docs/api/token.md b/website/docs/api/token.md index c30c01c20..1accbe062 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -58,7 +58,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `token._.compare(other_token)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -80,10 +80,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | ## Token.has_extension {#has_extension tag="classmethod" new="2"} @@ -97,10 +97,10 @@ Check whether an extension has been registered on the `Token` class. 
> assert Token.has_extension("is_fruit") > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------ | -| `name` | unicode | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| `name` | str | Name of the extension to check. | +| **RETURNS** | bool | Whether the extension has been registered. | ## Token.remove_extension {#remove_extension tag="classmethod" new=""2.0.11""} @@ -115,10 +115,10 @@ Remove a previously registered extension. > assert not Token.has_extension("is_fruit") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | ## Token.check_flag {#check_flag tag="method"} @@ -408,71 +408,71 @@ The L2 norm of the token's vector representation. ## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | -| `text` | unicode | Verbatim text content. | -| `text_with_ws` | unicode | Text content, with trailing space character if present. 
| -| `whitespace_` | unicode | Trailing space character if present. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | -| `head` | `Token` | The syntactic parent, or "governor", of this token. | -| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | -| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | -| `i` | int | The index of the token within the parent document. | -| `ent_type` | int | Named entity type. | -| `ent_type_` | unicode | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | -| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | -| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_kb_id_` 2.2 | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `lemma` | int | Base form of the token, with no inflectional suffixes. | -| `lemma_` | unicode | Base form of the token, with no inflectional suffixes. | -| `norm` | int | The token's norm, i.e. a normalized form of the token text. 
Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `lower` | int | Lowercase form of the token. | -| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | +| Name | Type | Description | +| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | +| `text` | str | Verbatim text content. | +| `text_with_ws` | str | Text content, with trailing space character if present. | +| `whitespace_` | str | Trailing space character if present. | +| `orth` | int | ID of the verbatim text content. | +| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `tensor` 2.1.7 | `ndarray` | The token's slice of the parent `Doc`'s tensor. | +| `head` | `Token` | The syntactic parent, or "governor", of this token. | +| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | +| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | +| `i` | int | The index of the token within the parent document. | +| `ent_type` | int | Named entity type. | +| `ent_type_` | str | Named entity type.
| +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `lemma` | int | Base form of the token, with no inflectional suffixes. | +| `lemma_` | str | Base form of the token, with no inflectional suffixes. | +| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `lower` | int | Lowercase form of the token. | +| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | | `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. 
| -| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | -| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | -| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. | -| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | -| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | -| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | -| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | -| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | -| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | -| `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | -| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | -| `is_bracket` | bool | Is the token a bracket? | -| `is_quote` | bool | Is the token a quotation mark? | -| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | -| `like_url` | bool | Does the token resemble a URL? | -| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the token resemble an email address? 
| -| `is_oov` | bool | Is the token out-of-vocabulary? | -| `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech. | -| `pos_` | unicode | Coarse-grained part-of-speech. | -| `tag` | int | Fine-grained part-of-speech. | -| `tag_` | unicode | Fine-grained part-of-speech. | -| `dep` | int | Syntactic dependency relation. | -| `dep_` | unicode | Syntactic dependency relation. | -| `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | unicode | Language of the parent document's vocabulary. | -| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | -| `idx` | int | The character offset of the token within the parent document. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | -| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `cluster` | int | Brown cluster ID. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | +| `shape_` | str | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | +| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | +| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | +| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | +| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`.
| +| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | +| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | +| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | +| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | +| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | +| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | +| `is_punct` | bool | Is the token punctuation? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | +| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | +| `is_bracket` | bool | Is the token a bracket? | +| `is_quote` | bool | Is the token a quotation mark? | +| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | +| `like_url` | bool | Does the token resemble a URL? | +| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | +| `like_email` | bool | Does the token resemble an email address? | +| `is_oov` | bool | Is the token out-of-vocabulary? | +| `is_stop` | bool | Is the token part of a "stop list"? | +| `pos` | int | Coarse-grained part-of-speech. | +| `pos_` | str | Coarse-grained part-of-speech. | +| `tag` | int | Fine-grained part-of-speech. | +| `tag_` | str | Fine-grained part-of-speech. | +| `dep` | int | Syntactic dependency relation. | +| `dep_` | str | Syntactic dependency relation. | +| `lang` | int | Language of the parent document's vocabulary. | +| `lang_` | str | Language of the parent document's vocabulary. | +| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). 
| +| `idx` | int | The character offset of the token within the parent document. | +| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | +| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `cluster` | int | Brown cluster ID. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 7462af739..c71f849ad 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,15 +34,15 @@ the > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` -| Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | ----------- | ------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. 
| +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -55,10 +55,10 @@ Tokenize a string. > assert len(tokens) == 4 > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------- | -| `string` | unicode | The string to tokenize. | -| **RETURNS** | `Doc` | A container for linguistic annotations. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------- | +| `string` | str | The string to tokenize. | +| **RETURNS** | `Doc` | A container for linguistic annotations. | ## Tokenizer.pipe {#pipe tag="method"} @@ -82,20 +82,20 @@ Tokenize a stream of texts. Find internal split points of the string. -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | -| `string` | unicode | The string to split. | -| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `string` | str | The string to split. 
| +| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. | ## Tokenizer.find_prefix {#find_prefix tag="method"} Find the length of a prefix that should be segmented from the string, or `None` if no prefix rules match. -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------ | -| `string` | unicode | The string to segment. | -| **RETURNS** | int | The length of the prefix if present, otherwise `None`. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------------------ | +| `string` | str | The string to segment. | +| **RETURNS** | int | The length of the prefix if present, otherwise `None`. | ## Tokenizer.find_suffix {#find_suffix tag="method"} @@ -104,7 +104,7 @@ if no suffix rules match. | Name | Type | Description | | ----------- | ------------ | ------------------------------------------------------ | -| `string` | unicode | The string to segment. | +| `string` | str | The string to segment. | | **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. | ## Tokenizer.add_special_case {#add_special_case tag="method"} @@ -125,7 +125,7 @@ and examples. | Name | Type | Description | | ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `string` | unicode | The string to specially tokenize. | +| `string` | str | The string to specially tokenize. | | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. 
| ## Tokenizer.explain {#explain tag="method"} @@ -142,10 +142,10 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens. > assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"] > ``` -| Name | Type | Description | -| ------------| -------- | --------------------------------------------------- | -| `string` | unicode | The string to tokenize with the debugging tokenizer | -| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | +| Name | Type | Description | +| ----------- | ---- | --------------------------------------------------- | +| `string` | str | The string to tokenize with the debugging tokenizer | +| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | ## Tokenizer.to_disk {#to_disk tag="method"} @@ -158,10 +158,10 @@ Serialize the tokenizer to disk. > tokenizer.to_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Tokenizer.from_disk {#from_disk tag="method"} @@ -174,11 +174,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it. 
> tokenizer.from_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | ## Tokenizer.to_bytes {#to_bytes tag="method"} @@ -217,14 +217,14 @@ it. ## Attributes {#attributes} -| Name | Type | Description | -| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | -| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | -| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | -| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an `re.MatchObject` or `None. | -| `rules` | dict | A dictionary of tokenizer exceptions and special cases. 
| +| Name | Type | Description | +| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | +| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | +| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | +| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. | +| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 2360ad472..bdd094021 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -32,11 +32,11 @@ class. The data will be loaded in via > nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"]) > ``` -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------------------------- | -| `name` | unicode / `Path` | Model to load, i.e. shortcut link, package name or path. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Language` | A `Language` object with the loaded model. | +| Name | Type | Description | +| ----------- | ------------ | --------------------------------------------------------------------------------- | +| `name` | str / `Path` | Model to load, i.e. shortcut link, package name or path. 
| +| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| **RETURNS** | `Language` | A `Language` object with the loaded model. | Essentially, `spacy.load()` is a convenience wrapper that reads the language ID and pipeline components from a model's `meta.json`, initializes the `Language` @@ -79,7 +79,7 @@ Create a blank model of a given language class. This function is the twin of | Name | Type | Description | | ----------- | ---------- | ------------------------------------------------------------------------------------------------ | -| `name` | unicode | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | +| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | | `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. | @@ -98,10 +98,10 @@ meta data as a dictionary instead, you can use the `meta` attribute on your > spacy.info("de", markdown=True) > ``` -| Name | Type | Description | -| ---------- | ------- | ------------------------------------------------------------- | -| `model` | unicode | A model, i.e. shortcut link, package name or path (optional). | -| `markdown` | bool | Print information as Markdown. | +| Name | Type | Description | +| ---------- | ---- | ------------------------------------------------------------- | +| `model` | str | A model, i.e. shortcut link, package name or path (optional). | +| `markdown` | bool | Print information as Markdown. | ### spacy.explain {#spacy.explain tag="function"} @@ -122,10 +122,10 @@ list of available terms, see > # world NN noun, singular or mass > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------------------- | -| `term` | unicode | Term to explain. 
| -| **RETURNS** | unicode | The explanation, or `None` if not found in the glossary. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------------------------- | +| `term` | str | Term to explain. | +| **RETURNS** | str | The explanation, or `None` if not found in the glossary. | ### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"} @@ -189,13 +189,13 @@ browser. Will run a simple web server. | Name | Type | Description | Default | | --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- | | `docs` | list, `Doc`, `Span` | Document(s) to visualize. | -| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | +| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `page` | bool | Render markup as full HTML page. | `True` | | `minify` | bool | Minify HTML markup. | `False` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `port` | int | Port to serve visualization. | `5000` | -| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` | +| `host` | str | Host to serve visualization. | `'0.0.0.0'` | ### displacy.render {#displacy.render tag="method" new="2"} @@ -214,13 +214,13 @@ Render a dependency parse tree or named entity visualization. | Name | Type | Description | Default | | ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | `docs` | list, `Doc`, `Span` | Document(s) to visualize. | -| `style` | unicode | Visualization style, `'dep'` or `'ent'`. 
| `'dep'` | +| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `page` | bool | Render markup as full HTML page. | `False` | | `minify` | bool | Minify HTML markup. | `False` | | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | -| **RETURNS** | unicode | Rendered HTML markup. | +| **RETURNS** | str | Rendered HTML markup. | ### Visualizer options {#displacy_options} @@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Type | Description | Default | -| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | -| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | -| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | -| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | -| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | -| `font` | unicode | Font name or font family for all text. 
| `'Arial'` | -| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | -| `arrow_stroke` | int | Width of arrow path in px. | `2` | -| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | -| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | -| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | -| `distance` | int | Distance between words in px. | `175` / `150` (compact) | +| Name | Type | Description | Default | +| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | +| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | +| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | +| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | +| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | str | Text color (HEX, RGB or color names). | `'#000000'` | +| `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` | +| `font` | str | Font name or font family for all text. | `'Arial'` | +| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | +| `arrow_stroke` | int | Width of arrow path in px. | `2` | +| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | +| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | +| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | +| `distance` | int | Distance between words in px. 
| `175` / `150` (compact) | #### Named Entity Visualizer options {#displacy_options-ent} @@ -263,11 +263,11 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Type | Description | Default | -| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | -| `ents` | list | Entity types to highlight (`None` for all types). | `None` | -| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | -| `template` 2.2 | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | +| Name | Type | Description | Default | +| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | +| `ents` | list | Entity types to highlight (`None` for all types). | `None` | +| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | +| `template` 2.2 | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | By default, displaCy comes with colors for all [entity types supported by spaCy](/api/annotation#named-entities). 
If you're @@ -308,9 +308,9 @@ Set custom path to the data directory where spaCy looks for models. > # PosixPath('/custom/path') > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------- | -| `path` | unicode / `Path` | Path to new data directory. | +| Name | Type | Description | +| ------ | ------------ | --------------------------- | +| `path` | str / `Path` | Path to new data directory. | ### util.get_lang_class {#util.get_lang_class tag="function"} @@ -330,7 +330,7 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper. | Name | Type | Description | | ----------- | ---------- | -------------------------------------- | -| `lang` | unicode | Two-letter language code, e.g. `'en'`. | +| `lang` | str | Two-letter language code, e.g. `'en'`. | | **RETURNS** | `Language` | Language class. | ### util.set_lang_class {#util.set_lang_class tag="function"} @@ -352,7 +352,7 @@ the two-letter language code. | Name | Type | Description | | ------ | ---------- | -------------------------------------- | -| `name` | unicode | Two-letter language code, e.g. `'en'`. | +| `name` | str | Two-letter language code, e.g. `'en'`. | | `cls` | `Language` | The language class, e.g. `English`. | ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} @@ -368,10 +368,10 @@ loaded lazily, to avoid expensive setup code associated with the language data. > assert util.lang_class_is_loaded("de") is False > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------- | -| `name` | unicode | Two-letter language code, e.g. `'en'`. | -| **RETURNS** | bool | Whether the class has been loaded. | +| Name | Type | Description | +| ----------- | ---- | -------------------------------------- | +| `name` | str | Two-letter language code, e.g. `'en'`. | +| **RETURNS** | bool | Whether the class has been loaded. 
| ### util.load_model {#util.load_model tag="function" new="2"} @@ -392,7 +392,7 @@ in via [`Language.from_disk()`](/api/language#from_disk). | Name | Type | Description | | ------------- | ---------- | -------------------------------------------------------- | -| `name` | unicode | Package name, shortcut link or model path. | +| `name` | str | Package name, shortcut link or model path. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | @@ -411,7 +411,7 @@ it easy to test a new model that you haven't packaged yet. | Name | Type | Description | | ------------- | ---------- | ---------------------------------------------------------------------------------------------------- | -| `model_path` | unicode | Path to model data directory. | +| `model_path` | str | Path to model data directory. | | `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | @@ -432,7 +432,7 @@ A helper function to use in the `load()` method of a model package's | Name | Type | Description | | ------------- | ---------- | -------------------------------------------------------- | -| `init_file` | unicode | Path to model's `__init__.py`, i.e. `__file__`. | +| `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | @@ -446,10 +446,10 @@ Get a model's meta.json from a directory path and validate its contents. > meta = util.get_model_meta("/path/to/model") > ``` -| Name | Type | Description | -| ----------- | ---------------- | ------------------------ | -| `path` | unicode / `Path` | Path to model directory. 
| -| **RETURNS** | dict | The model's meta data. | +| Name | Type | Description | +| ----------- | ------------ | ------------------------ | +| `path` | str / `Path` | Path to model directory. | +| **RETURNS** | dict | The model's meta data. | ### util.is_package {#util.is_package tag="function"} @@ -463,10 +463,10 @@ Check if string maps to a package installed via pip. Mainly used to validate > util.is_package("xyz") # False > ``` -| Name | Type | Description | -| ----------- | ------- | -------------------------------------------- | -| `name` | unicode | Name of package. | -| **RETURNS** | `bool` | `True` if installed package, `False` if not. | +| Name | Type | Description | +| ----------- | ------ | -------------------------------------------- | +| `name` | str | Name of package. | +| **RETURNS** | `bool` | `True` if installed package, `False` if not. | ### util.get_package_path {#util.get_package_path tag="function" new="2"} @@ -480,10 +480,10 @@ Get path to an installed package. Mainly used to resolve the location of > # /usr/lib/python3.6/site-packages/en_core_web_sm > ``` -| Name | Type | Description | -| -------------- | ------- | -------------------------------- | -| `package_name` | unicode | Name of installed package. | -| **RETURNS** | `Path` | Path to model package directory. | +| Name | Type | Description | +| -------------- | ------ | -------------------------------- | +| `package_name` | str | Name of installed package. | +| **RETURNS** | `Path` | Path to model package directory. | ### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"} diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 93e747c1e..d4c0269ef 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -35,7 +35,7 @@ you can add vectors to later. | `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | | `keys` | iterable | A sequence of keys aligned with the data. 
| | `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. | -| `name` | unicode | A name to identify the vectors table. | +| `name` | str | A name to identify the vectors table. | | **RETURNS** | `Vectors` | The newly created object. | ## Vectors.\_\_getitem\_\_ {#getitem tag="method"} @@ -140,7 +140,7 @@ mapping separately. If you need to manage the strings, you should use the | Name | Type | Description | | ----------- | ---------------------------------- | ----------------------------------------------------- | -| `key` | unicode / int | The key to add. | +| `key` | str / int | The key to add. | | `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. | | `row` | int | An optional row number of a vector to map the key to. | | **RETURNS** | int | The row the vector was added to. | @@ -227,7 +227,7 @@ Look up one or more keys by row, or vice versa. | Name | Type | Description | | ----------- | ------------------------------------- | ------------------------------------------------------------------------ | -| `key` | unicode / int | Find the row that the given key points to. Returns int, `-1` if missing. | +| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. | | `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. | | `row` | int | Find the first key that points to the row. Returns int. | | `rows` | iterable | Find the keys that point to the rows. Returns ndarray. | @@ -337,9 +337,9 @@ Save the current state to a directory. > > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. 
Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## Vectors.from_disk {#from_disk tag="method"} @@ -352,10 +352,10 @@ Loads state from a directory. Modifies the object in place and returns it. > vectors.from_disk("/path/to/vectors") > ``` -| Name | Type | Description | -| ----------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Vectors` | The modified `Vectors` object. | +| Name | Type | Description | +| ----------- | ------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Vectors` | The modified `Vectors` object. | ## Vectors.to_bytes {#to_bytes tag="method"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 685619c88..420e8263a 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -327,11 +327,11 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy).. ### Disabling the parser {#disabling} In the [default models](/models), the parser is loaded and enabled as part of -the [standard processing pipeline](/usage/processing-pipelines). If you don't need -any of the syntactic information, you should disable the parser. Disabling the -parser will make spaCy load and run much faster. 
If you want to load the parser, -but need to disable it for specific documents, you can also control its use on -the `nlp` object. +the [standard processing pipeline](/usage/processing-pipelines). If you don't +need any of the syntactic information, you should disable the parser. Disabling +the parser will make spaCy load and run much faster. If you want to load the +parser, but need to disable it for specific documents, you can also control its +use on the `nlp` object. ```python nlp = spacy.load("en_core_web_sm", disable=["parser"]) @@ -990,10 +990,10 @@ nlp = spacy.load("en_core_web_sm") nlp.tokenizer = my_tokenizer ``` -| Argument | Type | Description | -| ----------- | ------- | ------------------------- | -| `text` | unicode | The raw text to tokenize. | -| **RETURNS** | `Doc` | The tokenized document. | +| Argument | Type | Description | +| ----------- | ----- | ------------------------- | +| `text` | str | The raw text to tokenize. | +| **RETURNS** | `Doc` | The tokenized document. | diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 696e11106..e7aca3981 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -272,16 +272,16 @@ doc = nlp("I won't have named entities") disabled.restore() ``` -If you want to disable all pipes except for one or a few, you can use the `enable` -keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string -defining just one pipe. +If you want to disable all pipes except for one or a few, you can use the +`enable` keyword. Just like the `disable` keyword, it takes a list of pipe +names, or a string defining just one pipe. 
+ ```python # Enable only the parser with nlp.select_pipes(enable="parser"): doc = nlp("I will only be parsed") ``` - Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method to remove pipeline components from an existing pipeline, the [`rename_pipe`](/api/language#rename_pipe) method to rename them, or the @@ -349,12 +349,12 @@ last** in the pipeline, or define a **custom name**. If no name is set and no > nlp.add_pipe(my_component, before="parser") > ``` -| Argument | Type | Description | -| -------- | ------- | ------------------------------------------------------------------------ | -| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). | -| `first` | bool | If set to `True`, component is added **first** in the pipeline. | -| `before` | unicode | String name of component to add the new component **before**. | -| `after` | unicode | String name of component to add the new component **after**. | +| Argument | Type | Description | +| -------- | ---- | ------------------------------------------------------------------------ | +| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). | +| `first` | bool | If set to `True`, component is added **first** in the pipeline. | +| `before` | str | String name of component to add the new component **before**. | +| `after` | str | String name of component to add the new component **after**. 
| ### Example: A simple pipeline component {#custom-components-simple} diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 058204a5d..588782986 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -94,8 +94,8 @@ docs = list(doc_bin.get_docs(nlp.vocab)) If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as well, which includes the values of -[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if -they're serializable with msgpack). +[extension attributes](/usage/processing-pipelines#custom-components-attributes) +(if they're serializable with msgpack). @@ -666,10 +666,10 @@ and lets you customize how the model should be initialized and loaded. You can define the language data to be loaded and the [processing pipeline](/usage/processing-pipelines) to execute. -| Setting | Type | Description | -| ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | unicode | ID of the language class to initialize. | -| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. | +| Setting | Type | Description | +| ---------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | str | ID of the language class to initialize. | +| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. 
| The `load()` method that comes with our model package templates will take care of putting all this together and returning a `Language` object with the loaded diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index dd0b0eb50..9733e09c2 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -67,12 +67,12 @@ arcs. -| Argument | Type | Description | Default | -| --------- | ------- | ----------------------------------------------------------- | ----------- | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `"#000000"` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `"#ffffff"` | -| `font` | unicode | Font name or font family for all text. | `"Arial"` | +| Argument | Type | Description | Default | +| --------- | ---- | ----------------------------------------------------------- | ----------- | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | str | Text color (HEX, RGB or color names). | `"#000000"` | +| `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` | +| `font` | str | Font name or font family for all text. | `"Arial"` | For a list of all available options, see the [`displacy` API documentation](/api/top-level#displacy_options). 
From 1a15896ba9bcb2b12113880929edfb4fdf0683ff Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 May 2020 18:51:10 +0200 Subject: [PATCH 144/187] unicode -> str consistency [ci skip] --- spacy/cli/info.py | 2 +- spacy/displacy/render.py | 2 +- spacy/gold.pyx | 4 ++-- spacy/language.py | 4 ++-- spacy/matcher/dependencymatcher.pyx | 2 +- spacy/matcher/matcher.pyx | 2 +- spacy/matcher/phrasematcher.pyx | 2 +- spacy/pipeline/entityruler.py | 2 +- spacy/strings.pyx | 6 +++--- spacy/tokenizer.pyx | 4 ++-- spacy/tokens/doc.pyx | 4 ++-- spacy/util.py | 8 ++++---- spacy/vocab.pyx | 10 +++++----- 13 files changed, 26 insertions(+), 26 deletions(-) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index d779eb2b3..98fd5cabf 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -65,7 +65,7 @@ def print_markdown(data, title=None): """Print data in GitHub-flavoured Markdown format for issues etc. data (dict or list of tuples): Label/value pairs. - title (unicode or None): Title, will be rendered as headline 2. + title (str / None): Title, will be rendered as headline 2. """ markdown = [] for key, value in data.items(): diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index d3572ce78..ef8632cbc 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -302,7 +302,7 @@ class EntityRenderer(object): text (str): Original text. spans (list): Individual entity spans and their start, end and label. - title (unicode or None): Document title set in Doc.user_data['title']. + title (str / None): Document title set in Doc.user_data['title']. """ markup = "" offset = 0 diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1864b7a04..ecbd13354 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -140,8 +140,8 @@ class GoldCorpus(object): def __init__(self, train, dev, gold_preproc=False, limit=None): """Create a GoldCorpus. - train (unicode or Path): File or directory of training data. - dev (unicode or Path): File or directory of development data. 
+ train (str / Path): File or directory of training data. + dev (str / Path): File or directory of development data. RETURNS (GoldCorpus): The newly created object. """ self.limit = limit diff --git a/spacy/language.py b/spacy/language.py index e3b770723..551b8c9af 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -934,7 +934,7 @@ class Language(object): """Save the current state to a directory. If a model is loaded, this will include the model. - path (unicode or Path): Path to a directory, which will be created if + path (str / Path): Path to a directory, which will be created if it doesn't exist. exclude (list): Names of components or serialization fields to exclude. @@ -968,7 +968,7 @@ class Language(object): returns it. If the saved `Language` object contains a model, the model will be loaded. - path (unicode or Path): A path to a directory. + path (str / Path): A path to a directory. exclude (list): Names of components or serialization fields to exclude. RETURNS (Language): The modified `Language` object. diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 732931380..ddeeedd06 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -194,7 +194,7 @@ cdef class DependencyMatcher: def get(self, key, default=None): """Retrieve the pattern stored for a key. - key (unicode or int): The key to retrieve. + key (str / int): The key to retrieve. RETURNS (tuple): The rule, as an (on_match, patterns) tuple. """ key = self._normalize_key(key) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 225eba9a9..868465b8d 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -165,7 +165,7 @@ cdef class Matcher: def get(self, key, default=None): """Retrieve the pattern stored for a key. - key (unicode or int): The key to retrieve. + key (str / int): The key to retrieve. RETURNS (tuple): The rule, as an (on_match, patterns) tuple. 
""" key = self._normalize_key(key) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index f7ce44ece..aa4534296 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -30,7 +30,7 @@ cdef class PhraseMatcher: """Initialize the PhraseMatcher. vocab (Vocab): The shared vocabulary. - attr (int / unicode): Token attribute to match on. + attr (int / str): Token attribute to match on. validate (bool): Perform additional validation when patterns are added. RETURNS (PhraseMatcher): The newly constructed object. diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index cdacc82f6..bdc009192 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -30,7 +30,7 @@ class EntityRuler(object): nlp (Language): The shared nlp object to pass the vocab to the matchers and process phrase patterns. - phrase_matcher_attr (int / unicode): Token attribute to match on, passed + phrase_matcher_attr (int / str): Token attribute to match on, passed to the internal PhraseMatcher as `attr` validate (bool): Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate` diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 9fe5af154..9e584ce8a 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -109,7 +109,7 @@ cdef class StringStore: """Retrieve a string from a given hash, or vice versa. string_or_id (bytes, unicode or uint64): The value to encode. - Returns (unicode or uint64): The value to be retrieved. + Returns (str / uint64): The value to be retrieved. """ if isinstance(string_or_id, basestring) and len(string_or_id) == 0: return 0 @@ -223,7 +223,7 @@ cdef class StringStore: def to_disk(self, path): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. 
""" path = util.ensure_path(path) @@ -234,7 +234,7 @@ cdef class StringStore: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. Paths may be either + path (str / Path): A path to a directory. Paths may be either strings or `Path`-like objects. RETURNS (StringStore): The modified `StringStore` object. """ diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index b628b1171..538bf60e9 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -693,7 +693,7 @@ cdef class Tokenizer: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. exclude (list): String names of serialization fields to exclude. @@ -707,7 +707,7 @@ cdef class Tokenizer: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. + path (str / Path): A path to a directory. exclude (list): String names of serialization fields to exclude. RETURNS (Tokenizer): The modified `Tokenizer` object. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f6d0dbf4a..31c1e8c82 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -843,7 +843,7 @@ cdef class Doc: def to_disk(self, path, **kwargs): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. exclude (list): String names of serialization fields to exclude. @@ -857,7 +857,7 @@ cdef class Doc: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. Paths may be either + path (str / Path): A path to a directory. 
Paths may be either strings or `Path`-like objects. exclude (list): String names of serialization fields to exclude. RETURNS (Doc): The modified `Doc` object. diff --git a/spacy/util.py b/spacy/util.py index fc5837755..b614c29c7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -269,7 +269,7 @@ def load_config(path, create_objects=False): """Load a Thinc-formatted config file, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. - path (unicode or Path): Path to the config file + path (str / Path): Path to the config file create_objects (bool): Whether to automatically create objects when the config references registry entries. Defaults to False. @@ -286,7 +286,7 @@ def load_config_from_str(string, create_objects=False): """Load a Thinc-formatted config, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. - string (unicode or Path): Text contents of the config file. + string (str / Path): Text contents of the config file. create_objects (bool): Whether to automatically create objects when the config references registry entries. Defaults to False. @@ -302,7 +302,7 @@ def load_config_from_str(string, create_objects=False): def get_model_meta(path): """Get model meta.json from a directory path and validate its contents. - path (unicode or Path): Path to model directory. + path (str / Path): Path to model directory. RETURNS (dict): The model's meta data. """ model_path = ensure_path(path) @@ -321,7 +321,7 @@ def get_model_meta(path): def get_model_config(path): """Get the model's config from a directory path. - path (unicode or Path): Path to model directory. + path (str / Path): Path to model directory. RETURNS (Config): The model's config data. 
""" model_path = ensure_path(path) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ed37f6e98..3a82ab72d 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -336,7 +336,7 @@ cdef class Vocab: If `minn` is defined, then the resulting vector uses Fasttext's subword features by average over ngrams of `orth`. - orth (int / unicode): The hash value of a word, or its unicode string. + orth (int / str): The hash value of a word, or its unicode string. minn (int): Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. maxn (int): Maximum n-gram length used for Fasttext's ngram computation. @@ -389,7 +389,7 @@ cdef class Vocab: """Set a vector for a word in the vocabulary. Words can be referenced by string or int ID. - orth (int / unicode): The word. + orth (int / str): The word. vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set. DOCS: https://spacy.io/api/vocab#set_vector @@ -411,7 +411,7 @@ cdef class Vocab: """Check whether a word has a vector. Returns False if no vectors have been loaded. Words can be looked up by string or int ID. - orth (int / unicode): The word. + orth (int / str): The word. RETURNS (bool): Whether the word has a vector. DOCS: https://spacy.io/api/vocab#has_vector @@ -423,7 +423,7 @@ cdef class Vocab: def to_disk(self, path, exclude=tuple(), **kwargs): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str / Path): A path to a directory, which will be created if it doesn't exist. exclude (list): String names of serialization fields to exclude. @@ -448,7 +448,7 @@ cdef class Vocab: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. + path (str / Path): A path to a directory. exclude (list): String names of serialization fields to exclude. RETURNS (Vocab): The modified `Vocab` object. 
From 4fd087572a1c597781fef8ca4fbcfebed825c0fb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 28 May 2020 12:51:37 +0200 Subject: [PATCH 145/187] WIP: improve model version deps --- spacy/cli/package.py | 2 +- spacy/util.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index cf93c872f..15ae2033c 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -138,7 +138,7 @@ def list_files(data_dir): def list_requirements(meta): parent_package = meta.get('parent_package', 'spacy') - requirements = [parent_package + meta['spacy_version']] + requirements = [parent_package + '>=' + meta['spacy_version']] if 'setup_requires' in meta: requirements += meta['setup_requires'] if 'requirements' in meta: diff --git a/spacy/util.py b/spacy/util.py index b614c29c7..4e468ef9d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -265,6 +265,15 @@ def is_compatible_model(meta): return True +def get_model_version_range(version): + """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy + version. Models are always compatible across patch versions but not + across minor or major versions. + """ + major, minor = split_version(version) + return f">={version},<{major}.{minor + 1}.0" + + def load_config(path, create_objects=False): """Load a Thinc-formatted config file, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. 
From bed62991add4ff12282a00dd1d321441878b27ef Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 14:59:55 +0200 Subject: [PATCH 146/187] Tidy up requirements --- requirements.txt | 5 ++++- setup.cfg | 7 ++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index add083a05..a104b68ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,8 +13,11 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 -importlib_metadata>=0.20; python_version < "3.8" pydantic>=1.3.0,<2.0.0 +# Official Python utilities +setuptools +packaging +importlib_metadata>=0.20; python_version < "3.8" # Development dependencies cython>=0.25 pytest>=4.6.5 diff --git a/setup.cfg b/setup.cfg index eb7608c4e..ae09d071c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,15 +47,16 @@ install_requires = wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 - ml_datasets + ml_datasets>=0.1.1 # Third-party dependencies tqdm>=4.38.0,<5.0.0 - setuptools numpy>=1.15.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 - tqdm>=4.38.0,<5.0.0 + # Official Python utilities + setuptools + packaging importlib_metadata>=0.20; python_version < "3.8" [options.extras_require] From e47e5a4b10e0d3c5b6fed255040cebc019173e39 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:01:58 +0200 Subject: [PATCH 147/187] Use more sophisticated version parsing logic --- spacy/cli/download.py | 7 +++--- spacy/cli/package.py | 4 ++-- spacy/cli/validate.py | 9 ++++---- spacy/language.py | 3 ++- spacy/tests/test_misc.py | 12 ++++++++-- spacy/util.py | 49 ++++++++++++++++------------------------ 6 files changed, 41 insertions(+), 43 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index af132bbbe..3d56822a5 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -5,7 +5,7 @@ import sys from wasabi import msg from .. 
import about -from ..util import is_package +from ..util import is_package, get_base_version def download( @@ -63,8 +63,7 @@ def get_json(url, desc): def get_compatibility(): - version = about.__version__ - version = version.rsplit(".dev", 1)[0] + version = get_base_version(about.__version__) comp_table = get_json(about.__compatibility__, "compatibility table") comp = comp_table["spacy"] if version not in comp: @@ -73,7 +72,7 @@ def get_compatibility(): def get_version(model, comp): - model = model.rsplit(".dev", 1)[0] + model = get_base_version(model) if model not in comp: msg.fail( f"No compatible model found for '{model}' (spaCy v{about.__version__})", diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 15ae2033c..153e61ba3 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -90,7 +90,7 @@ def generate_meta(model_path, existing_meta, msg): ("license", "License", meta.get("license", "MIT")), ] nlp = util.load_model_from_path(Path(model_path)) - meta["spacy_version"] = about.__version__ + meta["spacy_version"] = util.get_model_version_range(about.__version__) meta["pipeline"] = nlp.pipe_names meta["vectors"] = { "width": nlp.vocab.vectors_length, @@ -138,7 +138,7 @@ def list_files(data_dir): def list_requirements(meta): parent_package = meta.get('parent_package', 'spacy') - requirements = [parent_package + '>=' + meta['spacy_version']] + requirements = [parent_package + meta['spacy_version']] if 'setup_requires' in meta: requirements += meta['setup_requires'] if 'requirements' in meta: diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index c39cadc7b..3c49abb3e 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -4,7 +4,7 @@ import requests from wasabi import msg from .. 
import about -from ..util import get_package_version, get_installed_models, split_version +from ..util import get_package_version, get_installed_models, get_base_version from ..util import get_package_path, get_model_meta, is_compatible_model @@ -14,7 +14,7 @@ def validate(): with the installed models. Should be run after `pip install -U spacy`. """ model_pkgs, compat = get_model_pkgs() - spacy_version = about.__version__.rsplit(".dev", 1)[0] + spacy_version = get_base_version(about.__version__) current_compat = compat.get(spacy_version, {}) if not current_compat: msg.warn(f"No compatible models found for v{spacy_version} of spaCy") @@ -78,13 +78,12 @@ def get_model_pkgs(): version = get_package_version(pkg_name) if package in compat: is_compat = version in compat[package] - v_maj, v_min = split_version(about.__version__) - spacy_version = f"{v_maj}.{v_min}" + spacy_version = about.__version__ else: model_path = get_package_path(package) model_meta = get_model_meta(model_path) - is_compat = is_compatible_model(model_meta) spacy_version = model_meta.get("spacy_version", "n/a") + is_compat = is_compatible_model(spacy_version) pkgs[pkg_name] = { "name": package, "version": version, diff --git a/spacy/language.py b/spacy/language.py index 551b8c9af..61d69b63e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -191,13 +191,14 @@ class Language(object): @property def meta(self): + spacy_version = util.get_model_version_range(about.__version__) if self.vocab.lang: self._meta.setdefault("lang", self.vocab.lang) else: self._meta.setdefault("lang", self.lang) self._meta.setdefault("name", "model") self._meta.setdefault("version", "0.0.0") - self._meta.setdefault("spacy_version", about.__version__) + self._meta.setdefault("spacy_version", spacy_version) self._meta.setdefault("description", "") self._meta.setdefault("author", "") self._meta.setdefault("email", "") diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 9e67ae83b..9aa95c431 100644 --- 
a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -95,7 +95,15 @@ def test_ascii_filenames(): @pytest.mark.parametrize( "version,compatible", - [(spacy_version, True), ("2.0.0", False), (">=1.2.3,<4.5.6", False)], + [ + (spacy_version, True), + (f">={spacy_version}", True), + ("2.0.0", False), + (">=2.0.0", True), + (">=1.0.0,<2.1.1", False), + (">=1.2.3,<4.5.6", True), + ("n/a", None), + ], ) def test_is_compatible_model(version, compatible): - assert util.is_compatible_model({"spacy_version": version}) is compatible + assert util.is_compatible_model(version) is compatible diff --git a/spacy/util.py b/spacy/util.py index 4e468ef9d..835e46fc6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -14,6 +14,8 @@ import srsly import catalogue import sys import warnings +from packaging.specifiers import SpecifierSet, InvalidSpecifier +from packaging.version import Version, InvalidVersion try: @@ -236,42 +238,31 @@ def get_package_version(name): return None -def split_version(version): - """RETURNS (tuple): Two integers, the major and minor spaCy version.""" - pieces = version.split(".", 3) - return int(pieces[0]), int(pieces[1]) - - -def is_compatible_model(meta): - """Check if a model is compatible with the current version of spaCy, based - on its meta.json. We compare the version of spaCy the model was created with - with the current version. If the minor version is different, it's considered - incompatible. - - meta (dict): The model's meta. - RETURNS (bool / None): Whether the model is compatible with the current - spaCy or None if we don't have enough info. 
- """ - cur_v = about.__version__ - pkg_v = meta.get("spacy_version") - if not pkg_v or not isinstance(pkg_v, str): +def is_compatible_model(constraint): + version = Version(about.__version__) + if constraint[0].isdigit(): + # Handle cases where exact version is provided as constraint + constraint = f"=={constraint}" + try: + spec = SpecifierSet(constraint) + except InvalidSpecifier: return None - # Handle spacy_version values like >=x,=1.2.3,<1.3.0 based on a given spaCy version. Models are always compatible across patch versions but not across minor or major versions. """ - major, minor = split_version(version) - return f">={version},<{major}.{minor + 1}.0" + release = Version(spacy_version).release + return f">={spacy_version},<{release[0]}.{release[1] + 1}.0" + + +def get_base_version(version): + return Version(version).base_version def load_config(path, create_objects=False): From a7e370bcbfd4234b53061a004c0b588e3ec76c06 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:03:18 +0200 Subject: [PATCH 148/187] Don't override spaCy version --- spacy/cli/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c205fa5b2..590ce4f13 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -467,7 +467,6 @@ def train( # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names - meta["spacy_version"] = about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, From b7aff6020c34ecae3bb0891b469193d8772b8197 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:18:53 +0200 Subject: [PATCH 149/187] Make functions more general purpose and update docstrings and tests --- spacy/cli/validate.py | 4 ++-- spacy/tests/test_misc.py | 22 ++++++++++++---------- spacy/util.py | 27 +++++++++++++++++++++------ 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 3c49abb3e..080cd77e2 100644 --- 
a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -5,7 +5,7 @@ from wasabi import msg from .. import about from ..util import get_package_version, get_installed_models, get_base_version -from ..util import get_package_path, get_model_meta, is_compatible_model +from ..util import get_package_path, get_model_meta, is_compatible_version def validate(): @@ -83,7 +83,7 @@ def get_model_pkgs(): model_path = get_package_path(package) model_meta = get_model_meta(model_path) spacy_version = model_meta.get("spacy_version", "n/a") - is_compat = is_compatible_model(spacy_version) + is_compat = is_compatible_version(about.__version__, spacy_version) pkgs[pkg_name] = { "name": package, "version": version, diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 9aa95c431..e4b4e570c 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -94,16 +94,18 @@ def test_ascii_filenames(): @pytest.mark.parametrize( - "version,compatible", + "version,constraint,compatible", [ - (spacy_version, True), - (f">={spacy_version}", True), - ("2.0.0", False), - (">=2.0.0", True), - (">=1.0.0,<2.1.1", False), - (">=1.2.3,<4.5.6", True), - ("n/a", None), + (spacy_version, spacy_version, True), + (spacy_version, f">={spacy_version}", True), + ("3.0.0", "2.0.0", False), + ("3.2.1", ">=2.0.0", True), + ("2.2.10a1", ">=1.0.0,<2.1.1", False), + ("3.0.0.dev3", ">=1.2.3,<4.5.6", True), + ("n/a", ">=1.2.3,<4.5.6", None), + ("1.2.3", "n/a", None), + ("n/a", "n/a", None), ], ) -def test_is_compatible_model(version, compatible): - assert util.is_compatible_model(version) is compatible +def test_is_compatible_version(version, constraint, compatible): + assert util.is_compatible_version(version, constraint) is compatible diff --git a/spacy/util.py b/spacy/util.py index 835e46fc6..741b289c1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -238,17 +238,27 @@ def get_package_version(name): return None -def is_compatible_model(constraint): - version = Version(about.__version__) 
+def is_compatible_version(version, constraint, prereleases=True): + """Check if a version (e.g. "2.0.0") is compatible given a version + constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version, + it's interpreted as =={version}. + + version (str): The version to check. + constraint (str): The constraint string. + prereleases (bool): Whether to allow prereleases. If set to False, + prerelease versions will be considered incompatible. + RETURNS (bool / None): Whether the version is compatible, or None if the + version or constraint are invalid. + """ + # Handle cases where exact version is provided as constraint if constraint[0].isdigit(): - # Handle cases where exact version is provided as constraint constraint = f"=={constraint}" try: spec = SpecifierSet(constraint) - except InvalidSpecifier: + version = Version(version) + except (InvalidSpecifier, InvalidVersion): return None - # Allow prereleases and dev versions - spec.prereleases = True + spec.prereleases = prereleases return version in spec @@ -262,6 +272,11 @@ def get_model_version_range(spacy_version): def get_base_version(version): + """Generate the base version without any prerelease identifiers. + + version (str): The version, e.g. "3.0.0.dev1". + RETURNS (str): The base version, e.g. "3.0.0". 
+ """ return Version(version).base_version From 368182776e61f6582223c02cf31b5eee65521d20 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:19:53 +0200 Subject: [PATCH 150/187] Tidy up dependencies --- setup.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index eb7608c4e..c5c39b447 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = wasabi>=0.4.0,<1.1.0 srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 - ml_datasets + ml_datasets>=0.1.1 # Third-party dependencies tqdm>=4.38.0,<5.0.0 setuptools @@ -55,7 +55,6 @@ install_requires = plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 - tqdm>=4.38.0,<5.0.0 importlib_metadata>=0.20; python_version < "3.8" [options.extras_require] From dc186afdc5b7f42dd32eeafb239b3d5604b8fbbd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 30 May 2020 15:34:54 +0200 Subject: [PATCH 151/187] Add warning --- spacy/errors.py | 6 ++++++ spacy/util.py | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 932bb1eff..da2cfdf04 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -104,6 +104,12 @@ class Warnings(object): "string \"Field1=Value1,Value2|Field2=Value3\".") # TODO: fix numbering after merging develop into master + W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is " + "incompatible with the current version ({current}). This may lead " + "to unexpected results or runtime errors. To resolve this, " + "download a newer compatible model or retrain your custom model " + "with the current spaCy version. 
For more details and available " + "updates, run: python -m spacy validate") W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' " "instead.") W097 = ("No Model config was provided to create the '{name}' component, " diff --git a/spacy/util.py b/spacy/util.py index 741b289c1..79134400c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -330,6 +330,16 @@ def get_model_meta(path): for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: raise ValueError(Errors.E054.format(setting=setting)) + if "spacy_version" in meta: + if not is_compatible_version(about.__version__, meta["spacy_version"]): + warnings.warn( + Warnings.W095.format( + model=f"{meta['lang']}_{meta['name']}", + model_version=meta["version"], + version=meta["spacy_version"], + current=about.__version__, + ) + ) return meta From cd5f748e0982524167e55884a7b1677a63b5b308 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Sat, 30 May 2020 20:27:47 +0200 Subject: [PATCH 152/187] Add onto-joint experiment file --- examples/experiments/onto-joint/defaults.cfg | 115 +++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 examples/experiments/onto-joint/defaults.cfg diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg new file mode 100644 index 000000000..fbac4ea7d --- /dev/null +++ b/examples/experiments/onto-joint/defaults.cfg @@ -0,0 +1,115 @@ +# Training hyper-parameters and additional features. +[training] +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length or number of examples. +max_length = 0 +limit = 0 +# Data augmentation +orth_variant_level = 0.0 +dropout = 0.1 +# Controls early-stopping. 0 or -1 mean unlimited. 
+patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 400 +# Other settings +seed = 0 +accumulate_gradient = 1 +use_pytorch_for_gpu_memory = false +# Control how scores are printed and checkpoints are evaluated. +scores = ["speed", "tags_acc", "uas", "las", "ents_f"] +score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} +# These settings are invalid for the transformer models. +init_tok2vec = null +vectors = null + +[training.batch_size] +@schedules = "compounding.v1" +start = 1000 +stop = 1000 +compound = 1.001 + +[optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +#[optimizer.learn_rate] +#@schedules = "warmup_linear.v1" +#warmup_steps = 250 +#total_steps = 20000 +#initial_rate = 0.001 + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.senter] +factory = "senter" + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.parser] +factory = "parser" + +[nlp.pipeline.senter.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.senter.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tagger.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.parser.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 8 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.parser.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.ner.model.tok2vec] 
+@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = ${nlp:vectors} +width = 256 +depth = 6 +window_size = 1 +embed_size = 10000 +maxout_pieces = 3 +subword_features = true From e0f9f448f1305e382c5e7042d8bbac882fea9644 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 1 Jun 2020 23:38:48 +0200 Subject: [PATCH 153/187] remove Tensorizer --- examples/training/pretrain_textcat.py | 212 ------------------ spacy/language.py | 4 - spacy/ml/models/__init__.py | 1 - spacy/ml/models/tensorizer.py | 10 - spacy/pipeline/__init__.py | 3 +- spacy/pipeline/defaults/__init__.py | 10 - .../pipeline/defaults/tensorizer_defaults.cfg | 4 - spacy/pipeline/hooks.py | 6 +- spacy/pipeline/pipes.pyx | 136 +---------- .../serialize/test_serialize_pipeline.py | 22 +- 10 files changed, 8 insertions(+), 400 deletions(-) delete mode 100644 examples/training/pretrain_textcat.py delete mode 100644 spacy/ml/models/tensorizer.py delete mode 100644 spacy/pipeline/defaults/tensorizer_defaults.cfg diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py deleted file mode 100644 index 5c41c0e92..000000000 --- a/examples/training/pretrain_textcat.py +++ /dev/null @@ -1,212 +0,0 @@ -"""This script is experimental. - -Try pre-training the CNN component of the text categorizer using a cheap -language modelling-like objective. Specifically, we load pretrained vectors -(from something like word2vec, GloVe, FastText etc), and use the CNN to -predict the tokens' pretrained vectors. This isn't as easy as it sounds: -we're not merely doing compression here, because heavy dropout is applied, -including over the input words. This means the model must often (50% of the time) -use the context in order to predict the word. - -To evaluate the technique, we're pre-training with the 50k texts from the IMDB -corpus, and then training with only 100 labels. 
Note that it's a bit dirty to -pre-train with the development data, but also not *so* terrible: we're not using -the development labels, after all --- only the unlabelled text. -""" -import plac -import tqdm -import random - -import ml_datasets - -import spacy -from spacy.util import minibatch -from spacy.pipeline import TextCategorizer -from spacy.ml.models.tok2vec import build_Tok2Vec_model -import numpy - - -def load_texts(limit=0): - train, dev = ml_datasets.imdb() - train_texts, train_labels = zip(*train) - dev_texts, dev_labels = zip(*train) - train_texts = list(train_texts) - dev_texts = list(dev_texts) - random.shuffle(train_texts) - random.shuffle(dev_texts) - if limit >= 1: - return train_texts[:limit] - else: - return list(train_texts) + list(dev_texts) - - -def load_textcat_data(limit=0): - """Load data from the IMDB dataset.""" - # Partition off part of the train data for evaluation - train_data, eval_data = ml_datasets.imdb() - random.shuffle(train_data) - train_data = train_data[-limit:] - texts, labels = zip(*train_data) - eval_texts, eval_labels = zip(*eval_data) - cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels] - eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels] - return (texts, cats), (eval_texts, eval_cats) - - -def prefer_gpu(): - used = spacy.util.use_gpu(0) - if used is None: - return False - else: - import cupy.random - - cupy.random.seed(0) - return True - - -def build_textcat_model(tok2vec, nr_class, width): - from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged - - with Model.define_operators({">>": chain}): - model = ( - tok2vec - >> list2ragged() - >> reduce_mean() - >> Softmax(nr_class, width) - ) - model.set_ref("tok2vec", tok2vec) - return model - - -def block_gradients(model): - from thinc.api import wrap # TODO FIX - - def forward(X, drop=0.0): - Y, _ = model.begin_update(X, drop=drop) - return Y, None - - return wrap(forward, model) - - -def 
create_pipeline(width, embed_size, vectors_model): - print("Load vectors") - nlp = spacy.load(vectors_model) - print("Start training") - textcat = TextCategorizer( - nlp.vocab, - labels=["POSITIVE", "NEGATIVE"], - # TODO: replace with config version - model=build_textcat_model( - build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width - ), - ) - - nlp.add_pipe(textcat) - return nlp - - -def train_tensorizer(nlp, texts, dropout, n_iter): - tensorizer = nlp.create_pipe("tensorizer") - nlp.add_pipe(tensorizer) - optimizer = nlp.begin_training() - for i in range(n_iter): - losses = {} - for i, batch in enumerate(minibatch(tqdm.tqdm(texts))): - docs = [nlp.make_doc(text) for text in batch] - tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout) - print(losses) - return optimizer - - -def train_textcat(nlp, n_texts, n_iter=10): - textcat = nlp.get_pipe("textcat") - tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes() - (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) - print( - "Using {} examples ({} training, {} evaluation)".format( - n_texts, len(train_texts), len(dev_texts) - ) - ) - train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats])) - - with nlp.select_pipes(enable="textcat"): # only train textcat - optimizer = nlp.begin_training() - textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights) - print("Training the model...") - print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) - for i in range(n_iter): - losses = {"textcat": 0.0} - # batch up the examples using spaCy's minibatch - batches = minibatch(tqdm.tqdm(train_data), size=2) - for batch in batches: - nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses) - with textcat.model.use_params(optimizer.averages): - # evaluate on the dev data split off in load_data() - scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats) - print( - "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # 
print a simple table - losses["textcat"], - scores["textcat_p"], - scores["textcat_r"], - scores["textcat_f"], - ) - ) - - -def evaluate_textcat(tokenizer, textcat, texts, cats): - docs = (tokenizer(text) for text in texts) - tp = 1e-8 - fp = 1e-8 - tn = 1e-8 - fn = 1e-8 - for i, doc in enumerate(textcat.pipe(docs)): - gold = cats[i] - for label, score in doc.cats.items(): - if label not in gold: - continue - if score >= 0.5 and gold[label] >= 0.5: - tp += 1.0 - elif score >= 0.5 and gold[label] < 0.5: - fp += 1.0 - elif score < 0.5 and gold[label] < 0.5: - tn += 1 - elif score < 0.5 and gold[label] >= 0.5: - fn += 1 - precision = tp / (tp + fp) - recall = tp / (tp + fn) - f_score = 2 * (precision * recall) / (precision + recall) - return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score} - - -@plac.annotations( - width=("Width of CNN layers", "positional", None, int), - embed_size=("Embedding rows", "positional", None, int), - pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), - train_iters=("Number of iterations to pretrain", "option", "tn", int), - train_examples=("Number of labelled examples", "option", "eg", int), - vectors_model=("Name or path to vectors model to learn from"), -) -def main( - width, - embed_size, - vectors_model, - pretrain_iters=30, - train_iters=30, - train_examples=1000, -): - random.seed(0) - numpy.random.seed(0) - use_gpu = prefer_gpu() - print("Using GPU?", use_gpu) - - nlp = create_pipeline(width, embed_size, vectors_model) - print("Load data") - texts = load_texts(limit=0) - print("Train tensorizer") - optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters) - print("Train textcat") - train_textcat(nlp, train_examples, n_iter=train_iters) - - -if __name__ == "__main__": - plac.call(main) diff --git a/spacy/language.py b/spacy/language.py index 61d69b63e..22360c65f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -225,10 +225,6 @@ class Language(object): # 
Conveniences to access pipeline components # Shouldn't be used anymore! - @property - def tensorizer(self): - return self.get_pipe("tensorizer") - @property def tagger(self): return self.get_pipe("tagger") diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index ef1e8efca..40cde2437 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -2,6 +2,5 @@ from .entity_linker import * # noqa from .parser import * # noqa from .simple_ner import * from .tagger import * # noqa -from .tensorizer import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/tensorizer.py b/spacy/ml/models/tensorizer.py deleted file mode 100644 index f66610b64..000000000 --- a/spacy/ml/models/tensorizer.py +++ /dev/null @@ -1,10 +0,0 @@ -from thinc.api import Linear, zero_init - -from ... import util -from ...util import registry - - -@registry.architectures.register("spacy.Tensorizer.v1") -def build_tensorizer(input_size, output_size): - input_size = util.env_opt("token_vector_width", input_size) - return Linear(output_size, input_size, init_W=zero_init) diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index b2866bad2..116a08e92 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,5 +1,5 @@ from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker -from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer +from .pipes import TextCategorizer, Pipe, Sentencizer from .pipes import SentenceRecognizer from .simple_ner import SimpleNER from .morphologizer import Morphologizer @@ -14,7 +14,6 @@ __all__ = [ "EntityRecognizer", "EntityLinker", "TextCategorizer", - "Tensorizer", "Tok2Vec", "Pipe", "Morphologizer", diff --git a/spacy/pipeline/defaults/__init__.py b/spacy/pipeline/defaults/__init__.py index e17e2d3b4..483c6bbd6 100644 --- a/spacy/pipeline/defaults/__init__.py +++ b/spacy/pipeline/defaults/__init__.py @@ -63,16 +63,6 @@ def 
default_tagger(): return util.load_config(loc, create_objects=True)["model"] -def default_tensorizer_config(): - loc = Path(__file__).parent / "tensorizer_defaults.cfg" - return util.load_config(loc, create_objects=False) - - -def default_tensorizer(): - loc = Path(__file__).parent / "tensorizer_defaults.cfg" - return util.load_config(loc, create_objects=True)["model"] - - def default_textcat_config(): loc = Path(__file__).parent / "textcat_defaults.cfg" return util.load_config(loc, create_objects=False) diff --git a/spacy/pipeline/defaults/tensorizer_defaults.cfg b/spacy/pipeline/defaults/tensorizer_defaults.cfg deleted file mode 100644 index 81880a109..000000000 --- a/spacy/pipeline/defaults/tensorizer_defaults.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[model] -@architectures = "spacy.Tensorizer.v1" -input_size=96 -output_size=300 diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index 351323ae9..a97e7be68 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -44,8 +44,8 @@ class SentenceSegmenter(object): class SimilarityHook(Pipe): """ Experimental: A pipeline component to install a hook for supervised - similarity into `Doc` objects. Requires a `Tensorizer` to pre-process - documents. The similarity model can be any object obeying the Thinc `Model` + similarity into `Doc` objects. + The similarity model can be any object obeying the Thinc `Model` interface. By default, the model concatenates the elementwise mean and elementwise max of the two tensors, and compares them using the Cauchy-like similarity function from Chen (2013): @@ -82,7 +82,7 @@ class SimilarityHook(Pipe): sims, bp_sims = self.model.begin_update(doc1_doc2) def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): - """Allocate model, using width from tensorizer in pipeline. + """Allocate model, using nO from the first model in the pipeline. gold_tuples (iterable): Gold-standard training data. pipeline (list): The pipeline the model is part of. 
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f75ed1659..cfe01981e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -16,7 +16,7 @@ from ..morphology cimport Morphology from ..vocab cimport Vocab from .defaults import default_tagger, default_parser, default_ner, default_textcat -from .defaults import default_nel, default_senter, default_tensorizer +from .defaults import default_nel, default_senter from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj @@ -238,138 +238,6 @@ class Pipe(object): return self -@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer) -class Tensorizer(Pipe): - """Pre-train position-sensitive vectors for tokens.""" - - def __init__(self, vocab, model, **cfg): - """Construct a new statistical model. Weights are not allocated on - initialisation. - - vocab (Vocab): A `Vocab` instance. The model must share the same - `Vocab` instance with the `Doc` objects it will process. - **cfg: Config parameters. - """ - self.vocab = vocab - self.model = model - self.input_models = [] - self.cfg = dict(cfg) - - def __call__(self, example): - """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM - model. Vectors are set to the `Doc.tensor` attribute. - - docs (Doc or iterable): One or more documents to add vectors to. - RETURNS (dict or None): Intermediate computations. - """ - doc = self._get_doc(example) - tokvecses = self.predict([doc]) - self.set_annotations([doc], tokvecses) - if isinstance(example, Example): - example.doc = doc - return example - return doc - - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - """Process `Doc` objects as a stream. - - stream (iterator): A sequence of `Doc` or `Example` objects to process. - batch_size (int): Number of `Doc` or `Example` objects to group. - YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input. 
- """ - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] - tensors = self.predict(docs) - self.set_annotations(docs, tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs - - def predict(self, docs): - """Return a single tensor for a batch of documents. - - docs (iterable): A sequence of `Doc` objects. - RETURNS (object): Vector representations for each token in the docs. - """ - inputs = self.model.ops.flatten([doc.tensor for doc in docs]) - outputs = self.model(inputs) - return self.model.ops.unflatten(outputs, [len(d) for d in docs]) - - def set_annotations(self, docs, tensors): - """Set the tensor attribute for a batch of documents. - - docs (iterable): A sequence of `Doc` objects. - tensors (object): Vector representation for each token in the docs. - """ - for doc, tensor in zip(docs, tensors): - if tensor.shape[0] != len(doc): - raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) - doc.tensor = tensor - - def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None): - """Update the model. - - docs (iterable): A batch of `Doc` objects. - golds (iterable): A batch of `GoldParse` objects. - drop (float): The dropout rate. - sgd (callable): An optimizer. - RETURNS (dict): Results from the update. 
- """ - examples = Example.to_example_objects(examples) - inputs = [] - bp_inputs = [] - set_dropout_rate(self.model, drop) - for tok2vec in self.input_models: - set_dropout_rate(tok2vec, drop) - tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples]) - inputs.append(tensor) - bp_inputs.append(bp_tensor) - inputs = self.model.ops.xp.hstack(inputs) - scores, bp_scores = self.model.begin_update(inputs) - loss, d_scores = self.get_loss(examples, scores) - d_inputs = bp_scores(d_scores, sgd=sgd) - d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1) - for d_input, bp_input in zip(d_inputs, bp_inputs): - bp_input(d_input) - if sgd is not None: - for tok2vec in self.input_models: - tok2vec.finish_update(sgd) - self.model.finish_update(sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += loss - return loss - - def get_loss(self, examples, prediction): - examples = Example.to_example_objects(examples) - ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) - target = self.vocab.vectors.data[ids] - d_scores = (prediction - target) / prediction.shape[0] - loss = (d_scores ** 2).sum() - return loss, d_scores - - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): - """Allocate models, pre-process training data and acquire an - optimizer. - - get_examples (iterable): Gold-standard training data. - pipeline (list): The pipeline the model is part of. - """ - if pipeline is not None: - for name, model in pipeline: - if model.has_ref("tok2vec"): - self.input_models.append(model.get_ref("tok2vec")) - self.model.initialize() - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - @component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. 
@@ -1707,4 +1575,4 @@ def ner_factory(nlp, model, **cfg): warnings.warn(Warnings.W098.format(name="ner")) return EntityRecognizer.from_nlp(nlp, model, **cfg) -__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] +__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 4fc277c4f..595a35a9f 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,7 +1,7 @@ import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer -from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer -from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger +from spacy.pipeline import TextCategorizer, SentenceRecognizer +from spacy.pipeline.defaults import default_parser, default_tagger from spacy.pipeline.defaults import default_textcat, default_senter from ..util import make_tempdir @@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): assert tagger1_d.to_bytes() == tagger2_d.to_bytes() -def test_serialize_tensorizer_roundtrip_bytes(en_vocab): - tensorizer = Tensorizer(en_vocab, default_tensorizer()) - tensorizer_b = tensorizer.to_bytes(exclude=["vocab"]) - new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b) - assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b - - -def test_serialize_tensorizer_roundtrip_disk(en_vocab): - tensorizer = Tensorizer(en_vocab, default_tensorizer()) - with make_tempdir() as d: - file_path = d / "tensorizer" - tensorizer.to_disk(file_path) - tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path) - assert tensorizer.to_bytes(exclude=["vocab"]) == 
tensorizer_d.to_bytes( - exclude=["vocab"] - ) - - def test_serialize_textcat_empty(en_vocab): # See issue #1105 textcat = TextCategorizer( From ec52e7f886ad3839bb509c38707a8ae4e955b7d4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 13:21:55 +0200 Subject: [PATCH 154/187] add oversize examples before StopIteration returns --- spacy/util.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index 79134400c..54ecb6edd 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -681,6 +681,9 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0 try: example = next(examples) except StopIteration: + if oversize: + example = oversize.pop(0) + batch.append(example) if batch: yield batch return From fdfd82293688678b1590d680f758c32da3c83d73 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 15:22:54 +0200 Subject: [PATCH 155/187] rewrite minibatch_by_words function --- spacy/util.py | 60 ++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 54ecb6edd..0f8de3ddf 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -656,45 +656,47 @@ def decaying(start, stop, decay): curr -= decay -def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2): +def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False): """Create minibatches of roughly a given number of words. 
If any examples are longer than the specified batch length, they will appear in a batch by - themselves.""" + themselves, or be discarded if discard_oversize=True.""" if isinstance(size, int): size_ = itertools.repeat(size) elif isinstance(size, List): size_ = iter(size) else: size_ = size - examples = iter(examples) - oversize = [] - while True: - batch_size = next(size_) - tol_size = batch_size * 0.2 - batch = [] - if oversize: - example = oversize.pop(0) - n_words = count_words(example.doc) + + target_size = next(size_) + tol_size = target_size * tolerance + batch = [] + current_size = 0 + + for example in examples: + n_words = count_words(example.doc) + # add the example to the current batch if it still fits + if (current_size + n_words) < (target_size + tol_size): batch.append(example) - batch_size -= n_words - while batch_size >= 1: - try: - example = next(examples) - except StopIteration: - if oversize: - example = oversize.pop(0) - batch.append(example) - if batch: - yield batch - return - n_words = count_words(example.doc) - if n_words < (batch_size + tol_size): - batch_size -= n_words - batch.append(example) + current_size += n_words + else: + # if the current example exceeds the batch size, it is returned separately + # but only if discard_oversize=False. + if current_size > target_size: + if not discard_oversize: + yield [example] + # yield the previous batch and start a new one else: - oversize.append(example) - if batch: - yield batch + yield batch + target_size = next(size_) + tol_size = target_size * tolerance + # In theory it may happen that the current example now exceeds the new target_size, + # but that seems like an unimportant edge case if batch sizes are variable anyway? 
+ batch = [example] + current_size = n_words + + # yield the final batch + if batch: + yield batch def itershuffle(iterable, bufsize=1000): From 5b350a6c9998ccb53439f2721159ab92ca61003f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 17:49:33 +0200 Subject: [PATCH 156/187] bugfix of the bugfix --- spacy/util.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 0f8de3ddf..f5ca49637 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -674,25 +674,26 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o for example in examples: n_words = count_words(example.doc) + # if the current example exceeds the batch size, it is returned separately + # but only if discard_oversize=False. + if n_words > target_size: + if not discard_oversize: + yield [example] + # add the example to the current batch if it still fits - if (current_size + n_words) < (target_size + tol_size): + elif (current_size + n_words) < (target_size + tol_size): batch.append(example) current_size += n_words + + # yield the previous batch and start a new one else: - # if the current example exceeds the batch size, it is returned separately - # but only if discard_oversize=False. - if current_size > target_size: - if not discard_oversize: - yield [example] - # yield the previous batch and start a new one - else: - yield batch - target_size = next(size_) - tol_size = target_size * tolerance - # In theory it may happen that the current example now exceeds the new target_size, - # but that seems like an unimportant edge case if batch sizes are variable anyway? - batch = [example] - current_size = n_words + yield batch + target_size = next(size_) + tol_size = target_size * tolerance + # In theory it may happen that the current example now exceeds the new target_size, + # but that seems like an unimportant edge case if batch sizes are variable anyway? 
+ batch = [example] + current_size = n_words # yield the final batch if batch: From 85b0597ed5f8e23de337f56966e4b342827a99c3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 18:26:21 +0200 Subject: [PATCH 157/187] add test for minibatch util --- spacy/tests/test_util.py | 23 +++++++++++++++++++++++ spacy/tests/util.py | 7 +++++++ 2 files changed, 30 insertions(+) create mode 100644 spacy/tests/test_util.py diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py new file mode 100644 index 000000000..382a8f548 --- /dev/null +++ b/spacy/tests/test_util.py @@ -0,0 +1,23 @@ +import pytest +from spacy.gold import Example + +from .util import get_doc + +from spacy.util import minibatch_by_words + + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 400, 199], [3]), + ([400, 400, 199, 3], [4]), + ([400, 400, 199, 3, 250], [3, 2]), + ], +) +def test_util_minibatch(doc_sizes, expected_batches): + docs = [get_doc(doc_size) for doc_size in doc_sizes] + + examples = [Example(doc=doc) for doc in docs] + + batches = list(minibatch_by_words(examples=examples, size=1000)) + assert [len(batch) for batch in batches] == expected_batches diff --git a/spacy/tests/util.py b/spacy/tests/util.py index e29342268..73650a6f7 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -92,6 +92,13 @@ def get_batch(batch_size): return docs +def get_doc(n_words): + vocab = Vocab() + # Make the words numbers, so that they're easy to track. 
+ numbers = [str(i) for i in range(0, n_words)] + return Doc(vocab, words=numbers) + + def apply_transition_sequence(parser, doc, sequence): """Perform a series of pre-specified transitions, to put the parser in a desired state.""" From 6651fafd5cad7edf34dfb1374c962dff6ce901e9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 19:43:39 +0200 Subject: [PATCH 158/187] using overflow buffer for examples within the tolerance margin --- spacy/tests/test_util.py | 4 ++-- spacy/util.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 382a8f548..93201eb4b 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -11,13 +11,13 @@ from spacy.util import minibatch_by_words [ ([400, 400, 199], [3]), ([400, 400, 199, 3], [4]), + ([400, 400, 199, 3, 1], [5]), ([400, 400, 199, 3, 250], [3, 2]), + ([400, 400, 199, 3, 1, 250], [3, 3]), ], ) def test_util_minibatch(doc_sizes, expected_batches): docs = [get_doc(doc_size) for doc_size in doc_sizes] - examples = [Example(doc=doc) for doc in docs] - batches = list(minibatch_by_words(examples=examples, size=1000)) assert [len(batch) for batch in batches] == expected_batches diff --git a/spacy/util.py b/spacy/util.py index f5ca49637..8ac2fd370 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -670,7 +670,9 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o target_size = next(size_) tol_size = target_size * tolerance batch = [] + overflow = [] current_size = 0 + overflow_size = 0 for example in examples: n_words = count_words(example.doc) @@ -681,10 +683,15 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o yield [example] # add the example to the current batch if it still fits - elif (current_size + n_words) < (target_size + tol_size): + elif (current_size + n_words) < target_size: batch.append(example) current_size += n_words + # add the example to the 
overflow buffer if it fits in the tolerance margins + elif (current_size + n_words) < (target_size + tol_size): + overflow.append(example) + overflow_size += n_words + # yield the previous batch and start a new one else: yield batch @@ -692,11 +699,15 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o tol_size = target_size * tolerance # In theory it may happen that the current example now exceeds the new target_size, # but that seems like an unimportant edge case if batch sizes are variable anyway? - batch = [example] - current_size = n_words + batch = overflow + batch.append(example) + current_size = overflow_size + n_words + overflow = [] + overflow_size = 0 # yield the final batch if batch: + batch.extend(overflow) yield batch From 6208d322d383455ea91c1e30b2c834a08e2cbbf0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 19:47:30 +0200 Subject: [PATCH 159/187] slightly more challenging unit test --- spacy/tests/test_util.py | 4 ++-- spacy/util.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 93201eb4b..a0c6ab6c0 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -12,8 +12,8 @@ from spacy.util import minibatch_by_words ([400, 400, 199], [3]), ([400, 400, 199, 3], [4]), ([400, 400, 199, 3, 1], [5]), - ([400, 400, 199, 3, 250], [3, 2]), - ([400, 400, 199, 3, 1, 250], [3, 3]), + ([400, 400, 199, 3, 200], [3, 2]), + ([400, 400, 199, 3, 1, 200], [3, 3]), ], ) def test_util_minibatch(doc_sizes, expected_batches): diff --git a/spacy/util.py b/spacy/util.py index 8ac2fd370..b4e6f7fb1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -682,13 +682,13 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o if not discard_oversize: yield [example] - # add the example to the current batch if it still fits - elif (current_size + n_words) < target_size: + # add the example to the current batch if it 
still fits and there's no overflow yet + elif overflow_size == 0 and (current_size + n_words) < target_size: batch.append(example) current_size += n_words # add the example to the overflow buffer if it fits in the tolerance margins - elif (current_size + n_words) < (target_size + tol_size): + elif (current_size + overflow_size + n_words) < (target_size + tol_size): overflow.append(example) overflow_size += n_words From ef834b4cd7f51d24b2df451b091caaf21586d199 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 19:50:44 +0200 Subject: [PATCH 160/187] fix comments --- spacy/util.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index b4e6f7fb1..3f7a96a19 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -682,23 +682,23 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o if not discard_oversize: yield [example] - # add the example to the current batch if it still fits and there's no overflow yet + # add the example to the current batch if there's no overflow yet and it still fits elif overflow_size == 0 and (current_size + n_words) < target_size: batch.append(example) current_size += n_words - # add the example to the overflow buffer if it fits in the tolerance margins + # add the example to the overflow buffer if it fits in the tolerance margin elif (current_size + overflow_size + n_words) < (target_size + tol_size): overflow.append(example) overflow_size += n_words - # yield the previous batch and start a new one + # yield the previous batch and start a new one. The new one gets the overflow examples. else: yield batch target_size = next(size_) tol_size = target_size * tolerance - # In theory it may happen that the current example now exceeds the new target_size, - # but that seems like an unimportant edge case if batch sizes are variable anyway? 
+ # In theory it may happen that the current example + overflow examples now exceed the new + # target_size, but that seems like an unimportant edge case if batch sizes are variable? batch = overflow batch.append(example) current_size = overflow_size + n_words From f2e162fc60dab95e16efbb7310e4745689cb886c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 19:59:04 +0200 Subject: [PATCH 161/187] it's only oversized if the tolerance level is also exceeded --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 3f7a96a19..598545b84 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -678,7 +678,7 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o n_words = count_words(example.doc) # if the current example exceeds the batch size, it is returned separately # but only if discard_oversize=False. - if n_words > target_size: + if n_words > target_size + tol_size: if not discard_oversize: yield [example] From aa6271b16ca653d24010a5bf325fcc36ac757361 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 22:05:08 +0200 Subject: [PATCH 162/187] extending algorithm to deal better with edge cases --- spacy/tests/test_util.py | 20 ++++++++++++++++++-- spacy/util.py | 33 ++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index a0c6ab6c0..207805c81 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -11,13 +11,29 @@ from spacy.util import minibatch_by_words [ ([400, 400, 199], [3]), ([400, 400, 199, 3], [4]), - ([400, 400, 199, 3, 1], [5]), ([400, 400, 199, 3, 200], [3, 2]), + + ([400, 400, 199, 3, 1], [5]), + ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded ([400, 400, 199, 3, 1, 200], [3, 3]), + ([400, 400, 199, 3, 1, 999], [3, 3]), + ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), + + ([1, 2, 999], [3]), + ([1, 2, 999, 1], [4]), + ([1, 
200, 999, 1], [2, 2]), + ([1, 999, 200, 1], [2, 2]), ], ) def test_util_minibatch(doc_sizes, expected_batches): docs = [get_doc(doc_size) for doc_size in doc_sizes] examples = [Example(doc=doc) for doc in docs] - batches = list(minibatch_by_words(examples=examples, size=1000)) + tol = 0.2 + batch_size = 1000 + batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True)) assert [len(batch) for batch in batches] == expected_batches + + max_size = batch_size + batch_size * tol + for batch in batches: + assert sum([len(example.doc) for example in batch]) < max_size + diff --git a/spacy/util.py b/spacy/util.py index 598545b84..2d732e2b7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -671,24 +671,24 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o tol_size = target_size * tolerance batch = [] overflow = [] - current_size = 0 + batch_size = 0 overflow_size = 0 for example in examples: n_words = count_words(example.doc) - # if the current example exceeds the batch size, it is returned separately + # if the current example exceeds the maximum batch size, it is returned separately # but only if discard_oversize=False. 
if n_words > target_size + tol_size: if not discard_oversize: yield [example] # add the example to the current batch if there's no overflow yet and it still fits - elif overflow_size == 0 and (current_size + n_words) < target_size: + elif overflow_size == 0 and (batch_size + n_words) <= target_size: batch.append(example) - current_size += n_words + batch_size += n_words # add the example to the overflow buffer if it fits in the tolerance margin - elif (current_size + overflow_size + n_words) < (target_size + tol_size): + elif (batch_size + overflow_size + n_words) <= (target_size + tol_size): overflow.append(example) overflow_size += n_words @@ -697,14 +697,29 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o yield batch target_size = next(size_) tol_size = target_size * tolerance - # In theory it may happen that the current example + overflow examples now exceed the new - # target_size, but that seems like an unimportant edge case if batch sizes are variable? 
batch = overflow - batch.append(example) - current_size = overflow_size + n_words + batch_size = overflow_size overflow = [] overflow_size = 0 + # this example still fits + if (batch_size + n_words) <= target_size: + batch.append(example) + batch_size += n_words + + # this example fits in overflow + elif (batch_size + n_words) <= (target_size + tol_size): + overflow.append(example) + overflow_size += n_words + + # this example does not fit with the previous overflow: start another new batch + else: + yield batch + target_size = next(size_) + tol_size = target_size * tolerance + batch = [example] + batch_size = n_words + # yield the final batch if batch: batch.extend(overflow) From 2bf5111ecf369a2e5b807067823aadcdc635bc70 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 22:09:37 +0200 Subject: [PATCH 163/187] additional test with discard_oversize=False --- spacy/tests/test_util.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 207805c81..6b6e84a17 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -12,13 +12,11 @@ from spacy.util import minibatch_by_words ([400, 400, 199], [3]), ([400, 400, 199, 3], [4]), ([400, 400, 199, 3, 200], [3, 2]), - ([400, 400, 199, 3, 1], [5]), ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded ([400, 400, 199, 3, 1, 200], [3, 3]), ([400, 400, 199, 3, 1, 999], [3, 3]), ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), - ([1, 2, 999], [3]), ([1, 2, 999, 1], [4]), ([1, 200, 999, 1], [2, 2]), @@ -37,3 +35,25 @@ def test_util_minibatch(doc_sizes, expected_batches): for batch in batches: assert sum([len(example.doc) for example in batch]) < max_size + +@pytest.mark.parametrize( + "doc_sizes, expected_batches", + [ + ([400, 4000, 199], [1, 2]), + ([400, 400, 199, 3000, 200], [1, 4]), + ([400, 400, 199, 3, 1, 1500], [1, 5]), + ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]), + ([1, 2, 9999], [1, 
2]), + ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]), + ], +) +def test_util_minibatch_oversize(doc_sizes, expected_batches): + """ Test that oversized documents are returned in their own batch""" + docs = [get_doc(doc_size) for doc_size in doc_sizes] + examples = [Example(doc=doc) for doc in docs] + tol = 0.2 + batch_size = 1000 + batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False)) + assert [len(batch) for batch in batches] == expected_batches + + From c5ac382f0aaa03b6ca80d6ad61b11b325ee46702 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Jun 2020 22:24:57 +0200 Subject: [PATCH 164/187] fix name clash --- spacy/tests/test_util.py | 6 +++--- spacy/tests/util.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 6b6e84a17..1410755db 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -1,7 +1,7 @@ import pytest from spacy.gold import Example -from .util import get_doc +from .util import get_random_doc from spacy.util import minibatch_by_words @@ -24,7 +24,7 @@ from spacy.util import minibatch_by_words ], ) def test_util_minibatch(doc_sizes, expected_batches): - docs = [get_doc(doc_size) for doc_size in doc_sizes] + docs = [get_random_doc(doc_size) for doc_size in doc_sizes] examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 @@ -49,7 +49,7 @@ def test_util_minibatch(doc_sizes, expected_batches): ) def test_util_minibatch_oversize(doc_sizes, expected_batches): """ Test that oversized documents are returned in their own batch""" - docs = [get_doc(doc_size) for doc_size in doc_sizes] + docs = [get_random_doc(doc_size) for doc_size in doc_sizes] examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 73650a6f7..3d0a023c9 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -92,7 +92,7 @@ def 
get_batch(batch_size): return docs -def get_doc(n_words): +def get_random_doc(n_words): vocab = Vocab() # Make the words numbers, so that they're easy to track. numbers = [str(i) for i in range(0, n_words)] From 03c58b488c2a28d70995447fba5ab6610520d970 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 10:00:21 +0200 Subject: [PATCH 165/187] prevent infinite loop, custom warning --- spacy/cli/train_from_config.py | 27 ++++++++++++++++++--------- spacy/errors.py | 2 ++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index c0e3bd169..852f456de 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -13,6 +13,7 @@ import random from ..gold import GoldCorpus from .. import util from ..errors import Errors +from ..ml import models # don't remove - required to load the built-in architectures registry = util.registry @@ -75,7 +76,7 @@ maxout_pieces = 3 subword_features = true """ - +# TODO: REMOVE ? class PipelineComponent(BaseModel): factory: str model: Model @@ -83,7 +84,7 @@ class PipelineComponent(BaseModel): class Config: arbitrary_types_allowed = True - +# TODO: REMOVE ? class ConfigSchema(BaseModel): optimizer: Optional["Optimizer"] @@ -123,7 +124,7 @@ class ConfigSchema(BaseModel): use_gpu=("Use GPU", "option", "g", int), # fmt: on ) -def train_from_config_cli( +def train_cli( train_path, dev_path, config_path, @@ -132,7 +133,7 @@ def train_from_config_cli( raw_text=None, debug=False, verbose=False, - use_gpu=-1 + use_gpu=-1, ): """ Train or update a spaCy model. 
Requires data to be formatted in spaCy's @@ -156,7 +157,7 @@ def train_from_config_cli( else: msg.info("Using CPU") - train_from_config( + train( config_path, {"train": train_path, "dev": dev_path}, output_path=output_path, @@ -165,10 +166,11 @@ def train_from_config_cli( ) -def train_from_config( +def train( config_path, data_paths, raw_text=None, meta_path=None, output_path=None, ): msg.info(f"Loading config from: {config_path}") + # Read the config first without creating objects, to get to the original nlp_config config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) if config["training"]["use_pytorch_for_gpu_memory"]: @@ -177,8 +179,8 @@ def train_from_config( config = util.load_config(config_path, create_objects=True) msg.info("Creating nlp from config") nlp = util.load_model_from_config(nlp_config) - optimizer = config["optimizer"] training = config["training"] + optimizer = training["optimizer"] limit = training["limit"] msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) @@ -246,13 +248,19 @@ def create_train_batches(nlp, corpus, cfg): if len(train_examples) == 0: raise ValueError(Errors.E988) random.shuffle(train_examples) - batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"]) + batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"]) + # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop + try: + first = next(batches) + yield first + except StopIteration: + raise ValueError(Errors.E986) for batch in batches: yield batch epochs_todo -= 1 # We intentionally compare exactly to 0 here, so that max_epochs < 1 # will not break. 
- if epochs_todo == 0: + if epochs_todo == 0: break @@ -366,6 +374,7 @@ def train_while_improving( # Stop if we've exhausted our max steps (if specified) if max_steps and (step * accumulate_gradient) >= max_steps: break + step += 1 def subdivide_batch(batch, accumulate_gradient): diff --git a/spacy/errors.py b/spacy/errors.py index da2cfdf04..852c55225 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -554,6 +554,8 @@ class Errors(object): # TODO: fix numbering after merging develop into master + E986 = ("Could not create any training batches: check your input. " + "Perhaps discard_oversize should be set to False ?") E987 = ("The text of an example training instance is either a Doc or " "a string, but found {type} instead.") E988 = ("Could not parse any training examples. Ensure the data is " From e91485dfc464744d1c2d1ea9e648efeea9e403a1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 10:04:16 +0200 Subject: [PATCH 166/187] add discard_oversize parameter, move optimizer to training subsection --- examples/experiments/onto-joint/defaults.cfg | 3 ++- examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg | 3 ++- examples/experiments/ptb-joint-pos-dep/defaults.cfg | 3 ++- examples/experiments/tok2vec-ner/charembed_tok2vec.cfg | 3 ++- .../experiments/tok2vec-ner/multihashembed_tok2vec.cfg | 3 ++- spacy/__main__.py | 7 +++---- spacy/cli/__init__.py | 3 +-- spacy/cli/train_from_config.py | 1 - spacy/ml/__init__.py | 1 + 9 files changed, 15 insertions(+), 12 deletions(-) diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index fbac4ea7d..0fdbc5cf5 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -25,6 +25,7 @@ score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} # These settings are invalid for the transformer models. 
init_tok2vec = null vectors = null +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -32,7 +33,7 @@ start = 1000 stop = 1000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" beta1 = 0.9 beta2 = 0.999 diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index e152fa5e0..fdd4139f8 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2} limit = 0 seed = 0 accumulate_gradient = 2 +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -21,7 +22,7 @@ start = 100 stop = 1000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 9a10c45f0..5b369d782 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2} limit = 0 seed = 0 accumulate_gradient = 2 +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -21,7 +22,7 @@ start = 100 stop = 1000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg index 796c8670f..8e5c3a276 100644 --- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg @@ -12,8 +12,9 @@ max_length = 0 batch_size = 25 seed = 0 accumulate_gradient = 2 +discard_oversize = false -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 diff --git 
a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg index 3ac70675b..149b8ea66 100644 --- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -11,6 +11,7 @@ gold_preproc = true max_length = 0 seed = 0 accumulate_gradient = 2 +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -19,7 +20,7 @@ stop = 3000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 diff --git a/spacy/__main__.py b/spacy/__main__.py index 71ab1a91a..beed3170d 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -2,16 +2,15 @@ if __name__ == "__main__": import plac import sys from wasabi import msg - from spacy.cli import download, link, info, package, train, pretrain, convert + from spacy.cli import download, link, info, package, pretrain, convert from spacy.cli import init_model, profile, evaluate, validate, debug_data - from spacy.cli import train_from_config_cli + from spacy.cli import train_cli commands = { "download": download, "link": link, "info": info, - "train": train, - "train-from-config": train_from_config_cli, + "train": train_cli, "pretrain": pretrain, "debug-data": debug_data, "evaluate": evaluate, diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 5f83b26c1..2ffbe2d0c 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,8 +4,7 @@ from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 -from .train import train # noqa: F401 -from .train_from_config import train_from_config_cli # noqa: F401 +from .train_from_config import train_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 diff --git 
a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 852f456de..9cdc3bf2f 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -374,7 +374,6 @@ def train_while_improving( # Stop if we've exhausted our max steps (if specified) if max_steps and (step * accumulate_gradient) >= max_steps: break - step += 1 def subdivide_batch(batch, accumulate_gradient): diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index e69de29bb..cf4f59d6c 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -0,0 +1 @@ +from .models import * \ No newline at end of file From eac12cbb773912d274a2e4eb5090b8fe89992ef4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 11:50:16 +0200 Subject: [PATCH 167/187] make dropout in embed layers configurable --- spacy/ml/models/textcat.py | 17 ++++---- spacy/ml/models/tok2vec.py | 40 ++++++++++--------- .../defaults/entity_linker_defaults.cfg | 1 + .../defaults/morphologizer_defaults.cfg | 1 + spacy/pipeline/defaults/ner_defaults.cfg | 1 + spacy/pipeline/defaults/parser_defaults.cfg | 1 + spacy/pipeline/defaults/senter_defaults.cfg | 1 + .../pipeline/defaults/simple_ner_defaults.cfg | 1 + spacy/pipeline/defaults/tagger_defaults.cfg | 1 + .../defaults/textcat_cnn_defaults.cfg | 1 + spacy/pipeline/defaults/textcat_defaults.cfg | 1 + spacy/pipeline/defaults/tok2vec_defaults.cfg | 1 + spacy/tests/pipeline/test_textcat.py | 6 +-- .../tests/serialize/test_serialize_config.py | 3 ++ spacy/tests/test_tok2vec.py | 19 ++++----- 15 files changed, 57 insertions(+), 38 deletions(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index ce31d058c..141c66f79 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -49,13 +49,13 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCat.v1") def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, - 
window_size, conv_depth, nO=None): + window_size, conv_depth, dropout, nO=None): cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER)) - prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX)) - suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX)) - shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE)) + lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) + prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) + suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) + shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = FeatureExtractor(cols) >> with_array( @@ -114,7 +114,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class @registry.architectures.register("spacy.TextCatLowData.v1") -def build_text_classifier_lowdata(width, pretrained_vectors, nO=None): +def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): nlp = util.load_model(pretrained_vectors) vectors = nlp.vocab.vectors vector_dim = vectors.data.shape[1] @@ -129,7 +129,8 @@ def build_text_classifier_lowdata(width, pretrained_vectors, nO=None): >> reduce_sum() >> residual(Relu(width, width)) ** 2 >> Linear(nO, width) - >> Dropout(0.0) - >> Logistic() ) + if dropout: + model = model >> Dropout(dropout) + model = model >> Logistic() return model diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index a2e8f589a..53798e57c 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -49,6 +49,7 @@ def hash_embed_cnn( maxout_pieces, window_size, subword_features, + dropout, ): # 
Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -63,6 +64,7 @@ def hash_embed_cnn( char_embed=False, nM=0, nC=0, + dropout=dropout, ) @@ -76,6 +78,7 @@ def hash_charembed_cnn( window_size, nM, nC, + dropout, ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -90,12 +93,13 @@ def hash_charembed_cnn( char_embed=True, nM=nM, nC=nC, + dropout=dropout, ) @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces + pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -110,12 +114,13 @@ def hash_embed_bilstm_v1( char_embed=False, nM=0, nC=0, + dropout=dropout, ) @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") def hash_char_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC + pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC, dropout ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -130,6 +135,7 @@ def hash_char_embed_bilstm_v1( char_embed=True, nM=nM, nC=nC, + dropout=dropout, ) @@ -144,19 +150,19 @@ def LayerNormalizedMaxout(width, maxout_pieces): @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): - norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) +def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) if use_subwords: - prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX")) - suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX")) - shape = 
HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE")) + prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) + suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) + shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) if pretrained_vectors: glove = StaticVectors( vectors=pretrained_vectors.data, nO=width, column=columns.index(ID), - dropout=0.0, + dropout=dropout, ) with Model.define_operators({">>": chain, "|": concatenate}): @@ -164,13 +170,10 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): embed_layer = norm else: if use_subwords and pretrained_vectors: - nr_columns = 5 concat_columns = glove | norm | prefix | suffix | shape elif use_subwords: - nr_columns = 4 concat_columns = norm | prefix | suffix | shape else: - nr_columns = 2 concat_columns = glove | norm embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH")) @@ -179,8 +182,8 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(columns, width, rows, nM, nC, features): - norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) +def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) with Model.define_operators({">>": chain, "|": concatenate}): embed_layer = chr_embed | features >> with_array(norm) @@ -238,16 +241,17 @@ def build_Tok2Vec_model( nC, conv_depth, bilstm_depth, + dropout, ) -> Model: if char_embed: subword_features = False cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM)) + norm = HashEmbed(nO=width, 
nV=embed_size, column=cols.index(NORM), dropout=dropout) if subword_features: - prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX)) - suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX)) - shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE)) + prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) + suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) + shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) else: prefix, suffix, shape = (None, None, None) if pretrained_vectors is not None: @@ -255,7 +259,7 @@ def build_Tok2Vec_model( vectors=pretrained_vectors.data, nO=width, column=cols.index(ID), - dropout=0.0, + dropout=dropout, ) if subword_features: diff --git a/spacy/pipeline/defaults/entity_linker_defaults.cfg b/spacy/pipeline/defaults/entity_linker_defaults.cfg index 6a591ec3e..26a294f37 100644 --- a/spacy/pipeline/defaults/entity_linker_defaults.cfg +++ b/spacy/pipeline/defaults/entity_linker_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 300 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null \ No newline at end of file diff --git a/spacy/pipeline/defaults/morphologizer_defaults.cfg b/spacy/pipeline/defaults/morphologizer_defaults.cfg index 150eca507..c4452c689 100644 --- a/spacy/pipeline/defaults/morphologizer_defaults.cfg +++ b/spacy/pipeline/defaults/morphologizer_defaults.cfg @@ -11,3 +11,4 @@ window_size = 1 maxout_pieces = 3 nM = 64 nC = 8 +dropout = null \ No newline at end of file diff --git a/spacy/pipeline/defaults/ner_defaults.cfg b/spacy/pipeline/defaults/ner_defaults.cfg index db2c131f5..eb926c43b 100644 --- a/spacy/pipeline/defaults/ner_defaults.cfg +++ b/spacy/pipeline/defaults/ner_defaults.cfg @@ -13,3 +13,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git 
a/spacy/pipeline/defaults/parser_defaults.cfg b/spacy/pipeline/defaults/parser_defaults.cfg index 9cbb6eadb..6fe0fd7cb 100644 --- a/spacy/pipeline/defaults/parser_defaults.cfg +++ b/spacy/pipeline/defaults/parser_defaults.cfg @@ -13,3 +13,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/senter_defaults.cfg b/spacy/pipeline/defaults/senter_defaults.cfg index ffa2c6ce2..304e42b01 100644 --- a/spacy/pipeline/defaults/senter_defaults.cfg +++ b/spacy/pipeline/defaults/senter_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 2 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/simple_ner_defaults.cfg b/spacy/pipeline/defaults/simple_ner_defaults.cfg index 4e3b640df..7f206a636 100644 --- a/spacy/pipeline/defaults/simple_ner_defaults.cfg +++ b/spacy/pipeline/defaults/simple_ner_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 7000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/tagger_defaults.cfg b/spacy/pipeline/defaults/tagger_defaults.cfg index 5aea80a32..f26c5f099 100644 --- a/spacy/pipeline/defaults/tagger_defaults.cfg +++ b/spacy/pipeline/defaults/tagger_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/textcat_cnn_defaults.cfg b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg index cea1bfe54..91f3a1742 100644 --- a/spacy/pipeline/defaults/textcat_cnn_defaults.cfg +++ b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg @@ -11,3 +11,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/textcat_defaults.cfg b/spacy/pipeline/defaults/textcat_defaults.cfg index 9477b2995..e5817de4a 100644 --- a/spacy/pipeline/defaults/textcat_defaults.cfg +++ 
b/spacy/pipeline/defaults/textcat_defaults.cfg @@ -7,3 +7,4 @@ conv_depth = 2 embed_size = 2000 window_size = 1 ngram_size = 1 +dropout = null \ No newline at end of file diff --git a/spacy/pipeline/defaults/tok2vec_defaults.cfg b/spacy/pipeline/defaults/tok2vec_defaults.cfg index 9475d4aab..36bf0c3da 100644 --- a/spacy/pipeline/defaults/tok2vec_defaults.cfg +++ b/spacy/pipeline/defaults/tok2vec_defaults.cfg @@ -7,3 +7,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null \ No newline at end of file diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 725a4fd69..179659597 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -123,9 +123,9 @@ def test_overfitting_IO(): {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, - {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2}, - {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1}, - {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": 
False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": True}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": False}, ], diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index ba63adfa4..870a980f2 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -24,6 +24,7 @@ window_size = 1 embed_size = 2000 maxout_pieces = 3 subword_features = true +dropout = null [nlp.pipeline.tagger] factory = "tagger" @@ -53,6 +54,7 @@ embed_size = 5555 window_size = 1 maxout_pieces = 7 subword_features = false +dropout = null """ @@ -70,6 +72,7 @@ def my_parser(): nC=8, conv_depth=2, bilstm_depth=0, + dropout=None, ) parser = build_tb_parser_model( tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 9c2e9004b..ee1f9dead 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -15,7 +15,7 @@ def test_empty_doc(): vocab = Vocab() doc = Doc(vocab, words=[]) # TODO: fix tok2vec arguments - tok2vec = build_Tok2Vec_model(width, embed_size) + tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None) vectors, backprop = tok2vec.begin_update([doc]) assert len(vectors) == 1 assert vectors[0].shape == (0, width) @@ -38,6 +38,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): char_embed=False, nM=64, nC=8, + dropout=None, ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) @@ -50,14 +51,14 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): 
@pytest.mark.parametrize( "tok2vec_config", [ - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, 
"window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, ], ) # fmt: on From 109bbdab98735def2f106d113094fc880d2b2382 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 11:53:59 +0200 Subject: [PATCH 168/187] update config files with separate dropout for Tok2Vec layer --- examples/experiments/onto-joint/defaults.cfg | 1 + examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg | 1 + examples/experiments/ptb-joint-pos-dep/defaults.cfg | 1 + examples/experiments/tok2vec-ner/charembed_tok2vec.cfg | 1 + examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg | 1 + 5 files changed, 5 insertions(+) 
diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index 0fdbc5cf5..6c3a21f4b 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -114,3 +114,4 @@ window_size = 1 embed_size = 10000 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index fdd4139f8..52faad9ec 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -66,3 +66,4 @@ depth = 4 embed_size = 2000 subword_features = true maxout_pieces = 3 +dropout = null \ No newline at end of file diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 5b369d782..c305c015c 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -67,3 +67,4 @@ window_size = 1 embed_size = 2000 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg index 8e5c3a276..eca6a22fa 100644 --- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg @@ -37,6 +37,7 @@ nM = 64 nC = 8 rows = 2000 columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] +dropout = null [nlp.pipeline.tok2vec.model.extract.features] @architectures = "spacy.Doc2Feats.v1" diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg index 149b8ea66..a5fa32b18 100644 --- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -45,3 +45,4 @@ maxout_pieces = 3 window_size = 1 
subword_features = true pretrained_vectors = null +dropout = null From f1f9c8b417985cd6fcd9c38edbf87ddb5cee0ea3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Jun 2020 14:03:43 +0200 Subject: [PATCH 169/187] Port train CLI updates Updates from #5362 and fix from #5387: * `train`: * if training on GPU, only run evaluation/timing on CPU in the first iteration * if training is aborted, exit with a non-0 exit status --- spacy/cli/train.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index da3d1d5a6..04f39ca8f 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -458,22 +458,25 @@ def train( cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) - with use_ops("numpy"): - nlp_loaded = util.load_model_from_path(epoch_model_path) - for name, component in nlp_loaded.pipeline: - if hasattr(component, "cfg"): - component.cfg["beam_width"] = beam_width - dev_dataset = list( - corpus.dev_dataset( - nlp_loaded, - gold_preproc=gold_preproc, - ignore_misaligned=True, + # Evaluate on CPU in the first iteration only (for + # timing) when GPU is enabled + if i == 0: + with use_ops("numpy"): + nlp_loaded = util.load_model_from_path(epoch_model_path) + for name, component in nlp_loaded.pipeline: + if hasattr(component, "cfg"): + component.cfg["beam_width"] = beam_width + dev_dataset = list( + corpus.dev_dataset( + nlp_loaded, + gold_preproc=gold_preproc, + ignore_misaligned=True, + ) ) - ) - start_time = timer() - scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) - end_time = timer() - cpu_wps = nwords / (end_time - start_time) + start_time = timer() + scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) + end_time = timer() + cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / f"model{i}" / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) @@ -550,7 +553,7 @@ def train( ) break except Exception as 
e: - msg.warn(f"Aborting and saving final best model. Encountered exception: {e}") + msg.warn(f"Aborting and saving final best model. Encountered exception: {e}", exits=1) finally: best_pipes = nlp.pipe_names if disabled_pipes: From 10d938f2217bf0a01d8a5df7ba730c11b9404aa2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Jun 2020 14:15:50 +0200 Subject: [PATCH 170/187] Update default cfg dir in train CLI --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 04f39ca8f..d4010c43b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -112,7 +112,7 @@ def train( eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] - default_dir = Path(__file__).parent.parent / "ml" / "models" / "defaults" + default_dir = Path(__file__).parent.parent / "pipeline" / "defaults" # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. 
If From 1d8168d1fd8220ecd27dd6fbc8d604572d0b040b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Jun 2020 14:15:58 +0200 Subject: [PATCH 171/187] Fix problems with lower and whitespace in variants Port relevant changes from #5361: * Initialize lower flag explicitly * Handle whitespace words from GoldParse correctly when creating raw text with orth variants --- spacy/gold.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 5aa7da456..4d564d8f6 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -362,6 +362,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): if not example.token_annotation: return example raw = example.text + lower = False if random.random() >= 0.5: lower = True if raw is not None: @@ -429,8 +430,11 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): raw_idx += 1 for word in variant_example.token_annotation.words: match_found = False + # skip whitespace words + if word.isspace(): + match_found = True # add identical word - if word not in variants and raw[raw_idx:].startswith(word): + elif word not in variants and raw[raw_idx:].startswith(word): variant_raw += word raw_idx += len(word) match_found = True @@ -445,6 +449,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): # something went wrong, abort # (add a warning message?) if not match_found: + print("aborting") return example # add following whitespace while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): From b0ee76264b5a80d63d62e93aad40bddace3489dc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Jun 2020 14:20:09 +0200 Subject: [PATCH 172/187] Remove debugging --- spacy/gold.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 4d564d8f6..27f9f6553 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -449,7 +449,6 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): # something went wrong, abort # (add a warning message?) 
if not match_found: - print("aborting") return example # add following whitespace while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): From 4e0610d0d46e6cd1af0375c745190b8713b6c44e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 Jun 2020 14:37:09 +0200 Subject: [PATCH 173/187] Update warning codes --- spacy/morphology.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 31d83244c..399ce406e 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -42,7 +42,7 @@ def _normalize_props(props): elif isinstance(key, (int, str)) and isinstance(value, (int, str)): out[key] = value else: - warnings.warn(Warnings.W095.format(feature={key: value})) + warnings.warn(Warnings.100.format(feature={key: value})) return out @@ -112,7 +112,7 @@ cdef class Morphology: return tag_ptr.key features = self.feats_to_dict(features) if not isinstance(features, dict): - warnings.warn(Warnings.W095.format(feature=features)) + warnings.warn(Warnings.W100.format(feature=features)) features = {} features = _normalize_props(features) string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} From a8875d4a4b7eb47baa7bb0bbd8b643aa258ac23b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 Jun 2020 14:42:39 +0200 Subject: [PATCH 174/187] Fix typo --- spacy/morphology.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 399ce406e..3e369fb3e 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -42,7 +42,7 @@ def _normalize_props(props): elif isinstance(key, (int, str)) and isinstance(value, (int, str)): out[key] = value else: - warnings.warn(Warnings.100.format(feature={key: value})) + warnings.warn(Warnings.W100.format(feature={key: value})) return out From ffe0451d0972ec209556dc7aad356deca1cbe0a7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 14:45:00 
+0200 Subject: [PATCH 175/187] pretrain from config --- examples/experiments/onto-joint/pretrain.cfg | 144 +++++++++++++++ spacy/_ml.py | 0 spacy/cli/pretrain.py | 179 +++++++------------ spacy/errors.py | 2 - spacy/ml/models/multi_task.py | 84 ++++++++- 5 files changed, 286 insertions(+), 123 deletions(-) create mode 100644 examples/experiments/onto-joint/pretrain.cfg delete mode 100644 spacy/_ml.py diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg new file mode 100644 index 000000000..6a41cc677 --- /dev/null +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -0,0 +1,144 @@ +# Training hyper-parameters and additional features. +[training] +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length or number of examples. +max_length = 0 +limit = 0 +# Data augmentation +orth_variant_level = 0.0 +dropout = 0.1 +# Controls early-stopping. 0 or -1 mean unlimited. +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 400 +# Other settings +seed = 0 +accumulate_gradient = 1 +use_pytorch_for_gpu_memory = false +# Control how scores are printed and checkpoints are evaluated. +scores = ["speed", "tags_acc", "uas", "las", "ents_f"] +score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} +# These settings are invalid for the transformer models. 
+init_tok2vec = null +vectors = null +discard_oversize = false + +[training.batch_size] +@schedules = "compounding.v1" +start = 1000 +stop = 1000 +compound = 1.001 + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +[pretraining] +max_epochs = 100 +min_length = 5 +max_length = 500 +dropout = 0.2 +n_save_every = null +batch_size = 3000 + +[pretraining.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = ${nlp:vectors} +width = 256 +depth = 6 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true +dropout = null + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +[pretraining.loss_func] +@losses = "CosineDistance.v1" + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.senter] +factory = "senter" + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.parser] +factory = "parser" + +[nlp.pipeline.senter.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.senter.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tagger.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.parser.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 8 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.parser.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 
+hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = ${nlp:vectors} +width = 256 +depth = 6 +window_size = 1 +embed_size = 10000 +maxout_pieces = 3 +subword_features = true +dropout = null diff --git a/spacy/_ml.py b/spacy/_ml.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index b2e3229ee..0022a0d07 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -3,48 +3,36 @@ import numpy import time import re from collections import Counter +import plac from pathlib import Path -from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu -from thinc.api import CosineDistance, L2Distance +from thinc.api import Linear, Maxout, chain, list2array from wasabi import msg import srsly +from thinc.api import use_pytorch_for_gpu_memory -from ..gold import Example from ..errors import Errors from ..ml.models.multi_task import build_masked_language_model from ..tokens import Doc from ..attrs import ID, HEAD -from ..ml.models.tok2vec import build_Tok2Vec_model from .. 
import util -from ..util import create_default_optimizer -from .train import _load_pretrained_tok2vec +from ..gold import Example -def pretrain( +@plac.annotations( # fmt: off - texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), - vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str), - output_dir: ("Directory to write models to on each epoch", "positional", None, str), - width: ("Width of CNN layers", "option", "cw", int) = 96, - conv_depth: ("Depth of CNN layers", "option", "cd", int) = 4, - bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0, - cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3, - sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0, - use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False, - cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1, - embed_rows: ("Number of embedding rows", "option", "er", int) = 2000, - loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine", - use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False, - dropout: ("Dropout rate", "option", "d", float) = 0.2, - n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000, - batch_size: ("Number of words per training batch", "option", "bs", int) = 3000, - max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500, - min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5, - seed: ("Seed for random number generators", "option", "s", int) = 0, - n_save_every: ("Save model every X batches.", "option", "se", int) = None, - init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. 
See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, - epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None, + texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), + vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str), + config_path=("Path to config file", "positional", None, Path), + output_dir=("Directory to write models to on each epoch", "positional", None, Path), + use_gpu=("Use GPU", "option", "g", int), # fmt: on +) +def pretrain( + texts_loc, + vectors_model, + config_path, + output_dir, + use_gpu=-1, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -58,23 +46,24 @@ def pretrain( However, it's still quite experimental, so your mileage may vary. To load the weights back in during 'spacy train', you need to ensure - all settings are the same between pretraining and training. The API and - errors around this need some improvement. + all settings are the same between pretraining and training. Ideally, + this is done by using the same config file for both commands. 
""" - config = dict(locals()) - for key in config: - if isinstance(config[key], Path): - config[key] = str(config[key]) - util.fix_random_seed(seed) + if not config_path or not config_path.exists(): + msg.fail("Config file not found", config_path, exits=1) - has_gpu = prefer_gpu() - if has_gpu: - import torch + if use_gpu >= 0: + msg.info("Using GPU") + util.use_gpu(use_gpu) + else: + msg.info("Using CPU") - torch.set_default_tensor_type("torch.cuda.FloatTensor") - msg.info("Using GPU" if has_gpu else "Not using GPU") + msg.info(f"Loading config from: {config_path}") + config = util.load_config(config_path, create_objects=False) + util.fix_random_seed(config["training"]["seed"]) + if config["training"]["use_pytorch_for_gpu_memory"]: + use_pytorch_for_gpu_memory() - output_dir = Path(output_dir) if output_dir.exists() and [p for p in output_dir.iterdir()]: msg.warn( "Output directory is not empty", @@ -85,7 +74,10 @@ def pretrain( output_dir.mkdir() msg.good(f"Created output directory: {output_dir}") srsly.write_json(output_dir / "config.json", config) - msg.good("Saved settings to config.json") + msg.good("Saved config file in the output directory") + + config = util.load_config(config_path, create_objects=True) + pretrain_config = config["pretraining"] # Load texts from file or stdin if texts_loc != "-": # reading from a file @@ -105,49 +97,11 @@ def pretrain( with msg.loading(f"Loading model '{vectors_model}'..."): nlp = util.load_model(vectors_model) msg.good(f"Loaded model '{vectors_model}'") - pretrained_vectors = None if not use_vectors else nlp.vocab.vectors - model = create_pretraining_model( - nlp, - # TODO: replace with config - build_Tok2Vec_model( - width, - embed_rows, - conv_depth=conv_depth, - pretrained_vectors=pretrained_vectors, - bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. - subword_features=not use_chars, # Set to False for Chinese etc - maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. 
- window_size=1, - char_embed=False, - nM=64, - nC=8, - ), - ) - # Load in pretrained weights - if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text(f"Loaded pretrained tok2vec for: {components}") - # Parse the epoch number from the given weight file - model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) - if model_name: - # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' - epoch_start = int(model_name.group(0)[5:][:-4]) + 1 - else: - if not epoch_start: - msg.fail( - "You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec", - exits=True, - ) - elif epoch_start < 0: - msg.fail( - f"The argument --epoch-start has to be greater or equal to 0. {epoch_start} is invalid", - exits=True, - ) - else: - # Without '--init-tok2vec' the '--epoch-start' argument is ignored - epoch_start = 0 + tok2vec = pretrain_config["model"] + model = create_pretraining_model(nlp, tok2vec) + optimizer = pretrain_config["optimizer"] - optimizer = create_default_optimizer() + epoch_start = 0 # TODO tracker = ProgressTracker(frequency=10000) msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} @@ -168,28 +122,25 @@ def pretrain( file_.write(srsly.json_dumps(log) + "\n") skip_counter = 0 - for epoch in range(epoch_start, n_iter + epoch_start): - for batch_id, batch in enumerate( - util.minibatch_by_words( - (Example(doc=text) for text in texts), size=batch_size - ) - ): + loss_func = pretrain_config["loss_func"] + for epoch in range(epoch_start, pretrain_config["max_epochs"]): + examples = [Example(doc=text) for text in texts] + batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"]) + for batch_id, batch in enumerate(batches): docs, count = make_docs( nlp, - [text for (text, _) in batch], - max_length=max_length, - min_length=min_length, 
+ [ex.doc for ex in batch], + max_length=pretrain_config["max_length"], + min_length=pretrain_config["min_length"], ) skip_counter += count - loss = make_update( - model, docs, optimizer, objective=loss_func, drop=dropout - ) + loss = make_update(model, docs, optimizer, distance=loss_func) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: break - if n_save_every and (batch_id % n_save_every == 0): + if pretrain_config["n_save_every"] and (batch_id % pretrain_config["n_save_every"] == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 @@ -201,17 +152,17 @@ def pretrain( msg.good("Successfully finished pretrain") -def make_update(model, docs, optimizer, drop=0.0, objective="L2"): +def make_update(model, docs, optimizer, distance): """Perform an update over a single batch of documents. docs (iterable): A batch of `Doc` objects. - drop (float): The dropout rate. optimizer (callable): An optimizer. RETURNS loss: A float for the loss. 
""" - predictions, backprop = model.begin_update(docs, drop=drop) - loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective) - backprop(gradients, sgd=optimizer) + predictions, backprop = model.begin_update(docs) + loss, gradients = get_vectors_loss(model.ops, docs, predictions, distance) + backprop(gradients) + model.finish_update(optimizer) # Don't want to return a cupy object here # The gradients are modified in-place by the BERT MLM, # so we get an accurate loss @@ -243,12 +194,12 @@ def make_docs(nlp, batch, min_length, max_length): heads = numpy.asarray(heads, dtype="uint64") heads = heads.reshape((len(doc), 1)) doc = doc.from_array([HEAD], heads) - if len(doc) >= min_length and len(doc) < max_length: + if min_length <= len(doc) < max_length: docs.append(doc) return docs, skip_count -def get_vectors_loss(ops, docs, prediction, objective="L2"): +def get_vectors_loss(ops, docs, prediction, distance): """Compute a mean-squared error loss between the documents' vectors and the prediction. @@ -262,13 +213,6 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): # and look them up all at once. This prevents data copying. ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] - # TODO: this code originally didn't normalize, but shouldn't normalize=True ? - if objective == "L2": - distance = L2Distance(normalize=False) - elif objective == "cosine": - distance = CosineDistance(normalize=False) - else: - raise ValueError(Errors.E142.format(loss_func=objective)) d_target, loss = distance(prediction, target) return loss, d_target @@ -281,7 +225,7 @@ def create_pretraining_model(nlp, tok2vec): """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( - Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size) + Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size) ) # This is annoying, but the parser etc have the flatten step after # the tok2vec. 
To load the weights in cleanly, we need to match @@ -289,11 +233,12 @@ def create_pretraining_model(nlp, tok2vec): # "tok2vec" has to be the same set of processes as what the components do. tok2vec = chain(tok2vec, list2array()) model = chain(tok2vec, output_layer) - model = build_masked_language_model(nlp.vocab, model) - model.set_ref("tok2vec", tok2vec) - model.set_ref("output_layer", output_layer) model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) - return model + mlm_model = build_masked_language_model(nlp.vocab, model) + mlm_model.set_ref("tok2vec", tok2vec) + mlm_model.set_ref("output_layer", output_layer) + mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) + return mlm_model class ProgressTracker(object): diff --git a/spacy/errors.py b/spacy/errors.py index 852c55225..96b323ef5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -441,8 +441,6 @@ class Errors(object): "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " "provided {found}.") - E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or " - "'cosine'.") E143 = ("Labels for component '{name}' not initialized. 
Did you forget to " "call add_label()?") E144 = ("Could not find parameter `{param}` when building the entity " diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 1c193df82..970d31899 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,4 +1,6 @@ -from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init +import numpy + +from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96): @@ -24,6 +26,80 @@ def build_cloze_multi_task_model(vocab, tok2vec): return model -def build_masked_language_model(*args, **kwargs): - # TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828 - raise NotImplementedError +def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): + """Convert a model into a BERT-style masked language model""" + + random_words = _RandomWords(vocab) + + def mlm_forward(model, docs, is_train): + mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) + mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) + output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop + + def mlm_backward(d_output): + d_output *= 1 - mask + return backprop(d_output) + + return output, mlm_backward + + mlm_model = Model("masked-language-model", mlm_forward, layers=[wrapped_model]) + mlm_model.set_ref("wrapped-model", wrapped_model) + + return mlm_model + + +class _RandomWords(object): + def __init__(self, vocab): + self.words = [lex.text for lex in vocab if lex.prob != 0.0] + self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] + self.words = self.words[:10000] + self.probs = self.probs[:10000] + self.probs = numpy.exp(numpy.array(self.probs, dtype="f")) + self.probs /= self.probs.sum() + self._cache = [] + + def next(self): + if not self._cache: + self._cache.extend( + 
numpy.random.choice(len(self.words), 10000, p=self.probs) + ) + index = self._cache.pop() + return self.words[index] + + +def _apply_mask(docs, random_words, mask_prob=0.15): + # This needs to be here to avoid circular imports + from ...tokens import Doc + + N = sum(len(doc) for doc in docs) + mask = numpy.random.uniform(0.0, 1.0, (N,)) + mask = mask >= mask_prob + i = 0 + masked_docs = [] + for doc in docs: + words = [] + for token in doc: + if not mask[i]: + word = _replace_word(token.text, random_words) + else: + word = token.text + words.append(word) + i += 1 + spaces = [bool(w.whitespace_) for w in doc] + # NB: If you change this implementation to instead modify + # the docs in place, take care that the IDs reflect the original + # words. Currently we use the original docs to make the vectors + # for the target, so we don't lose the original tokens. But if + # you modified the docs in place here, you would. + masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces)) + return mask, masked_docs + + +def _replace_word(word, random_words, mask="[MASK]"): + roll = numpy.random.random() + if roll < 0.8: + return mask + elif roll < 0.9: + return random_words.next() + else: + return word \ No newline at end of file From ddf8244df954972d81b22449c149d1b79964b2cf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 14:52:54 +0200 Subject: [PATCH 176/187] add normalize option to distance metric --- examples/experiments/onto-joint/pretrain.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 6a41cc677..87501fb16 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -76,6 +76,7 @@ learn_rate = 0.001 [pretraining.loss_func] @losses = "CosineDistance.v1" +normalize = true [nlp] lang = "en" From 4ed6278663c9482e14b549b2079f02cc186bc078 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 19:32:40 
+0200 Subject: [PATCH 177/187] small fixes to pretrain config, init_tok2vec TODO --- examples/experiments/onto-joint/pretrain.cfg | 6 +++- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 2 +- spacy/cli/pretrain.py | 34 ++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 87501fb16..f1de3eab9 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -45,12 +45,16 @@ eps = 1e-8 learn_rate = 0.001 [pretraining] -max_epochs = 100 +max_epochs = 1000 +start_epoch = 0 min_length = 5 max_length = 500 dropout = 0.2 n_save_every = null batch_size = 3000 +seed = ${training:seed} +use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory} +init_tok2vec = null [pretraining.model] @architectures = "spacy.HashEmbedCNN.v1" diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 52faad9ec..acbcc8d41 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -66,4 +66,4 @@ depth = 4 embed_size = 2000 subword_features = true maxout_pieces = 3 -dropout = null \ No newline at end of file +dropout = null diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 0022a0d07..d6f4d484c 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -16,14 +16,15 @@ from ..tokens import Doc from ..attrs import ID, HEAD from .. 
import util from ..gold import Example +from .deprecated_pretrain import _load_pretrained_tok2vec # TODO @plac.annotations( # fmt: off texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str), - config_path=("Path to config file", "positional", None, Path), output_dir=("Directory to write models to on each epoch", "positional", None, Path), + config_path=("Path to config file", "positional", None, Path), use_gpu=("Use GPU", "option", "g", int), # fmt: on ) @@ -60,8 +61,8 @@ def pretrain( msg.info(f"Loading config from: {config_path}") config = util.load_config(config_path, create_objects=False) - util.fix_random_seed(config["training"]["seed"]) - if config["training"]["use_pytorch_for_gpu_memory"]: + util.fix_random_seed(config["pretraining"]["seed"]) + if config["pretraining"]["use_pytorch_for_gpu_memory"]: use_pytorch_for_gpu_memory() if output_dir.exists() and [p for p in output_dir.iterdir()]: @@ -100,8 +101,33 @@ def pretrain( tok2vec = pretrain_config["model"] model = create_pretraining_model(nlp, tok2vec) optimizer = pretrain_config["optimizer"] + init_tok2vec = pretrain_config["init_tok2vec"] + epoch_start = pretrain_config["epoch_start"] + + # Load in pretrained weights - TODO test + if init_tok2vec is not None: + components = _load_pretrained_tok2vec(nlp, init_tok2vec) + msg.text(f"Loaded pretrained tok2vec for: {components}") + # Parse the epoch number from the given weight file + model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) + if model_name: + # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' + epoch_start = int(model_name.group(0)[5:][:-4]) + 1 + else: + if not epoch_start: + msg.fail( + "You have to use the epoch_start setting when using a renamed weight file for init_tok2vec", + exits=True, + ) + elif 
epoch_start < 0: + msg.fail( + f"The setting epoch_start has to be greater or equal to 0. {epoch_start} is invalid", + exits=True, + ) + else: + # Without 'init-tok2vec' the 'epoch_start' setting is ignored + epoch_start = 0 - epoch_start = 0 # TODO tracker = ProgressTracker(frequency=10000) msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} From 07886a3de35f6a9188c6a2963c45e4fbda138004 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 22:00:25 +0200 Subject: [PATCH 178/187] rename init_tok2vec to resume --- examples/experiments/onto-joint/pretrain.cfg | 2 - spacy/cli/pretrain.py | 61 ++++++++++++-------- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index f1de3eab9..1637cceae 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -46,7 +46,6 @@ learn_rate = 0.001 [pretraining] max_epochs = 1000 -start_epoch = 0 min_length = 5 max_length = 500 dropout = 0.2 @@ -54,7 +53,6 @@ n_save_every = null batch_size = 3000 seed = ${training:seed} use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory} -init_tok2vec = null [pretraining.model] @architectures = "spacy.HashEmbedCNN.v1" diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index d6f4d484c..0a04de101 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -16,7 +16,6 @@ from ..tokens import Doc from ..attrs import ID, HEAD from .. 
import util from ..gold import Example -from .deprecated_pretrain import _load_pretrained_tok2vec # TODO @plac.annotations( @@ -26,7 +25,10 @@ from .deprecated_pretrain import _load_pretrained_tok2vec # TODO output_dir=("Directory to write models to on each epoch", "positional", None, Path), config_path=("Path to config file", "positional", None, Path), use_gpu=("Use GPU", "option", "g", int), - # fmt: on + resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path), + epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int), + +# fmt: on ) def pretrain( texts_loc, @@ -34,6 +36,8 @@ def pretrain( config_path, output_dir, use_gpu=-1, + resume_path=None, + epoch_resume=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -66,11 +70,19 @@ def pretrain( use_pytorch_for_gpu_memory() if output_dir.exists() and [p for p in output_dir.iterdir()]: - msg.warn( - "Output directory is not empty", - "It is better to use an empty directory or refer to a new output path, " - "then the new directory will be created for you.", - ) + if resume_path: + msg.warn( + "Output directory is not empty. ", + "If you're resuming a run from a previous " + "model, the old models for the consecutive epochs will be overwritten " + "with the new ones.", + ) + else: + msg.warn( + "Output directory is not empty. 
", + "It is better to use an empty directory or refer to a new output path, " + "then the new directory will be created for you.", + ) if not output_dir.exists(): output_dir.mkdir() msg.good(f"Created output directory: {output_dir}") @@ -92,7 +104,7 @@ def pretrain( msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin - msg.text("Reading input text from stdin...") + msg.info("Reading input text from stdin...") texts = srsly.read_jsonl("-") with msg.loading(f"Loading model '{vectors_model}'..."): @@ -101,35 +113,36 @@ def pretrain( tok2vec = pretrain_config["model"] model = create_pretraining_model(nlp, tok2vec) optimizer = pretrain_config["optimizer"] - init_tok2vec = pretrain_config["init_tok2vec"] - epoch_start = pretrain_config["epoch_start"] - # Load in pretrained weights - TODO test - if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text(f"Loaded pretrained tok2vec for: {components}") + # Load in pretrained weights to resume from + if resume_path is not None: + msg.info(f"Resume training tok2vec from: {resume_path}") + with resume_path.open("rb") as file_: + weights_data = file_.read() + model.get_ref("tok2vec").from_bytes(weights_data) # Parse the epoch number from the given weight file - model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) + model_name = re.search(r"model\d+\.bin", str(resume_path)) if model_name: # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' - epoch_start = int(model_name.group(0)[5:][:-4]) + 1 + epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 + msg.info(f"Resuming from epoch: {epoch_resume}") else: - if not epoch_start: + if not epoch_resume: msg.fail( - "You have to use the epoch_start setting when using a renamed weight file for init_tok2vec", + "You have to use the --epoch_resume setting when using a renamed weight file for --resume_path", exits=True, ) - elif epoch_start < 0: + elif epoch_resume < 0: msg.fail( 
- f"The setting epoch_start has to be greater or equal to 0. {epoch_start} is invalid", + f"The setting --epoch_resume has to be greater or equal to 0. {epoch_resume} is invalid", exits=True, ) else: - # Without 'init-tok2vec' the 'epoch_start' setting is ignored - epoch_start = 0 + # Without 'resume_path' the 'epoch_resume' setting is ignored + epoch_resume = 0 tracker = ProgressTracker(frequency=10000) - msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) @@ -149,7 +162,7 @@ def pretrain( skip_counter = 0 loss_func = pretrain_config["loss_func"] - for epoch in range(epoch_start, pretrain_config["max_epochs"]): + for epoch in range(epoch_resume, pretrain_config["max_epochs"]): examples = [Example(doc=text) for text in texts] batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"]) for batch_id, batch in enumerate(batches): From 1775f54a2627ccad23b81d64e74c19777f71057f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 22:17:02 +0200 Subject: [PATCH 179/187] small little fixes --- spacy/cli/pretrain.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 0a04de101..96564b98b 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -5,10 +5,9 @@ import re from collections import Counter import plac from pathlib import Path -from thinc.api import Linear, Maxout, chain, list2array +from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory from wasabi import msg import srsly -from thinc.api import use_pytorch_for_gpu_memory from ..errors import Errors from ..ml.models.multi_task import build_masked_language_model @@ -73,8 +72,8 @@ def pretrain( if resume_path: 
msg.warn( "Output directory is not empty. ", - "If you're resuming a run from a previous " - "model, the old models for the consecutive epochs will be overwritten " + "If you're resuming a run from a previous model in this directory, " + "the old models for the consecutive epochs will be overwritten " "with the new ones.", ) else: @@ -129,16 +128,18 @@ def pretrain( else: if not epoch_resume: msg.fail( - "You have to use the --epoch_resume setting when using a renamed weight file for --resume_path", + "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path", exits=True, ) elif epoch_resume < 0: msg.fail( - f"The setting --epoch_resume has to be greater or equal to 0. {epoch_resume} is invalid", + f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid", exits=True, ) + else: + msg.info(f"Resuming from epoch: {epoch_resume}") else: - # Without 'resume_path' the 'epoch_resume' setting is ignored + # Without '--resume-path' the '--epoch-resume' argument is ignored epoch_resume = 0 tracker = ProgressTracker(frequency=10000) From 6b027d76893de1b535f17a9b2848aba93bb2bb41 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 4 Jun 2020 15:49:23 +0200 Subject: [PATCH 180/187] remove duplicate model definition of tok2vec layer --- examples/experiments/onto-joint/pretrain.cfg | 12 +----------- spacy/cli/pretrain.py | 5 ++++- spacy/ml/models/multi_task.py | 2 +- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 1637cceae..4f1898d69 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -53,17 +53,7 @@ n_save_every = null batch_size = 3000 seed = ${training:seed} use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory} - -[pretraining.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${nlp:vectors} -width = 256 
-depth = 6 -window_size = 1 -embed_size = 2000 -maxout_pieces = 3 -subword_features = true -dropout = null +tok2vec_model = "nlp.pipeline.tok2vec.model" [pretraining.optimizer] @optimizers = "Adam.v1" diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 96564b98b..921eb38ab 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -109,7 +109,10 @@ def pretrain( with msg.loading(f"Loading model '{vectors_model}'..."): nlp = util.load_model(vectors_model) msg.good(f"Loaded model '{vectors_model}'") - tok2vec = pretrain_config["model"] + tok2vec_path = pretrain_config["tok2vec_model"] + tok2vec = config + for subpath in tok2vec_path.split("."): + tok2vec = tok2vec.get(subpath) model = create_pretraining_model(nlp, tok2vec) optimizer = pretrain_config["optimizer"] diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 970d31899..8000d1aff 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -102,4 +102,4 @@ def _replace_word(word, random_words, mask="[MASK]"): elif roll < 0.9: return random_words.next() else: - return word \ No newline at end of file + return word From 776d4f11909d796963068a9a931fdca8b71a8ccc Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 4 Jun 2020 16:07:30 +0200 Subject: [PATCH 181/187] cleanup --- spacy/cli/train_from_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 9cdc3bf2f..a6d0a0abc 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -76,7 +76,7 @@ maxout_pieces = 3 subword_features = true """ -# TODO: REMOVE ? + class PipelineComponent(BaseModel): factory: str model: Model @@ -84,7 +84,7 @@ class PipelineComponent(BaseModel): class Config: arbitrary_types_allowed = True -# TODO: REMOVE ? 
+ class ConfigSchema(BaseModel): optimizer: Optional["Optimizer"] From 3ade455fd35eb14bf59f4d0276c1699323a947a3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 4 Jun 2020 16:09:55 +0200 Subject: [PATCH 182/187] formatting --- spacy/cli/pretrain.py | 7 ++++--- spacy/ml/__init__.py | 2 +- spacy/pipeline/defaults/entity_linker_defaults.cfg | 2 +- spacy/pipeline/defaults/morphologizer_defaults.cfg | 2 +- spacy/pipeline/defaults/textcat_defaults.cfg | 2 +- spacy/pipeline/defaults/tok2vec_defaults.cfg | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 921eb38ab..d37426b5a 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -26,8 +26,7 @@ from ..gold import Example use_gpu=("Use GPU", "option", "g", int), resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path), epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int), - -# fmt: on + # fmt: on ) def pretrain( texts_loc, @@ -183,7 +182,9 @@ def pretrain( msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: break - if pretrain_config["n_save_every"] and (batch_id % pretrain_config["n_save_every"] == 0): + if pretrain_config["n_save_every"] and ( + batch_id % pretrain_config["n_save_every"] == 0 + ): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index cf4f59d6c..aed4fa323 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1 +1 @@ -from .models import * \ No newline at end of file +from .models import * diff --git a/spacy/pipeline/defaults/entity_linker_defaults.cfg b/spacy/pipeline/defaults/entity_linker_defaults.cfg index 26a294f37..8dddf9e7b 100644 --- a/spacy/pipeline/defaults/entity_linker_defaults.cfg +++ 
b/spacy/pipeline/defaults/entity_linker_defaults.cfg @@ -10,4 +10,4 @@ embed_size = 300 window_size = 1 maxout_pieces = 3 subword_features = true -dropout = null \ No newline at end of file +dropout = null diff --git a/spacy/pipeline/defaults/morphologizer_defaults.cfg b/spacy/pipeline/defaults/morphologizer_defaults.cfg index c4452c689..6ee053a08 100644 --- a/spacy/pipeline/defaults/morphologizer_defaults.cfg +++ b/spacy/pipeline/defaults/morphologizer_defaults.cfg @@ -11,4 +11,4 @@ window_size = 1 maxout_pieces = 3 nM = 64 nC = 8 -dropout = null \ No newline at end of file +dropout = null diff --git a/spacy/pipeline/defaults/textcat_defaults.cfg b/spacy/pipeline/defaults/textcat_defaults.cfg index e5817de4a..0981cf77c 100644 --- a/spacy/pipeline/defaults/textcat_defaults.cfg +++ b/spacy/pipeline/defaults/textcat_defaults.cfg @@ -7,4 +7,4 @@ conv_depth = 2 embed_size = 2000 window_size = 1 ngram_size = 1 -dropout = null \ No newline at end of file +dropout = null diff --git a/spacy/pipeline/defaults/tok2vec_defaults.cfg b/spacy/pipeline/defaults/tok2vec_defaults.cfg index 36bf0c3da..d2718eed1 100644 --- a/spacy/pipeline/defaults/tok2vec_defaults.cfg +++ b/spacy/pipeline/defaults/tok2vec_defaults.cfg @@ -7,4 +7,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true -dropout = null \ No newline at end of file +dropout = null From d93cbeb14fe71e8e17445224241e0d26f0223dbd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 5 Jun 2020 03:42:15 -0700 Subject: [PATCH 183/187] Add warning for loose version constraints (#5536) * Add warning for loose version constraints * Update wording [ci skip] * Tweak error message Co-authored-by: Matthew Honnibal --- spacy/errors.py | 6 ++++++ spacy/tests/test_misc.py | 18 ++++++++++++++++ spacy/util.py | 46 ++++++++++++++++++++++++++++++++++------ 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index ce931f0a1..94a0218a7 100644 --- a/spacy/errors.py +++ 
b/spacy/errors.py @@ -113,6 +113,12 @@ class Warnings(object): "ignored during training.") # TODO: fix numbering after merging develop into master + W094 = ("Model '{model}' ({model_version}) specifies an under-constrained " + "spaCy version requirement: {version}. This can lead to compatibility " + "problems with older versions, or as new spaCy versions are " + "released, because the model may say it's compatible when it's " + 'not. Consider changing the "spacy_version" in your meta.json to a ' + "version range, with a lower and upper pin. For example: {example}") W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is " "incompatible with the current version ({current}). This may lead " "to unexpected results or runtime errors. To resolve this, " diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index e4b4e570c..4e6c0e652 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -109,3 +109,21 @@ def test_ascii_filenames(): ) def test_is_compatible_version(version, constraint, compatible): assert util.is_compatible_version(version, constraint) is compatible + + +@pytest.mark.parametrize( + "constraint,expected", + [ + ("3.0.0", False), + ("==3.0.0", False), + (">=2.3.0", True), + (">2.0.0", True), + ("<=2.0.0", True), + (">2.0.0,<3.0.0", False), + (">=2.0.0,<3.0.0", False), + ("!=1.1,>=1.0,~=1.0", True), + ("n/a", None), + ], +) +def test_is_unconstrained_version(constraint, expected): + assert util.is_unconstrained_version(constraint) is expected diff --git a/spacy/util.py b/spacy/util.py index 97cc5a8d7..bc6c98a82 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -264,6 +264,31 @@ def is_compatible_version(version, constraint, prereleases=True): return version in spec +def is_unconstrained_version(constraint, prereleases=True): + # We have an exact version, this is the ultimate constrained version + if constraint[0].isdigit(): + return False + try: + spec = SpecifierSet(constraint) + except InvalidSpecifier: + 
return None + spec.prereleases = prereleases + specs = [sp for sp in spec] + # We only have one version spec and it defines > or >= + if len(specs) == 1 and specs[0].operator in (">", ">="): + return True + # One specifier is exact version + if any(sp.operator in ("==") for sp in specs): + return False + has_upper = any(sp.operator in ("<", "<=") for sp in specs) + has_lower = any(sp.operator in (">", ">=") for sp in specs) + # We have a version spec that defines an upper and lower bound + if has_upper and has_lower: + return False + # Everything else, like only an upper version, only a lower version etc. + return True + + def get_model_version_range(spacy_version): """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy version. Models are always compatible across patch versions but not @@ -334,14 +359,21 @@ def get_model_meta(path): raise ValueError(Errors.E054.format(setting=setting)) if "spacy_version" in meta: if not is_compatible_version(about.__version__, meta["spacy_version"]): - warnings.warn( - Warnings.W095.format( - model=f"{meta['lang']}_{meta['name']}", - model_version=meta["version"], - version=meta["spacy_version"], - current=about.__version__, - ) + warn_msg = Warnings.W095.format( + model=f"{meta['lang']}_{meta['name']}", + model_version=meta["version"], + version=meta["spacy_version"], + current=about.__version__, ) + warnings.warn(warn_msg) + if is_unconstrained_version(meta["spacy_version"]): + warn_msg = Warnings.W094.format( + model=f"{meta['lang']}_{meta['name']}", + model_version=meta["version"], + version=meta["spacy_version"], + example=get_model_version_range(about.__version__), + ) + warnings.warn(warn_msg) return meta From c0f4a1e43b5210ea631809ea590b486bca066d25 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 12 Jun 2020 02:02:07 +0200 Subject: [PATCH 184/187] train is from-config by default (#5575) * verbose and tag_map options * adding init_tok2vec option and only changing the tok2vec that is specified * 
adding omit_extra_lookups and verifying textcat config * wip * pretrain bugfix * add replace and resume options * train_textcat fix * raw text functionality * improve UX when KeyError or when input data can't be parsed * avoid unnecessary access to goldparse in TextCat pipe * save performance information in nlp.meta * add noise_level to config * move nn_parser's defaults to config file * multitask in config - doesn't work yet * scorer offering both F and AUC options, need to be specified in config * add textcat verification code from old train script * small fixes to config files * clean up * set default config for ner/parser to allow create_pipe to work as before * two more test fixes * small fixes * cleanup * fix NER pickling + additional unit test * create_pipe as before --- examples/experiments/onto-joint/defaults.cfg | 13 +- examples/experiments/onto-joint/pretrain.cfg | 12 +- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 5 + .../ptb-joint-pos-dep/defaults.cfg | 5 + examples/training/train_textcat.py | 17 +- spacy/cli/evaluate.py | 3 +- spacy/cli/pretrain.py | 10 +- spacy/cli/train.py | 773 ------------------ spacy/cli/train_from_config.py | 262 +++++- spacy/errors.py | 9 +- spacy/gold.pyx | 12 + spacy/language.py | 13 +- spacy/ml/models/multi_task.py | 19 +- spacy/ml/models/textcat.py | 3 + .../pipeline/defaults/multitask_defaults.cfg | 15 + spacy/pipeline/pipes.pyx | 81 +- spacy/pipeline/tok2vec.py | 2 +- spacy/scorer.py | 106 +-- spacy/syntax/nn_parser.pyx | 15 +- spacy/tests/doc/test_add_entities.py | 6 +- spacy/tests/parser/test_add_label.py | 8 +- spacy/tests/parser/test_arc_eager_oracle.py | 3 +- spacy/tests/parser/test_ner.py | 9 +- spacy/tests/parser/test_neural_parser.py | 3 +- spacy/tests/parser/test_nn_beam.py | 3 +- spacy/tests/parser/test_preset_sbd.py | 3 +- spacy/tests/regression/test_issue1501-2000.py | 3 +- spacy/tests/regression/test_issue3001-3500.py | 3 +- spacy/tests/regression/test_issue3830.py | 6 +- 
spacy/tests/regression/test_issue4042.py | 3 +- spacy/tests/regression/test_issue4313.py | 3 +- spacy/tests/regression/test_issue4725.py | 20 +- .../serialize/test_serialize_pipeline.py | 3 +- spacy/util.py | 13 +- website/docs/api/scorer.md | 30 +- 35 files changed, 522 insertions(+), 972 deletions(-) delete mode 100644 spacy/cli/train.py create mode 100644 spacy/pipeline/defaults/multitask_defaults.cfg diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index 6c3a21f4b..f76336d84 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -9,6 +9,7 @@ max_length = 0 limit = 0 # Data augmentation orth_variant_level = 0.0 +noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 @@ -24,8 +25,8 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"] score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} # These settings are invalid for the transformer models. 
init_tok2vec = null -vectors = null discard_oversize = false +omit_extra_lookups = false [training.batch_size] @schedules = "compounding.v1" @@ -52,7 +53,7 @@ learn_rate = 0.001 [nlp] lang = "en" -vectors = ${training:vectors} +vectors = null [nlp.pipeline.tok2vec] factory = "tok2vec" @@ -62,12 +63,20 @@ factory = "senter" [nlp.pipeline.ner] factory = "ner" +learn_tokens = false +min_action_freq = 1 +beam_width = 1 +beam_update_prob = 1.0 [nlp.pipeline.tagger] factory = "tagger" [nlp.pipeline.parser] factory = "parser" +learn_tokens = false +min_action_freq = 1 +beam_width = 1 +beam_update_prob = 1.0 [nlp.pipeline.senter.model] @architectures = "spacy.Tagger.v1" diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 4f1898d69..40885b6e8 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -9,6 +9,7 @@ max_length = 0 limit = 0 # Data augmentation orth_variant_level = 0.0 +noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 @@ -24,7 +25,6 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"] score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} # These settings are invalid for the transformer models. 
init_tok2vec = null -vectors = null discard_oversize = false [training.batch_size] @@ -72,7 +72,7 @@ normalize = true [nlp] lang = "en" -vectors = ${training:vectors} +vectors = null [nlp.pipeline.tok2vec] factory = "tok2vec" @@ -82,12 +82,20 @@ factory = "senter" [nlp.pipeline.ner] factory = "ner" +learn_tokens = false +min_action_freq = 1 +beam_width = 1 +beam_update_prob = 1.0 [nlp.pipeline.tagger] factory = "tagger" [nlp.pipeline.parser] factory = "parser" +learn_tokens = false +min_action_freq = 1 +beam_width = 1 +beam_update_prob = 1.0 [nlp.pipeline.senter.model] @architectures = "spacy.Tagger.v1" diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index acbcc8d41..905b5b4e0 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -6,6 +6,7 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 +noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = 0 @@ -40,6 +41,10 @@ factory = "tagger" [nlp.pipeline.parser] factory = "parser" +learn_tokens = false +min_action_freq = 1 +beam_width = 1 +beam_update_prob = 1.0 [nlp.pipeline.tagger.model] @architectures = "spacy.Tagger.v1" diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index c305c015c..7383116e7 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -6,6 +6,7 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 +noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = -1 @@ -40,6 +41,10 @@ factory = "tagger" [nlp.pipeline.parser] factory = "parser" +learn_tokens = false +min_action_freq = 1 +beam_width = 1 +beam_update_prob = 1.0 [nlp.pipeline.tagger.model] @architectures = "spacy.Tagger.v1" diff --git a/examples/training/train_textcat.py 
b/examples/training/train_textcat.py index 65acadb07..c5e679467 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -120,13 +120,22 @@ def load_data(dataset, threshold, limit=0, split=0.8): random.shuffle(train_data) texts, labels = zip(*train_data) - unique_labels = sorted(set([l for label_set in labels for l in label_set])) + unique_labels = set() + for label_set in labels: + if isinstance(label_set, int) or isinstance(label_set, str): + unique_labels.add(label_set) + elif isinstance(label_set, list) or isinstance(label_set, set): + unique_labels.update(label_set) + unique_labels = sorted(unique_labels) print(f"# of unique_labels: {len(unique_labels)}") count_values_train = dict() for text, annot_list in train_data: - for annot in annot_list: - count_values_train[annot] = count_values_train.get(annot, 0) + 1 + if isinstance(annot_list, int) or isinstance(annot_list, str): + count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1 + else: + for annot in annot_list: + count_values_train[annot] = count_values_train.get(annot, 0) + 1 for value, count in sorted(count_values_train.items(), key=lambda item: item[1]): if count < threshold: unique_labels.remove(value) @@ -138,7 +147,7 @@ def load_data(dataset, threshold, limit=0, split=0.8): else: cats = [] for y in labels: - if isinstance(y, str): + if isinstance(y, str) or isinstance(y, int): cats.append({str(label): (label == y) for label in unique_labels}) elif isinstance(y, set): cats.append({str(label): (label in y) for label in unique_labels}) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 735e304f9..bae252b1c 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -54,7 +54,8 @@ def evaluate( "NER P": f"{scorer.ents_p:.2f}", "NER R": f"{scorer.ents_r:.2f}", "NER F": f"{scorer.ents_f:.2f}", - "Textcat": f"{scorer.textcat_score:.2f}", + "Textcat AUC": f"{scorer.textcat_auc:.2f}", + "Textcat F": f"{scorer.textcat_f:.2f}", "Sent P": 
f"{scorer.sent_p:.2f}", "Sent R": f"{scorer.sent_r:.2f}", "Sent F": f"{scorer.sent_f:.2f}", diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index d37426b5a..4f4707b52 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -266,17 +266,15 @@ def create_pretraining_model(nlp, tok2vec): the tok2vec input model. The tok2vec input model needs to be a model that takes a batch of Doc objects (as a list), and returns a list of arrays. Each array in the output needs to have one row per token in the doc. + The actual tok2vec layer is stored as a reference, and only this bit will be + serialized to file and read back in when calling the 'train' command. """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size) ) - # This is annoying, but the parser etc have the flatten step after - # the tok2vec. To load the weights in cleanly, we need to match - # the shape of the models' components exactly. So what we cann - # "tok2vec" has to be the same set of processes as what the components do. - tok2vec = chain(tok2vec, list2array()) - model = chain(tok2vec, output_layer) + model = chain(tok2vec, list2array()) + model = chain(model, output_layer) model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) mlm_model = build_masked_language_model(nlp.vocab, model) mlm_model.set_ref("tok2vec", tok2vec) diff --git a/spacy/cli/train.py b/spacy/cli/train.py deleted file mode 100644 index cbe977cad..000000000 --- a/spacy/cli/train.py +++ /dev/null @@ -1,773 +0,0 @@ -import os -import tqdm -from pathlib import Path -from thinc.api import use_ops -from timeit import default_timer as timer -import shutil -import srsly -from wasabi import msg -import contextlib -import random - -from ..util import create_default_optimizer -from ..util import use_gpu as set_gpu -from ..gold import GoldCorpus -from ..lookups import Lookups -from .. import util -from .. 
import about - - -def train( - # fmt: off - lang: ("Model language", "positional", None, str), - output_path: ("Output directory to store model in", "positional", None, Path), - train_path: ("Location of JSON-formatted training data", "positional", None, Path), - dev_path: ("Location of JSON-formatted development data", "positional", None, Path), - raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, - base_model: ("Name of model to update (optional)", "option", "b", str) = None, - pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner", - vectors: ("Model to load vectors from", "option", "v", str) = None, - replace_components: ("Replace components from base model", "flag", "R", bool) = False, - n_iter: ("Number of iterations", "option", "n", int) = 30, - n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None, - n_examples: ("Number of examples", "option", "ns", int) = 0, - use_gpu: ("Use GPU", "option", "g", int) = -1, - version: ("Model version", "option", "V", str) = "0.0.0", - meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None, - init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, - parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "", - entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "", - noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0, - orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0, - eval_beam_widths: ("Beam widths to evaluate, e.g. 
4,8", "option", "bw", str) = "", - gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, - learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False, - textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False, - textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow", - textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None, - tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, - omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, - verbose: ("Display more information for debug", "flag", "VV", bool) = False, - debug: ("Run data diagnostics before training", "flag", "D", bool) = False, - # fmt: on -): - """ - Train or update a spaCy model. Requires data to be formatted in spaCy's - JSON format. To convert data from other formats, use the `spacy convert` - command. - """ - util.fix_random_seed() - util.set_env_log(verbose) - - # Make sure all files and paths exists if they are needed - train_path = util.ensure_path(train_path) - dev_path = util.ensure_path(dev_path) - meta_path = util.ensure_path(meta_path) - output_path = util.ensure_path(output_path) - if raw_text is not None: - raw_text = list(srsly.read_jsonl(raw_text)) - if not train_path or not train_path.exists(): - msg.fail("Training data not found", train_path, exits=1) - if not dev_path or not dev_path.exists(): - msg.fail("Development data not found", dev_path, exits=1) - if meta_path is not None and not meta_path.exists(): - msg.fail("Can't find model meta.json", meta_path, exits=1) - meta = srsly.read_json(meta_path) if meta_path else {} - if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: - msg.warn( - "Output directory is not empty", - "This can lead to unintended side effects when saving the model. 
" - "Please use an empty directory or a different path instead. If " - "the specified output path doesn't exist, the directory will be " - "created for you.", - ) - if not output_path.exists(): - output_path.mkdir() - msg.good(f"Created output directory: {output_path}") - - tag_map = {} - if tag_map_path is not None: - tag_map = srsly.read_json(tag_map_path) - # Take dropout and batch size as generators of values -- dropout - # starts high and decays sharply, to force the optimizer to explore. - # Batch size starts at 1 and grows, so that we make updates quickly - # at the beginning of training. - dropout_rates = util.decaying( - util.env_opt("dropout_from", 0.2), - util.env_opt("dropout_to", 0.2), - util.env_opt("dropout_decay", 0.0), - ) - batch_sizes = util.compounding( - util.env_opt("batch_from", 100.0), - util.env_opt("batch_to", 1000.0), - util.env_opt("batch_compound", 1.001), - ) - - if not eval_beam_widths: - eval_beam_widths = [1] - else: - eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] - if 1 not in eval_beam_widths: - eval_beam_widths.append(1) - eval_beam_widths.sort() - has_beam_widths = eval_beam_widths != [1] - - default_dir = Path(__file__).parent.parent / "pipeline" / "defaults" - - # Set up the base model and pipeline. If a base model is specified, load - # the model and make sure the pipeline matches the pipeline setting. If - # training starts from a blank model, intitalize the language class. 
- pipeline = [p.strip() for p in pipeline.split(",")] - msg.text(f"Training pipeline: {pipeline}") - disabled_pipes = None - pipes_added = False - if use_gpu >= 0: - activated_gpu = None - try: - activated_gpu = set_gpu(use_gpu) - except Exception as e: - msg.warn(f"Exception: {e}") - if activated_gpu is not None: - msg.text(f"Using GPU: {use_gpu}") - else: - msg.warn(f"Unable to activate GPU: {use_gpu}") - msg.text("Using CPU only") - use_gpu = -1 - if base_model: - msg.text(f"Starting with base model '{base_model}'") - nlp = util.load_model(base_model) - if nlp.lang != lang: - msg.fail( - f"Model language ('{nlp.lang}') doesn't match language " - f"specified as `lang` argument ('{lang}') ", - exits=1, - ) - if vectors: - msg.text(f"Loading vectors from model '{vectors}'") - _load_vectors(nlp, vectors) - - nlp.select_pipes(disable=[p for p in nlp.pipe_names if p not in pipeline]) - for pipe in pipeline: - # first, create the model. - # Bit of a hack after the refactor to get the vectors into a default config - # use train-from-config instead :-) - if pipe == "parser": - config_loc = default_dir / "parser_defaults.cfg" - elif pipe == "tagger": - config_loc = default_dir / "tagger_defaults.cfg" - elif pipe == "ner": - config_loc = default_dir / "ner_defaults.cfg" - elif pipe == "textcat": - config_loc = default_dir / "textcat_defaults.cfg" - elif pipe == "senter": - config_loc = default_dir / "senter_defaults.cfg" - else: - raise ValueError(f"Component {pipe} currently not supported.") - pipe_cfg = util.load_config(config_loc, create_objects=False) - if vectors: - pretrained_config = { - "@architectures": "spacy.VocabVectors.v1", - "name": vectors, - } - pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config - - if pipe == "parser": - pipe_cfg["learn_tokens"] = learn_tokens - elif pipe == "textcat": - pipe_cfg["exclusive_classes"] = not textcat_multilabel - pipe_cfg["architecture"] = textcat_arch - pipe_cfg["positive_label"] = textcat_positive_label 
- - if pipe not in nlp.pipe_names: - msg.text(f"Adding component to base model '{pipe}'") - nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) - pipes_added = True - elif replace_components: - msg.text(f"Replacing component from base model '{pipe}'") - nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg)) - pipes_added = True - else: - if pipe == "textcat": - textcat_cfg = nlp.get_pipe("textcat").cfg - base_cfg = { - "exclusive_classes": textcat_cfg["exclusive_classes"], - "architecture": textcat_cfg["architecture"], - "positive_label": textcat_cfg["positive_label"], - } - if base_cfg != pipe_cfg: - msg.fail( - f"The base textcat model configuration does" - f"not match the provided training options. " - f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}", - exits=1, - ) - msg.text(f"Extending component from base model '{pipe}'") - disabled_pipes = nlp.select_pipes( - disable=[p for p in nlp.pipe_names if p not in pipeline] - ) - else: - msg.text(f"Starting with blank model '{lang}'") - lang_cls = util.get_lang_class(lang) - nlp = lang_cls() - - if vectors: - msg.text(f"Loading vectors from model '{vectors}'") - _load_vectors(nlp, vectors) - - for pipe in pipeline: - # first, create the model. 
- # Bit of a hack after the refactor to get the vectors into a default config - # use train-from-config instead :-) - if pipe == "parser": - config_loc = default_dir / "parser_defaults.cfg" - elif pipe == "tagger": - config_loc = default_dir / "tagger_defaults.cfg" - elif pipe == "morphologizer": - config_loc = default_dir / "morphologizer_defaults.cfg" - elif pipe == "ner": - config_loc = default_dir / "ner_defaults.cfg" - elif pipe == "textcat": - config_loc = default_dir / "textcat_defaults.cfg" - elif pipe == "senter": - config_loc = default_dir / "senter_defaults.cfg" - else: - raise ValueError(f"Component {pipe} currently not supported.") - pipe_cfg = util.load_config(config_loc, create_objects=False) - if vectors: - pretrained_config = { - "@architectures": "spacy.VocabVectors.v1", - "name": vectors, - } - pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config - - if pipe == "parser": - pipe_cfg["learn_tokens"] = learn_tokens - elif pipe == "textcat": - pipe_cfg["exclusive_classes"] = not textcat_multilabel - pipe_cfg["architecture"] = textcat_arch - pipe_cfg["positive_label"] = textcat_positive_label - - pipe = nlp.create_pipe(pipe, config=pipe_cfg) - nlp.add_pipe(pipe) - - # Update tag map with provided mapping - nlp.vocab.morphology.tag_map.update(tag_map) - - # Create empty extra lexeme tables so the data from spacy-lookups-data - # isn't loaded if these features are accessed - if omit_extra_lookups: - nlp.vocab.lookups_extra = Lookups() - nlp.vocab.lookups_extra.add_table("lexeme_cluster") - nlp.vocab.lookups_extra.add_table("lexeme_prob") - nlp.vocab.lookups_extra.add_table("lexeme_settings") - - if vectors: - msg.text("Loading vector from model '{}'".format(vectors)) - _load_vectors(nlp, vectors) - - # Multitask objectives - multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] - for pipe_name, multitasks in multitask_options: - if multitasks: - if pipe_name not in pipeline: - msg.fail( - f"Can't use multitask 
objective without '{pipe_name}' in " - f"the pipeline" - ) - pipe = nlp.get_pipe(pipe_name) - for objective in multitasks.split(","): - pipe.add_multitask_objective(objective) - - # Prepare training corpus - msg.text(f"Counting training words (limit={n_examples})") - corpus = GoldCorpus(train_path, dev_path, limit=n_examples) - n_train_words = corpus.count_train() - - if base_model and not pipes_added: - # Start with an existing model, use default optimizer - optimizer = create_default_optimizer() - else: - # Start with a blank model, call begin_training - cfg = {"device": use_gpu} - optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg) - nlp._optimizer = None - - # Load in pretrained weights (TODO: this may be broken in the config rewrite) - if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text(f"Loaded pretrained tok2vec for: {components}") - - # Verify textcat config - if "textcat" in pipeline: - textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) - if textcat_positive_label and textcat_positive_label not in textcat_labels: - msg.fail( - f"The textcat_positive_label (tpl) '{textcat_positive_label}' " - f"does not match any label in the training data.", - exits=1, - ) - if textcat_positive_label and len(textcat_labels) != 2: - msg.fail( - "A textcat_positive_label (tpl) '{textcat_positive_label}' was " - "provided for training data that does not appear to be a " - "binary classification problem with two labels.", - exits=1, - ) - train_data = corpus.train_data( - nlp, - noise_level=noise_level, - gold_preproc=gold_preproc, - max_length=0, - ignore_misaligned=True, - ) - train_labels = set() - if textcat_multilabel: - multilabel_found = False - for ex in train_data: - train_labels.update(ex.gold.cats.keys()) - if list(ex.gold.cats.values()).count(1.0) != 1: - multilabel_found = True - if not multilabel_found and not base_model: - msg.warn( - "The textcat training instances look like they have " 
- "mutually-exclusive classes. Remove the flag " - "'--textcat-multilabel' to train a classifier with " - "mutually-exclusive classes." - ) - if not textcat_multilabel: - for ex in train_data: - train_labels.update(ex.gold.cats.keys()) - if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model: - msg.warn( - "Some textcat training instances do not have exactly " - "one positive label. Modifying training options to " - "include the flag '--textcat-multilabel' for classes " - "that are not mutually exclusive." - ) - nlp.get_pipe("textcat").cfg["exclusive_classes"] = False - textcat_multilabel = True - break - if base_model and set(textcat_labels) != train_labels: - msg.fail( - f"Cannot extend textcat model using data with different " - f"labels. Base model labels: {textcat_labels}, training data " - f"labels: {list(train_labels)}", - exits=1, - ) - if textcat_multilabel: - msg.text( - f"Textcat evaluation score: ROC AUC score macro-averaged across " - f"the labels '{', '.join(textcat_labels)}'" - ) - elif textcat_positive_label and len(textcat_labels) == 2: - msg.text( - f"Textcat evaluation score: F1-score for the " - f"label '{textcat_positive_label}'" - ) - elif len(textcat_labels) > 1: - if len(textcat_labels) == 2: - msg.warn( - "If the textcat component is a binary classifier with " - "exclusive classes, provide '--textcat-positive-label' for " - "an evaluation on the positive class." - ) - msg.text( - f"Textcat evaluation score: F1-score macro-averaged across " - f"the labels '{', '.join(textcat_labels)}'" - ) - else: - msg.fail( - "Unsupported textcat configuration. Use `spacy debug-data` " - "for more information." 
- ) - - # fmt: off - row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths) - row_widths = [len(w) for w in row_head] - row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2} - # fmt: on - print("") - msg.row(row_head, **row_settings) - msg.row(["-" * width for width in row_settings["widths"]], **row_settings) - try: - iter_since_best = 0 - best_score = 0.0 - for i in range(n_iter): - train_data = corpus.train_dataset( - nlp, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - gold_preproc=gold_preproc, - max_length=0, - ignore_misaligned=True, - ) - if raw_text: - random.shuffle(raw_text) - raw_batches = util.minibatch( - (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 - ) - words_seen = 0 - with tqdm.tqdm(total=n_train_words, leave=False) as pbar: - losses = {} - for batch in util.minibatch_by_words(train_data, size=batch_sizes): - if not batch: - continue - try: - nlp.update( - batch, - sgd=optimizer, - drop=next(dropout_rates), - losses=losses, - ) - except ValueError as e: - err = "Error during training" - if init_tok2vec: - err += " Did you provide the same parameters during 'train' as during 'pretrain'?" - msg.fail(err, f"Original error message: {e}", exits=1) - if raw_text: - # If raw text is available, perform 'rehearsal' updates, - # which use unlabelled data to reduce overfitting. 
- raw_batch = list(next(raw_batches)) - nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) - docs = [ex.doc for ex in batch] - if not int(os.environ.get("LOG_FRIENDLY", 0)): - pbar.update(sum(len(doc) for doc in docs)) - words_seen += sum(len(doc) for doc in docs) - with nlp.use_params(optimizer.averages): - util.set_env_log(False) - epoch_model_path = output_path / f"model{i}" - nlp.to_disk(epoch_model_path) - nlp_loaded = util.load_model_from_path(epoch_model_path) - for beam_width in eval_beam_widths: - for name, component in nlp_loaded.pipeline: - if hasattr(component, "cfg"): - component.cfg["beam_width"] = beam_width - dev_dataset = list( - corpus.dev_dataset( - nlp_loaded, - gold_preproc=gold_preproc, - ignore_misaligned=True, - ) - ) - nwords = sum(len(ex.doc) for ex in dev_dataset) - start_time = timer() - scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) - end_time = timer() - if use_gpu < 0: - gpu_wps = None - cpu_wps = nwords / (end_time - start_time) - else: - gpu_wps = nwords / (end_time - start_time) - # Evaluate on CPU in the first iteration only (for - # timing) when GPU is enabled - if i == 0: - with use_ops("numpy"): - nlp_loaded = util.load_model_from_path(epoch_model_path) - for name, component in nlp_loaded.pipeline: - if hasattr(component, "cfg"): - component.cfg["beam_width"] = beam_width - dev_dataset = list( - corpus.dev_dataset( - nlp_loaded, - gold_preproc=gold_preproc, - ignore_misaligned=True, - ) - ) - start_time = timer() - scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) - end_time = timer() - cpu_wps = nwords / (end_time - start_time) - acc_loc = output_path / f"model{i}" / "accuracy.json" - srsly.write_json(acc_loc, scorer.scores) - - # Update model meta.json - meta["lang"] = nlp.lang - meta["pipeline"] = nlp.pipe_names - if beam_width == 1: - meta["speed"] = { - "nwords": nwords, - "cpu": cpu_wps, - "gpu": gpu_wps, - } - meta.setdefault("accuracy", {}) - for component in nlp.pipe_names: - for metric in 
_get_metrics(component): - meta["accuracy"][metric] = scorer.scores[metric] - else: - meta.setdefault("beam_accuracy", {}) - meta.setdefault("beam_speed", {}) - for component in nlp.pipe_names: - for metric in _get_metrics(component): - meta["beam_accuracy"][metric] = scorer.scores[metric] - meta["beam_speed"][beam_width] = { - "nwords": nwords, - "cpu": cpu_wps, - "gpu": gpu_wps, - } - meta["vectors"] = { - "width": nlp.vocab.vectors_length, - "vectors": len(nlp.vocab.vectors), - "keys": nlp.vocab.vectors.n_keys, - "name": nlp.vocab.vectors.name, - } - meta.setdefault("name", f"model{i}") - meta.setdefault("version", version) - meta["labels"] = nlp.meta["labels"] - meta_loc = output_path / f"model{i}" / "meta.json" - srsly.write_json(meta_loc, meta) - util.set_env_log(verbose) - - progress = _get_progress( - i, - losses, - scorer.scores, - output_stats, - beam_width=beam_width if has_beam_widths else None, - cpu_wps=cpu_wps, - gpu_wps=gpu_wps, - ) - if i == 0 and "textcat" in pipeline: - textcats_per_cat = scorer.scores.get("textcats_per_cat", {}) - for cat, cat_score in textcats_per_cat.items(): - if cat_score.get("roc_auc_score", 0) < 0: - msg.warn( - f"Textcat ROC AUC score is undefined due to " - f"only one value in label '{cat}'." - ) - msg.row(progress, **row_settings) - # Early stopping - if n_early_stopping is not None: - current_score = _score_for_model(meta) - if current_score < best_score: - iter_since_best += 1 - else: - iter_since_best = 0 - best_score = current_score - if iter_since_best >= n_early_stopping: - msg.text( - f"Early stopping, best iteration is: {i - iter_since_best}" - ) - msg.text( - f"Best score = {best_score}; Final iteration score = {current_score}" - ) - break - except Exception as e: - msg.warn(f"Aborting and saving final best model. 
Encountered exception: {e}", exits=1) - finally: - best_pipes = nlp.pipe_names - if disabled_pipes: - disabled_pipes.restore() - with nlp.use_params(optimizer.averages): - final_model_path = output_path / "model-final" - nlp.to_disk(final_model_path) - meta_loc = output_path / "model-final" / "meta.json" - final_meta = srsly.read_json(meta_loc) - final_meta.setdefault("accuracy", {}) - final_meta["accuracy"].update(meta.get("accuracy", {})) - final_meta.setdefault("speed", {}) - final_meta["speed"].setdefault("cpu", None) - final_meta["speed"].setdefault("gpu", None) - meta.setdefault("speed", {}) - meta["speed"].setdefault("cpu", None) - meta["speed"].setdefault("gpu", None) - # combine cpu and gpu speeds with the base model speeds - if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: - speed = _get_total_speed( - [final_meta["speed"]["cpu"], meta["speed"]["cpu"]] - ) - final_meta["speed"]["cpu"] = speed - if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: - speed = _get_total_speed( - [final_meta["speed"]["gpu"], meta["speed"]["gpu"]] - ) - final_meta["speed"]["gpu"] = speed - # if there were no speeds to update, overwrite with meta - if ( - final_meta["speed"]["cpu"] is None - and final_meta["speed"]["gpu"] is None - ): - final_meta["speed"].update(meta["speed"]) - # note: beam speeds are not combined with the base model - if has_beam_widths: - final_meta.setdefault("beam_accuracy", {}) - final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {})) - final_meta.setdefault("beam_speed", {}) - final_meta["beam_speed"].update(meta.get("beam_speed", {})) - srsly.write_json(meta_loc, final_meta) - msg.good("Saved model to output directory", final_model_path) - with msg.loading("Creating best model..."): - best_model_path = _collate_best_model(final_meta, output_path, best_pipes) - msg.good("Created best model", best_model_path) - - -def _score_for_model(meta): - """ Returns mean score between tasks in pipeline that can be used for early stopping. 
""" - mean_acc = list() - pipes = meta["pipeline"] - acc = meta["accuracy"] - if "tagger" in pipes: - mean_acc.append(acc["tags_acc"]) - if "morphologizer" in pipes: - mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2) - if "parser" in pipes: - mean_acc.append((acc["uas"] + acc["las"]) / 2) - if "ner" in pipes: - mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3) - if "textcat" in pipes: - mean_acc.append(acc["textcat_score"]) - if "senter" in pipes: - mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3) - return sum(mean_acc) / len(mean_acc) - - -@contextlib.contextmanager -def _create_progress_bar(total): - if int(os.environ.get("LOG_FRIENDLY", 0)): - yield - else: - pbar = tqdm.tqdm(total=total, leave=False) - yield pbar - - -def _load_vectors(nlp, vectors): - util.load_model(vectors, vocab=nlp.vocab) - - -def _load_pretrained_tok2vec(nlp, loc): - """Load pretrained weights for the 'token-to-vector' part of the component - models, which is typically a CNN. See 'spacy pretrain'. Experimental. 
- """ - with loc.open("rb") as file_: - weights_data = file_.read() - loaded = [] - for name, component in nlp.pipeline: - if hasattr(component, "model") and component.model.has_ref("tok2vec"): - component.get_ref("tok2vec").from_bytes(weights_data) - loaded.append(name) - return loaded - - -def _collate_best_model(meta, output_path, components): - bests = {} - meta.setdefault("accuracy", {}) - for component in components: - bests[component] = _find_best(output_path, component) - best_dest = output_path / "model-best" - shutil.copytree(str(output_path / "model-final"), str(best_dest)) - for component, best_component_src in bests.items(): - shutil.rmtree(str(best_dest / component)) - shutil.copytree(str(best_component_src / component), str(best_dest / component)) - accs = srsly.read_json(best_component_src / "accuracy.json") - for metric in _get_metrics(component): - meta["accuracy"][metric] = accs[metric] - srsly.write_json(best_dest / "meta.json", meta) - return best_dest - - -def _find_best(experiment_dir, component): - accuracies = [] - for epoch_model in experiment_dir.iterdir(): - if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": - accs = srsly.read_json(epoch_model / "accuracy.json") - scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)] - # remove per_type dicts from score list for max() comparison - scores = [score for score in scores if isinstance(score, float)] - accuracies.append((scores, epoch_model)) - if accuracies: - return max(accuracies)[1] - else: - return None - - -def _get_metrics(component): - if component == "parser": - return ("las", "uas", "las_per_type", "sent_f", "token_acc") - elif component == "tagger": - return ("tags_acc", "token_acc") - elif component == "morphologizer": - return ("morphs_acc", "pos_acc", "token_acc") - elif component == "ner": - return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc") - elif component == "senter": - return ("sent_f", "sent_p", "sent_r", "token_acc") - 
elif component == "textcat": - return ("textcat_score", "token_acc") - return ("token_acc",) - - -def _configure_training_output(pipeline, use_gpu, has_beam_widths): - row_head = ["Itn"] - output_stats = [] - for pipe in pipeline: - if pipe == "tagger": - row_head.extend(["Tag Loss ", " Tag % "]) - output_stats.extend(["tag_loss", "tags_acc"]) - elif pipe == "morphologizer" or pipe == "morphologizertagger": - row_head.extend(["Morph Loss ", " Morph % ", " POS % "]) - output_stats.extend(["morph_loss", "morphs_acc", "pos_acc"]) - elif pipe == "parser": - row_head.extend( - ["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"] - ) - output_stats.extend( - ["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"] - ) - elif pipe == "ner": - row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "]) - output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"]) - elif pipe == "textcat": - row_head.extend(["Textcat Loss", "Textcat"]) - output_stats.extend(["textcat_loss", "textcat_score"]) - elif pipe == "senter": - row_head.extend(["Senter Loss", "Sent P", "Sent R", "Sent F"]) - output_stats.extend(["senter_loss", "sent_p", "sent_r", "sent_f"]) - row_head.extend(["Token %", "CPU WPS"]) - output_stats.extend(["token_acc", "cpu_wps"]) - - if use_gpu >= 0: - row_head.extend(["GPU WPS"]) - output_stats.extend(["gpu_wps"]) - - if has_beam_widths: - row_head.insert(1, "Beam W.") - # remove duplicates - row_head_dict = {k: 1 for k in row_head} - output_stats_dict = {k: 1 for k in output_stats} - return row_head_dict.keys(), output_stats_dict.keys() - - -def _get_progress( - itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0 -): - scores = {} - for stat in output_stats: - scores[stat] = 0.0 - scores["dep_loss"] = losses.get("parser", 0.0) - scores["ner_loss"] = losses.get("ner", 0.0) - scores["tag_loss"] = losses.get("tagger", 0.0) - scores["morph_loss"] = losses.get("morphologizer", 0.0) - scores["textcat_loss"] = 
losses.get("textcat", 0.0) - scores["senter_loss"] = losses.get("senter", 0.0) - scores["cpu_wps"] = cpu_wps - scores["gpu_wps"] = gpu_wps or 0.0 - scores.update(dev_scores) - formatted_scores = [] - for stat in output_stats: - format_spec = "{:.3f}" - if stat.endswith("_wps"): - format_spec = "{:.0f}" - formatted_scores.append(format_spec.format(scores[stat])) - result = [itn + 1] - result.extend(formatted_scores) - if beam_width is not None: - result.insert(1, beam_width) - return result - - -def _get_total_speed(speeds): - seconds_per_word = 0.0 - for words_per_second in speeds: - if words_per_second is None: - return None - seconds_per_word += 1.0 / words_per_second - return 1.0 / seconds_per_word diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index a6d0a0abc..ec099b294 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,5 +1,7 @@ from typing import Optional, Dict, List, Union, Sequence from timeit import default_timer as timer + +import srsly from pydantic import BaseModel, FilePath import plac import tqdm @@ -11,9 +13,10 @@ from thinc.api import Model, use_pytorch_for_gpu_memory import random from ..gold import GoldCorpus +from ..lookups import Lookups from .. 
import util from ..errors import Errors -from ..ml import models # don't remove - required to load the built-in architectures +from ..ml import models # don't remove - required to load the built-in architectures registry = util.registry @@ -23,7 +26,6 @@ patience = 10 eval_frequency = 10 dropout = 0.2 init_tok2vec = null -vectors = null max_epochs = 100 orth_variant_level = 0.0 gold_preproc = false @@ -47,7 +49,7 @@ beta2 = 0.999 [nlp] lang = "en" -vectors = ${training:vectors} +vectors = null [nlp.pipeline.tok2vec] factory = "tok2vec" @@ -93,7 +95,6 @@ class ConfigSchema(BaseModel): eval_frequency: int = 100 dropout: float = 0.2 init_tok2vec: Optional[FilePath] = None - vectors: Optional[str] = None max_epochs: int = 100 orth_variant_level: float = 0.0 gold_preproc: bool = False @@ -119,9 +120,14 @@ class ConfigSchema(BaseModel): dev_path=("Location of JSON-formatted development data", "positional", None, Path), config_path=("Path to config file", "positional", None, Path), output_path=("Output directory to store model in", "option", "o", Path), - meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), + init_tok2vec=( + "Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", + Path), raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), + verbose=("Display more information for debugging purposes", "flag", "VV", bool), use_gpu=("Use GPU", "option", "g", int), + tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), + omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), # fmt: on ) def train_cli( @@ -129,30 +135,53 @@ def train_cli( dev_path, config_path, output_path=None, - meta_path=None, + init_tok2vec=None, raw_text=None, - debug=False, verbose=False, use_gpu=-1, + tag_map_path=None, + omit_extra_lookups=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. 
To convert data from other formats, use the `spacy convert` command. """ + util.set_env_log(verbose) + + # Make sure all files and paths exists if they are needed if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) - if meta_path is not None and not meta_path.exists(): - msg.fail("Can't find model meta.json", meta_path, exits=1) if output_path is not None and not output_path.exists(): output_path.mkdir() + msg.good(f"Created output directory: {output_path}") + elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: + msg.warn( + "Output directory is not empty.", + "This can lead to unintended side effects when saving the model. " + "Please use an empty directory or a different path instead. If " + "the specified output path doesn't exist, the directory will be " + "created for you.", + ) + if raw_text is not None: + raw_text = list(srsly.read_jsonl(raw_text)) + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) + + weights_data = None + if init_tok2vec is not None: + if not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + with init_tok2vec.open("rb") as file_: + weights_data = file_.read() if use_gpu >= 0: - msg.info("Using GPU") + msg.info("Using GPU: {use_gpu}") util.use_gpu(use_gpu) else: msg.info("Using CPU") @@ -161,13 +190,21 @@ def train_cli( config_path, {"train": train_path, "dev": dev_path}, output_path=output_path, - meta_path=meta_path, raw_text=raw_text, + tag_map=tag_map, + weights_data=weights_data, + omit_extra_lookups=omit_extra_lookups, ) def train( - config_path, data_paths, raw_text=None, meta_path=None, output_path=None, + config_path, + data_paths, + raw_text=None, + output_path=None, + 
tag_map=None, + weights_data=None, + omit_extra_lookups=False, ): msg.info(f"Loading config from: {config_path}") # Read the config first without creating objects, to get to the original nlp_config @@ -177,15 +214,104 @@ def train( use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) + training = config["training"] msg.info("Creating nlp from config") nlp = util.load_model_from_config(nlp_config) - training = config["training"] optimizer = training["optimizer"] limit = training["limit"] msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) - msg.info("Initializing the nlp pipeline") - nlp.begin_training(lambda: corpus.train_examples) + + # verify textcat config + if "textcat" in nlp_config["pipeline"]: + textcat_labels = set(nlp.get_pipe("textcat").labels) + textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] + + # check whether the setting 'exclusive_classes' corresponds to the provided training data + if textcat_multilabel: + multilabel_found = False + for ex in corpus.train_examples: + cats = ex.doc_annotation.cats + textcat_labels.update(cats.keys()) + if list(cats.values()).count(1.0) != 1: + multilabel_found = True + if not multilabel_found: + msg.warn( + "The textcat training instances look like they have " + "mutually exclusive classes. Set 'exclusive_classes' " + "to 'true' in the config to train a classifier with " + "mutually exclusive classes more accurately." + ) + else: + for ex in corpus.train_examples: + cats = ex.doc_annotation.cats + textcat_labels.update(cats.keys()) + if list(cats.values()).count(1.0) != 1: + msg.fail( + "Some textcat training instances do not have exactly " + "one positive label. Set 'exclusive_classes' " + "to 'false' in the config to train a classifier with classes " + "that are not mutually exclusive." 
+ ) + msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") + nlp.get_pipe("textcat").labels = tuple(textcat_labels) + + # if 'positive_label' is provided: double check whether it's in the data and the task is binary + if nlp_config["pipeline"]["textcat"].get("positive_label", None): + textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) + pos_label = nlp_config["pipeline"]["textcat"]["positive_label"] + if pos_label not in textcat_labels: + msg.fail( + f"The textcat's 'positive_label' config setting '{pos_label}' " + f"does not match any label in the training data.", + exits=1, + ) + if len(textcat_labels) != 2: + msg.fail( + f"A textcat 'positive_label' '{pos_label}' was " + f"provided for training data that does not appear to be a " + f"binary classification problem with two labels.", + exits=1, + ) + + if training.get("resume", False): + msg.info("Resuming training") + nlp.resume_training() + else: + msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") + nlp.begin_training( + lambda: corpus.train_examples + ) + + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) + + # Create empty extra lexeme tables so the data from spacy-lookups-data + # isn't loaded if these features are accessed + if omit_extra_lookups: + nlp.vocab.lookups_extra = Lookups() + nlp.vocab.lookups_extra.add_table("lexeme_cluster") + nlp.vocab.lookups_extra.add_table("lexeme_prob") + nlp.vocab.lookups_extra.add_table("lexeme_settings") + + # Load a pretrained tok2vec model - cf. 
CLI command 'pretrain' + if weights_data is not None: + tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None) + if tok2vec_path is None: + msg.fail( + f"To use a pretrained tok2vec model, the config needs to specify which " + f"tok2vec layer to load in the setting [pretraining.tok2vec_model].", + exits=1, + ) + tok2vec = config + for subpath in tok2vec_path.split("."): + tok2vec = tok2vec.get(subpath) + if not tok2vec: + msg.fail( + f"Could not locate the tok2vec model at {tok2vec_path}.", + exits=1, + ) + tok2vec.from_bytes(weights_data) train_batches = create_train_batches(nlp, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) @@ -202,6 +328,7 @@ def train( patience=training.get("patience", 0), max_steps=training.get("max_steps", 0), eval_frequency=training["eval_frequency"], + raw_text=raw_text, ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") @@ -215,7 +342,8 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - nlp.to_disk(output_path) + update_meta(training, nlp, info) + nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) # Clean up the objects to faciliate garbage collection. for eg in batch: @@ -223,6 +351,12 @@ def train( eg.goldparse = None eg.doc_annotation = None eg.token_annotation = None + except Exception as e: + msg.warn( + f"Aborting and saving the final best model. 
" + f"Encountered exception: {str(e)}", + exits=1, + ) finally: if output_path is not None: final_model_path = output_path / "model-final" @@ -231,24 +365,30 @@ def train( nlp.to_disk(final_model_path) else: nlp.to_disk(final_model_path) - msg.good("Saved model to output directory", final_model_path) + msg.good(f"Saved model to output directory {final_model_path}") def create_train_batches(nlp, corpus, cfg): epochs_todo = cfg.get("max_epochs", 0) while True: - train_examples = list(corpus.train_dataset( - nlp, - noise_level=0.0, - orth_variant_level=cfg["orth_variant_level"], - gold_preproc=cfg["gold_preproc"], - max_length=cfg["max_length"], - ignore_misaligned=True, - )) + train_examples = list( + corpus.train_dataset( + nlp, + noise_level=cfg["noise_level"], + orth_variant_level=cfg["orth_variant_level"], + gold_preproc=cfg["gold_preproc"], + max_length=cfg["max_length"], + ignore_misaligned=True, + ) + ) if len(train_examples) == 0: raise ValueError(Errors.E988) random.shuffle(train_examples) - batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"]) + batches = util.minibatch_by_words( + train_examples, + size=cfg["batch_size"], + discard_oversize=cfg["discard_oversize"], + ) # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop try: first = next(batches) @@ -273,7 +413,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): ) n_words = sum(len(ex.doc) for ex in dev_examples) start_time = timer() - + if optimizer.averages: with nlp.use_params(optimizer.averages): scorer = nlp.evaluate(dev_examples, batch_size=32) @@ -284,7 +424,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): scores = scorer.scores # Calculate a weighted sum based on score_weights for the main score weights = cfg["score_weights"] - weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) + try: + weighted_score = sum(scores[s] * weights.get(s, 0.0) for 
s in weights) + except KeyError as e: + raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys()))) + scores["speed"] = wps return weighted_score, scores @@ -292,8 +436,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): def train_while_improving( - nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency, - accumulate_gradient=1, patience=0, max_steps=0 + nlp, + optimizer, + train_data, + evaluate, + *, + dropout, + eval_frequency, + accumulate_gradient=1, + patience=0, + max_steps=0, + raw_text=None, ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -341,11 +494,22 @@ def train_while_improving( losses = {} to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")] + if raw_text: + random.shuffle(raw_text) + raw_batches = util.minibatch( + (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 + ) + for step, batch in enumerate(train_data): dropout = next(dropouts) with nlp.select_pipes(enable=to_enable): for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) + if raw_text: + # If raw text is available, perform 'rehearsal' updates, + # which use unlabelled data to reduce overfitting. 
+ raw_batch = list(next(raw_batches)) + nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) for name, proc in nlp.pipeline: if hasattr(proc, "model"): proc.model.finish_update(optimizer) @@ -386,7 +550,7 @@ def subdivide_batch(batch, accumulate_gradient): if subbatch: yield subbatch start += len(subbatch) - subbatch = batch[start : ] + subbatch = batch[start:] if subbatch: yield subbatch @@ -405,14 +569,34 @@ def setup_printer(training, nlp): msg.row(["-" * width for width in table_widths]) def print_row(info): - losses = [ - "{0:.2f}".format(float(info["losses"].get(pipe_name, 0.0))) - for pipe_name in nlp.pipe_names - ] - scores = [ - "{0:.2f}".format(float(info["other_scores"].get(col, 0.0))) for col in score_cols - ] - data = [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] + try: + losses = [ + "{0:.2f}".format(float(info["losses"][pipe_name])) + for pipe_name in nlp.pipe_names + ] + except KeyError as e: + raise KeyError( + Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) + + try: + scores = [ + "{0:.2f}".format(float(info["other_scores"][col])) + for col in score_cols + ] + except KeyError as e: + raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) + data = ( + [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] + ) msg.row(data, widths=table_widths, aligns=table_aligns) return print_row + + +def update_meta(training, nlp, info): + score_cols = training["scores"] + nlp.meta["performance"] = {} + for metric in score_cols: + nlp.meta["performance"][metric] = info["other_scores"][metric] + for pipe_name in nlp.pipe_names: + nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] diff --git a/spacy/errors.py b/spacy/errors.py index 94a0218a7..d6fdd1b43 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -580,7 +580,14 @@ class Errors(object): "table, which contains {n_rows} vectors.") 
# TODO: fix numbering after merging develop into master - + E983 = ("Invalid key for '{dict_name}': {key}. Available keys: " + "{keys}") + E984 = ("Could not parse the {input} - double check the data is written " + "in the correct format as expected by spaCy.") + E985 = ("The pipeline component '{component}' is already available in the base " + "model. The settings in the component block in the config file are " + "being ignored. If you want to replace this component instead, set " + "'replace' to True in the training configuration.") E986 = ("Could not create any training batches: check your input. " "Perhaps discard_oversize should be set to False ?") E987 = ("The text of an example training instance is either a Doc or " diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1e58f0635..19b135193 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -229,6 +229,10 @@ class GoldCorpus(object): if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)): raise ValueError(Errors.E987.format(type=type(doc))) examples.append(Example.from_dict(ex_dict, doc=doc)) + else: + raise ValueError(Errors.E984.format(input="JSONL format")) + else: + raise ValueError(Errors.E984.format(input="JSONL format")) elif file_name.endswith("msg"): text, ex_dict = srsly.read_msgpack(loc) @@ -550,14 +554,22 @@ def json_to_examples(doc): def read_json_file(loc, docs_filter=None, limit=None): loc = util.ensure_path(loc) if loc.is_dir(): + parsed = False for filename in loc.iterdir(): + parsed = True yield from read_json_file(loc / filename, limit=limit) + if not parsed: + raise ValueError(Errors.E984.format(input="JSON directory")) else: + parsed = False for doc in _json_iterate(loc): if docs_filter is not None and not docs_filter(doc): continue for json_data in json_to_examples(doc): + parsed = True yield json_data + if not parsed: + raise ValueError(Errors.E984.format(input="JSON file")) def _json_iterate(loc): diff --git a/spacy/language.py b/spacy/language.py index 6341dc858..97bdd698c 
100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -319,14 +319,14 @@ class Language(object): # transform the model's config to an actual Model factory_cfg = dict(config) - # check whether we have a proper model config, or load a default one + # check whether we have a proper model config, ignore if the type is wrong if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict): warnings.warn( Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name) ) # refer to the model configuration in the cfg settings for this component - if "model" in factory_cfg: + elif "model" in factory_cfg: self.config[name] = {"model": factory_cfg["model"]} # create all objects in the config @@ -1086,6 +1086,7 @@ class component(object): requires=tuple(), retokenizes=False, default_model=lambda: None, + default_config=None, ): """Decorate a pipeline component. @@ -1099,6 +1100,7 @@ class component(object): self.requires = validate_attrs(requires) self.retokenizes = retokenizes self.default_model = default_model + self.default_config = default_config def __call__(self, *args, **kwargs): obj = args[0] @@ -1113,9 +1115,10 @@ class component(object): def factory(nlp, model, **cfg): if model is None: model = self.default_model() - warnings.warn(Warnings.W098.format(name=self.name)) - if model is None: - warnings.warn(Warnings.W097.format(name=self.name)) + if self.default_config: + for key, value in self.default_config.items(): + if key not in cfg: + cfg[key] = value if hasattr(obj, "from_nlp"): return obj.from_nlp(nlp, model, **cfg) elif isinstance(obj, type): diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 8000d1aff..4a360a9e6 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -3,26 +3,31 @@ import numpy from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model -def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96): +def build_multi_task_model(tok2vec, 
maxout_pieces, token_vector_width, nO=None): + softmax = Softmax(nO=nO, nI=token_vector_width * 2) model = chain( tok2vec, - Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=3, dropout=0.0), + Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0), LayerNorm(token_vector_width * 2), - Softmax(nO=n_tags, nI=token_vector_width * 2), + softmax, ) + model.set_ref("tok2vec", tok2vec) + model.set_ref("output_layer", softmax) return model -def build_cloze_multi_task_model(vocab, tok2vec): - output_size = vocab.vectors.data.shape[1] +def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None): + # nO = vocab.vectors.data.shape[1] output_layer = chain( Maxout( - nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0 + nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0 ), - Linear(nO=output_size, nI=output_size, init_W=zero_init), + Linear(nO=nO, nI=nO, init_W=zero_init), ) model = chain(tok2vec, output_layer) model = build_masked_language_model(vocab, model) + model.set_ref("tok2vec", tok2vec) + model.set_ref("output_layer", output_layer) return model diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 141c66f79..a02e1a5a1 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -31,6 +31,7 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None): model.set_ref("output_layer", linear_layer) model.set_ref("tok2vec", tok2vec) model.set_dim("nO", nO) + model.attrs["multi_label"] = not exclusive_classes return model @@ -44,6 +45,7 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO output_layer = softmax_activation() if exclusive_classes else Logistic() model = model >> with_cpu(output_layer, output_layer.ops) model.set_ref("output_layer", sparse_linear) + model.attrs["multi_label"] = not exclusive_classes return model @@ -110,6 +112,7 @@ def build_text_classifier(width, 
embed_size, pretrained_vectors, exclusive_class if model.has_dim("nO") is not False: model.set_dim("nO", nO) model.set_ref("output_layer", linear_model.get_ref("output_layer")) + model.attrs["multi_label"] = not exclusive_classes return model diff --git a/spacy/pipeline/defaults/multitask_defaults.cfg b/spacy/pipeline/defaults/multitask_defaults.cfg new file mode 100644 index 000000000..d3dbe9b53 --- /dev/null +++ b/spacy/pipeline/defaults/multitask_defaults.cfg @@ -0,0 +1,15 @@ +[model] +@architectures = "spacy.MultiTask.v1" +maxout_pieces = 3 +token_vector_width = 96 + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 2 +subword_features = true +dropout = null diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a6edf00d9..75628ce3c 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -648,9 +648,10 @@ class MultitaskObjective(Tagger): side-objective. 
""" - def __init__(self, vocab, model, target='dep_tag_offset', **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model + target = cfg["target"] # default: 'dep_tag_offset' if target == "dep": self.make_label = self.make_dep elif target == "tag": @@ -668,8 +669,6 @@ class MultitaskObjective(Tagger): else: raise ValueError(Errors.E016) self.cfg = dict(cfg) - # TODO: remove - put in config - self.cfg.setdefault("maxout_pieces", 2) @property def labels(self): @@ -682,7 +681,7 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids, tensors=None): pass - def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): gold_examples = nonproj.preprocess_training_data(get_examples()) # for raw_text, doc_annot in gold_tuples: @@ -808,13 +807,13 @@ class ClozeMultitask(Pipe): self.vocab = vocab self.model = model self.cfg = cfg - self.distance = CosineDistance(ignore_zeros=True, normalize=False) + self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config def set_annotations(self, docs, dep_ids, tensors=None): pass def begin_training(self, get_examples=lambda: [], pipeline=None, - tok2vec=None, sgd=None, **kwargs): + sgd=None, **kwargs): link_vectors_to_models(self.vocab) self.model.initialize() X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) @@ -951,13 +950,13 @@ class TextCategorizer(Pipe): losses[self.name] += (gradient**2).sum() def _examples_to_truth(self, examples): - golds = [ex.gold for ex in examples] - truths = numpy.zeros((len(golds), len(self.labels)), dtype="f") - not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f") - for i, gold in enumerate(golds): + gold_cats = [ex.doc_annotation.cats for ex in examples] + truths = numpy.zeros((len(gold_cats), len(self.labels)), dtype="f") + not_missing = numpy.ones((len(gold_cats), len(self.labels)), 
dtype="f") + for i, gold_cat in enumerate(gold_cats): for j, label in enumerate(self.labels): - if label in gold.cats: - truths[i, j] = gold.cats[label] + if label in gold_cat: + truths[i, j] = gold_cat[label] else: not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) @@ -1026,28 +1025,27 @@ cdef class DependencyParser(Parser): output.append(merge_subtokens) return tuple(output) - def add_multitask_objective(self, target): - if target == "cloze": - cloze = ClozeMultitask(self.vocab) - self._multitasks.append(cloze) - else: - labeller = MultitaskObjective(self.vocab, target=target) - self._multitasks.append(labeller) + def add_multitask_objective(self, mt_component): + self._multitasks.append(mt_component) def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? for labeller in self._multitasks: - tok2vec = self.model.get_ref("tok2vec") - labeller.begin_training(get_examples, pipeline=pipeline, - tok2vec=tok2vec, sgd=sgd) + labeller.model.set_dim("nO", len(self.labels)) + if labeller.model.has_ref("output_layer"): + labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) + labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd) def __reduce__(self): - return (DependencyParser, (self.vocab, self.model), self.moves) + return (DependencyParser, (self.vocab, self.model), (self.moves, self.cfg)) def __getstate__(self): - return self.moves + return (self.moves, self.cfg) - def __setstate__(self, moves): + def __setstate__(self, state): + moves, config = state self.moves = moves + self.cfg = config @property def labels(self): @@ -1073,28 +1071,27 @@ cdef class EntityRecognizer(Parser): requires = [] TransitionSystem = BiluoPushDown - def add_multitask_objective(self, target): - if target == "cloze": - cloze = ClozeMultitask(self.vocab) - self._multitasks.append(cloze) - else: - labeller = MultitaskObjective(self.vocab, target=target) - 
self._multitasks.append(labeller) + def add_multitask_objective(self, mt_component): + self._multitasks.append(mt_component) def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? for labeller in self._multitasks: - tok2vec = self.model.get_ref("tok2vec") - labeller.begin_training(get_examples, pipeline=pipeline, - tok2vec=tok2vec) + labeller.model.set_dim("nO", len(self.labels)) + if labeller.model.has_ref("output_layer"): + labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) + labeller.begin_training(get_examples, pipeline=pipeline) def __reduce__(self): - return (EntityRecognizer, (self.vocab, self.model), self.moves) + return (EntityRecognizer, (self.vocab, self.model), (self.moves, self.cfg)) def __getstate__(self): - return self.moves + return self.moves, self.cfg - def __setstate__(self, moves): + def __setstate__(self, state): + moves, config = state self.moves = moves + self.cfg = config @property def labels(self): @@ -1565,15 +1562,23 @@ Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, mod Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg) def parser_factory(nlp, model, **cfg): + default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} if model is None: model = default_parser() warnings.warn(Warnings.W098.format(name="parser")) + for key, value in default_config.items(): + if key not in cfg: + cfg[key] = value return DependencyParser.from_nlp(nlp, model, **cfg) def ner_factory(nlp, model, **cfg): + default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} if model is None: model = default_ner() warnings.warn(Warnings.W098.format(name="ner")) + for key, value in default_config.items(): + if key not in cfg: + cfg[key] = value return EntityRecognizer.from_nlp(nlp, model, **cfg) __all__ 
= ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 5882fa266..de30a55f0 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -172,7 +172,7 @@ class Tok2VecListener(Model): def verify_inputs(self, inputs): if self._batch_id is None and self._outputs is None: - raise ValueError + raise ValueError("The Tok2Vec listener did not receive valid input.") else: batch_id = self.get_batch_id(inputs) if batch_id != self._batch_id: diff --git a/spacy/scorer.py b/spacy/scorer.py index 7e2466be7..288da23aa 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -88,24 +88,20 @@ class Scorer(object): self.ner = PRFScore() self.ner_per_ents = dict() self.eval_punct = eval_punct - self.textcat = None - self.textcat_per_cat = dict() + self.textcat = PRFScore() + self.textcat_f_per_cat = dict() + self.textcat_auc_per_cat = dict() self.textcat_positive_label = None self.textcat_multilabel = False if pipeline: - for name, model in pipeline: + for name, component in pipeline: if name == "textcat": - self.textcat_positive_label = model.cfg.get("positive_label", None) - if self.textcat_positive_label: - self.textcat = PRFScore() - if not model.cfg.get("exclusive_classes", False): - self.textcat_multilabel = True - for label in model.cfg.get("labels", []): - self.textcat_per_cat[label] = ROCAUCScore() - else: - for label in model.cfg.get("labels", []): - self.textcat_per_cat[label] = PRFScore() + self.textcat_multilabel = component.model.attrs["multi_label"] + self.textcat_positive_label = component.cfg.get("positive_label", None) + for label in component.cfg.get("labels", []): + self.textcat_auc_per_cat[label] = ROCAUCScore() + self.textcat_f_per_cat[label] = PRFScore() @property def tags_acc(self): @@ -207,46 +203,52 @@ class Scorer(object): } @property - def textcat_score(self): - """RETURNS (float): f-score on positive 
label for binary exclusive, - macro-averaged f-score for 3+ exclusive, - macro-averaged AUC ROC score for multilabel (-1 if undefined) + def textcat_f(self): + """RETURNS (float): f-score on positive label for binary classification, + macro-averaged f-score for multilabel classification """ if not self.textcat_multilabel: - # binary multiclass if self.textcat_positive_label: + # binary classification return self.textcat.fscore * 100 - # other multiclass - return ( - sum([score.fscore for label, score in self.textcat_per_cat.items()]) - / (len(self.textcat_per_cat) + 1e-100) - * 100 - ) - # multilabel + # multi-class and/or multi-label + return ( + sum([score.fscore for label, score in self.textcat_f_per_cat.items()]) + / (len(self.textcat_f_per_cat) + 1e-100) + * 100 + ) + + @property + def textcat_auc(self): + """RETURNS (float): macro-averaged AUC ROC score for multilabel classification (-1 if undefined) + """ return max( - sum([score.score for label, score in self.textcat_per_cat.items()]) - / (len(self.textcat_per_cat) + 1e-100), + sum([score.score for label, score in self.textcat_auc_per_cat.items()]) + / (len(self.textcat_auc_per_cat) + 1e-100), -1, ) @property - def textcats_per_cat(self): - """RETURNS (dict): Scores per textcat label. + def textcats_auc_per_cat(self): + """RETURNS (dict): AUC ROC Scores per textcat label. """ - if not self.textcat_multilabel: - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.textcat_per_cat.items() - } return { k: {"roc_auc_score": max(v.score, -1)} - for k, v in self.textcat_per_cat.items() + for k, v in self.textcat_auc_per_cat.items() + } + + @property + def textcats_f_per_cat(self): + """RETURNS (dict): F-scores per textcat label. 
+ """ + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.textcat_f_per_cat.items() } @property def scores(self): - """RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`, - `ents_r`, `ents_f`, `tags_acc`, `token_acc`, and `textcat_score`. + """RETURNS (dict): All scores mapped by key. """ return { "uas": self.uas, @@ -264,8 +266,10 @@ class Scorer(object): "sent_r": self.sent_r, "sent_f": self.sent_f, "token_acc": self.token_acc, - "textcat_score": self.textcat_score, - "textcats_per_cat": self.textcats_per_cat, + "textcat_f": self.textcat_f, + "textcat_auc": self.textcat_auc, + "textcats_f_per_cat": self.textcats_f_per_cat, + "textcats_auc_per_cat": self.textcats_auc_per_cat, } def score(self, example, verbose=False, punct_labels=("p", "punct")): @@ -408,7 +412,7 @@ class Scorer(object): ) if ( len(gold.cats) > 0 - and set(self.textcat_per_cat) == set(gold.cats) + and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats) and set(gold.cats) == set(doc.cats) ): goldcat = max(gold.cats, key=gold.cats.get) @@ -418,17 +422,21 @@ class Scorer(object): set([self.textcat_positive_label]) & set([candcat]), set([self.textcat_positive_label]) & set([goldcat]), ) - for label in self.textcat_per_cat: - if self.textcat_multilabel: - self.textcat_per_cat[label].score_set( + for label in set(gold.cats): + self.textcat_auc_per_cat[label].score_set( doc.cats[label], gold.cats[label] - ) - else: - self.textcat_per_cat[label].score_set( + ) + self.textcat_f_per_cat[label].score_set( set([label]) & set([candcat]), set([label]) & set([goldcat]) - ) - elif len(self.textcat_per_cat) > 0: - model_labels = set(self.textcat_per_cat) + ) + elif len(self.textcat_f_per_cat) > 0: + model_labels = set(self.textcat_f_per_cat) + eval_labels = set(gold.cats) + raise ValueError( + Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) + ) + elif len(self.textcat_auc_per_cat) > 0: + model_labels = 
set(self.textcat_auc_per_cat) eval_labels = set(gold.cats) raise ValueError( Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fcaff444e..7bd9562e2 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -63,15 +63,14 @@ cdef class Parser: # defined by EntityRecognizer as a BiluoPushDown moves = self.TransitionSystem(self.vocab.strings) self.moves = moves - cfg.setdefault('min_action_freq', 30) - cfg.setdefault('learn_tokens', False) - cfg.setdefault('beam_width', 1) - cfg.setdefault('beam_update_prob', 1.0) # or 0.5 (both defaults were previously used) self.model = model if self.moves.n_moves != 0: self.set_output(self.moves.n_moves) self.cfg = cfg self._multitasks = [] + for multitask in cfg.get("multitasks", []): + self.add_multitask_objective(multitask) + self._rehearsal_model = None @classmethod @@ -79,13 +78,15 @@ cdef class Parser: return cls(nlp.vocab, model, **cfg) def __reduce__(self): - return (Parser, (self.vocab, self.model), self.moves) + return (Parser, (self.vocab, self.model), (self.moves, self.cfg)) def __getstate__(self): - return self.moves + return (self.moves, self.cfg) - def __setstate__(self, moves): + def __setstate__(self, state): + moves, config = state self.moves = moves + self.cfg = config @property def move_names(self): diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index c92fc1ff9..879334056 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -9,7 +9,8 @@ from spacy.pipeline.defaults import default_ner def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - ner = EntityRecognizer(en_vocab, default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + ner = EntityRecognizer(en_vocab, default_ner(), **config) 
ner.begin_training([]) ner(doc) assert len(list(doc.ents)) == 0 @@ -25,7 +26,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - ner = EntityRecognizer(en_vocab, default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + ner = EntityRecognizer(en_vocab, default_ner(), **config) ner.begin_training([]) ner(doc) assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index ee1bba886..f9663ba32 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -17,7 +17,8 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = DependencyParser(vocab, default_parser()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + parser = DependencyParser(vocab, default_parser(), **config) return parser @@ -57,12 +58,13 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - ner1 = EntityRecognizer(Vocab(), default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + ner1 = EntityRecognizer(Vocab(), default_ner(), **config) ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") ner1.begin_training([]) - ner2 = EntityRecognizer(Vocab(), default_ner()) + ner2 = EntityRecognizer(Vocab(), default_ner(), **config) # the second model needs to be resized before we can call from_bytes ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves) diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 30b4a6f6d..5d265261f 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -138,7 +138,8 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = 
Doc(Vocab(), words=[t[1] for t in annot_tuples]) - parser = DependencyParser(doc.vocab, default_parser()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + parser = DependencyParser(doc.vocab, default_parser(), **config) parser.moves.add_action(0, "") parser.moves.add_action(1, "") parser.moves.add_action(1, "") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 8e41a16c0..b0a8109dc 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -138,7 +138,8 @@ def test_accept_blocked_token(): # 1. test normal behaviour nlp1 = English() doc1 = nlp1("I live in New York") - ner1 = EntityRecognizer(doc1.vocab, default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config) assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] @@ -156,7 +157,8 @@ def test_accept_blocked_token(): # 2. 
test blocking behaviour nlp2 = English() doc2 = nlp2("I live in New York") - ner2 = EntityRecognizer(doc2.vocab, default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config) # set "New York" to a blocked entity doc2.ents = [(0, 3, 5)] @@ -213,7 +215,8 @@ def test_overwrite_token(): assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] # Check that a new ner can overwrite O - ner2 = EntityRecognizer(doc.vocab, default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + ner2 = EntityRecognizer(doc.vocab, default_ner(), **config) ner2.moves.add_action(5, "") ner2.add_label("GPE") state = ner2.moves.init_batch([doc])[0] diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index b648e9a00..7f3e981ea 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -28,7 +28,8 @@ def tok2vec(): @pytest.fixture def parser(vocab, arc_eager): - return Parser(vocab, model=default_parser(), moves=arc_eager) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + return Parser(vocab, model=default_parser(), moves=arc_eager, **config) @pytest.fixture diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index db9eb5e6f..fa5d59f9e 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -94,7 +94,8 @@ def test_beam_advance_too_few_scores(beam, scores): def test_beam_parse(): nlp = Language() - nlp.add_pipe(DependencyParser(nlp.vocab, default_parser()), name="parser") + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser") nlp.parser.add_label("nsubj") 
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) doc = nlp.make_doc("Australia is a country") diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index dc13fcdf1..ccf7d3ba3 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -16,7 +16,8 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = DependencyParser(vocab, default_parser()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + parser = DependencyParser(vocab, default_parser(), **config) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 # parser.add_label('right') diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 5a76697bc..177b6bb3d 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -270,7 +270,8 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): - ner = EntityRecognizer(Vocab(), default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + ner = EntityRecognizer(Vocab(), default_ner(), **config) example = Example(doc=None) example.set_token_annotation( ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label] diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 9ff118a1f..6df437b3c 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -196,7 +196,8 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - ner = EntityRecognizer(doc.vocab, default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, 
"beam_width": 1, "beam_update_prob": 1.0} + ner = EntityRecognizer(doc.vocab, default_ner(), **config) # Add the OUT action. I wouldn't have thought this would be necessary... ner.moves.add_action(5, "") ner.add_label("GPE") diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py index 3d8e80847..15632bdf8 100644 --- a/spacy/tests/regression/test_issue3830.py +++ b/spacy/tests/regression/test_issue3830.py @@ -6,7 +6,8 @@ from spacy.pipeline.defaults import default_parser def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" - parser = DependencyParser(Vocab(), default_parser()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels parser.begin_training(lambda: []) @@ -15,7 +16,8 @@ def test_issue3830_no_subtok(): def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" - parser = DependencyParser(Vocab(), default_parser(), learn_tokens=True) + config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels parser.begin_training(lambda: []) diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py index 30081543b..4978aba44 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -74,6 +74,7 @@ def test_issue4042_bug2(): output_dir.mkdir() ner1.to_disk(output_dir) - ner2 = EntityRecognizer(vocab, default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + ner2 = EntityRecognizer(vocab, default_ner(), **config) ner2.from_disk(output_dir) assert 
len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index ba4d2deab..946316d85 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -12,7 +12,8 @@ def test_issue4313(): beam_width = 16 beam_density = 0.0001 nlp = English() - ner = EntityRecognizer(nlp.vocab, default_ner()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + ner = EntityRecognizer(nlp.vocab, default_ner(), **config) ner.add_label("SOME_LABEL") ner.begin_training([]) nlp.add_pipe(ner) diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py index 967db5d67..cdc3c09ca 100644 --- a/spacy/tests/regression/test_issue4725.py +++ b/spacy/tests/regression/test_issue4725.py @@ -1,12 +1,30 @@ -import pytest +import pickle import numpy from spacy.lang.en import English from spacy.vocab import Vocab +from spacy.tests.util import make_tempdir + + +def test_pickle_ner(): + """ Ensure the pickling of the NER goes well""" + vocab = Vocab(vectors_name="test_vocab_add_vector") + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) + with make_tempdir() as tmp_path: + with (tmp_path / "ner.pkl").open("wb") as file_: + pickle.dump(ner, file_) + assert ner.cfg["min_action_freq"] == 342 + + with (tmp_path / "ner.pkl").open("rb") as file_: + ner2 = pickle.load(file_) + assert ner2.cfg["min_action_freq"] == 342 + def test_issue4725(): # ensures that this runs correctly and doesn't hang or crash because of the global vectors + # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. 
on Windows) vocab = Vocab(vectors_name="test_vocab_add_vector") data = numpy.ndarray((5, 3), dtype="f") data[0] = 1.0 diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 595a35a9f..9c4e1f61e 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -12,7 +12,8 @@ test_parsers = [DependencyParser, EntityRecognizer] @pytest.fixture def parser(en_vocab): - parser = DependencyParser(en_vocab, default_parser()) + config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + parser = DependencyParser(en_vocab, default_parser(), **config) parser.add_label("nsubj") return parser diff --git a/spacy/util.py b/spacy/util.py index bc6c98a82..d2d87bef9 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -186,7 +186,7 @@ def load_model_from_path(model_path, meta=False, **overrides): return nlp.from_disk(model_path, exclude=disable) -def load_model_from_config(nlp_config): +def load_model_from_config(nlp_config, replace=False): if "name" in nlp_config: nlp = load_model(**nlp_config) elif "lang" in nlp_config: @@ -197,8 +197,15 @@ def load_model_from_config(nlp_config): if "pipeline" in nlp_config: for name, component_cfg in nlp_config["pipeline"].items(): factory = component_cfg.pop("factory") - component = nlp.create_pipe(factory, config=component_cfg) - nlp.add_pipe(component, name=name) + if name in nlp.pipe_names: + if replace: + component = nlp.create_pipe(factory, config=component_cfg) + nlp.replace_pipe(name, component) + else: + raise ValueError(Errors.E985.format(component=name)) + else: + component = nlp.create_pipe(factory, config=component_cfg) + nlp.add_pipe(component, name=name) return nlp diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index b1824573c..180665929 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -46,17 +46,19 @@ Update the evaluation scores 
from a single [`Doc`](/api/doc) / ## Properties -| Name | Type | Description | -| ----------------------------------------------- | ----- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `token_acc` | float | Tokenization accuracy. | -| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | -| `uas` | float | Unlabelled dependency score. | -| `las` | float | Labelled dependency score. | -| `ents_p` | float | Named entity accuracy (precision). | -| `ents_r` | float | Named entity accuracy (recall). | -| `ents_f` | float | Named entity accuracy (F-score). | -| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. | -| `textcat_score` 2.2 | float | F-score on positive label for binary exclusive, macro-averaged F-score for 3+ exclusive, macro-averaged AUC ROC score for multilabel (`-1` if undefined). | -| `textcats_per_cat` 2.2 | dict | Scores per textcat label, keyed by label. | -| `las_per_type` 2.2.3 | dict | Labelled dependency scores, keyed by label. | -| `scores` | dict | All scores, keyed by type. | +| Name | Type | Description | +| --------------------------------------------------- | ----- | ---------------------------------------------------------------------------------------------------------- | +| `token_acc` | float | Tokenization accuracy. | +| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | +| `uas` | float | Unlabelled dependency score. | +| `las` | float | Labelled dependency score. | +| `ents_p` | float | Named entity accuracy (precision). | +| `ents_r` | float | Named entity accuracy (recall). | +| `ents_f` | float | Named entity accuracy (F-score). | +| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. 
| +| `textcat_f` 3.0 | float | F-score on positive label for binary classification, macro-averaged F-score otherwise. | +| `textcat_auc` | float | Macro-averaged AUC ROC score for multilabel classification (`-1` if undefined). | +| `textcats_f_per_cat` 3.0 | dict | F-scores per textcat label, keyed by label. | +| `textcats_auc_per_cat` 3.0 | dict | ROC AUC scores per textcat label, keyed by label. | +| `las_per_type` 2.2.3 | dict | Labelled dependency scores, keyed by label. | +| `scores` | dict | All scores, keyed by type. | From a1c5b694be117ac92e21f9860309821ad6da06f7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jun 2020 02:22:13 +0200 Subject: [PATCH 185/187] Small fixes to train defaults --- spacy/cli/train_from_config.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index ec099b294..f24feffab 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -156,17 +156,18 @@ def train_cli( msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) - if output_path is not None and not output_path.exists(): - output_path.mkdir() - msg.good(f"Created output directory: {output_path}") - elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: - msg.warn( - "Output directory is not empty.", - "This can lead to unintended side effects when saving the model. " - "Please use an empty directory or a different path instead. 
If " - "the specified output path doesn't exist, the directory will be " - "created for you.", - ) + if output_path is not None: + if not output_path.exists(): + output_path.mkdir() + msg.good(f"Created output directory: {output_path}") + elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: + msg.warn( + "Output directory is not empty.", + "This can lead to unintended side effects when saving the model. " + "Please use an empty directory or a different path instead. If " + "the specified output path doesn't exist, the directory will be " + "created for you.", + ) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) tag_map = {} @@ -210,7 +211,8 @@ def train( # Read the config first without creating objects, to get to the original nlp_config config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) - if config["training"]["use_pytorch_for_gpu_memory"]: + if config["training"].get("use_pytorch_for_gpu_memory"): + # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) @@ -374,7 +376,7 @@ def create_train_batches(nlp, corpus, cfg): train_examples = list( corpus.train_dataset( nlp, - noise_level=cfg["noise_level"], + noise_level=0.0, # I think this is deprecated? 
orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], From 8283df80e91d7fba385b12c42eb976ab30ca1e2a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 20 Jun 2020 14:15:04 +0200 Subject: [PATCH 186/187] Tidy up and auto-format --- spacy/cli/pretrain.py | 4 +- spacy/cli/train_from_config.py | 86 ++++++++++--------- spacy/lemmatizer.py | 9 +- spacy/ml/__init__.py | 2 +- spacy/ml/_biluo.py | 23 +++-- spacy/ml/_iob.py | 22 +++-- spacy/ml/_precomputable_affine.py | 2 +- spacy/ml/models/__init__.py | 2 +- spacy/ml/models/multi_task.py | 17 +++- spacy/ml/models/parser.py | 8 +- spacy/ml/models/simple_ner.py | 29 ++++--- spacy/ml/models/tagger.py | 3 +- spacy/ml/models/textcat.py | 76 ++++++++++------ spacy/ml/models/tok2vec.py | 55 ++++++++---- spacy/ml/tb_framework.py | 16 ++-- spacy/pipeline/simple_ner.py | 36 ++++---- spacy/scorer.py | 41 ++++++--- spacy/tests/doc/test_add_entities.py | 14 ++- spacy/tests/parser/test_add_label.py | 17 +++- spacy/tests/parser/test_arc_eager_oracle.py | 7 +- spacy/tests/parser/test_ner.py | 21 ++++- spacy/tests/parser/test_neural_parser.py | 7 +- spacy/tests/parser/test_nn_beam.py | 7 +- spacy/tests/parser/test_preset_sbd.py | 7 +- spacy/tests/pipeline/test_entity_linker.py | 12 ++- spacy/tests/pipeline/test_morphologizer.py | 20 ++++- spacy/tests/pipeline/test_simple_ner.py | 27 +++--- spacy/tests/regression/test_issue1501-2000.py | 7 +- spacy/tests/regression/test_issue3001-3500.py | 7 +- spacy/tests/regression/test_issue3830.py | 14 ++- spacy/tests/regression/test_issue4042.py | 7 +- spacy/tests/regression/test_issue4313.py | 7 +- spacy/tests/regression/test_issue4924.py | 1 - .../tests/serialize/test_serialize_config.py | 4 +- .../serialize/test_serialize_pipeline.py | 7 +- .../serialize/test_serialize_vocab_strings.py | 6 +- spacy/tests/test_scorer.py | 3 +- spacy/tests/test_util.py | 16 ++-- spacy/util.py | 4 +- 39 files changed, 421 insertions(+), 232 deletions(-) diff 
--git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 4f4707b52..4f4029834 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -24,8 +24,8 @@ from ..gold import Example output_dir=("Directory to write models to on each epoch", "positional", None, Path), config_path=("Path to config file", "positional", None, Path), use_gpu=("Use GPU", "option", "g", int), - resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path), - epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int), + resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path), + epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int), # fmt: on ) def pretrain( diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index f24feffab..6080b698b 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -3,7 +3,6 @@ from timeit import default_timer as timer import srsly from pydantic import BaseModel, FilePath -import plac import tqdm from pathlib import Path from wasabi import msg @@ -16,7 +15,9 @@ from ..gold import GoldCorpus from ..lookups import Lookups from .. 
import util from ..errors import Errors -from ..ml import models # don't remove - required to load the built-in architectures + +# Don't remove - required to load the built-in architectures +from ..ml import models # noqa: F401 registry = util.registry @@ -114,33 +115,19 @@ class ConfigSchema(BaseModel): extra = "allow" -@plac.annotations( - # fmt: off - train_path=("Location of JSON-formatted training data", "positional", None, Path), - dev_path=("Location of JSON-formatted development data", "positional", None, Path), - config_path=("Path to config file", "positional", None, Path), - output_path=("Output directory to store model in", "option", "o", Path), - init_tok2vec=( - "Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", - Path), - raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), - verbose=("Display more information for debugging purposes", "flag", "VV", bool), - use_gpu=("Use GPU", "option", "g", int), - tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), - omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), - # fmt: on -) def train_cli( - train_path, - dev_path, - config_path, - output_path=None, - init_tok2vec=None, - raw_text=None, - verbose=False, - use_gpu=-1, - tag_map_path=None, - omit_extra_lookups=False, + # fmt: off + train_path: ("Location of JSON-formatted training data", "positional", None, Path), + dev_path: ("Location of JSON-formatted development data", "positional", None, Path), + config_path: ("Path to config file", "positional", None, Path), + output_path: ("Output directory to store model in", "option", "o", Path) = None, + init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental.", "option", "t2v", Path) = None, + raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, + verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False, + use_gpu: ("Use GPU", "option", "g", int) = -1, + tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, + omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, + # fmt: on ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's @@ -212,7 +199,7 @@ def train( config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) if config["training"].get("use_pytorch_for_gpu_memory"): - # It feels kind of weird to not have a default for this. + # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) @@ -227,7 +214,9 @@ def train( # verify textcat config if "textcat" in nlp_config["pipeline"]: textcat_labels = set(nlp.get_pipe("textcat").labels) - textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] + textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][ + "exclusive_classes" + ] # check whether the setting 'exclusive_classes' corresponds to the provided training data if textcat_multilabel: @@ -255,7 +244,9 @@ def train( "to 'false' in the config to train a classifier with classes " "that are not mutually exclusive." 
) - msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") + msg.info( + f"Initialized textcat component for {len(textcat_labels)} unique labels" + ) nlp.get_pipe("textcat").labels = tuple(textcat_labels) # if 'positive_label' is provided: double check whether it's in the data and the task is binary @@ -281,9 +272,7 @@ def train( nlp.resume_training() else: msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") - nlp.begin_training( - lambda: corpus.train_examples - ) + nlp.begin_training(lambda: corpus.train_examples) # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) @@ -310,8 +299,7 @@ def train( tok2vec = tok2vec.get(subpath) if not tok2vec: msg.fail( - f"Could not locate the tok2vec model at {tok2vec_path}.", - exits=1, + f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1, ) tok2vec.from_bytes(weights_data) @@ -376,7 +364,7 @@ def create_train_batches(nlp, corpus, cfg): train_examples = list( corpus.train_dataset( nlp, - noise_level=0.0, # I think this is deprecated? + noise_level=0.0, # I think this is deprecated? 
orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], @@ -429,7 +417,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): try: weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) except KeyError as e: - raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys()))) + raise KeyError( + Errors.E983.format( + dict_name="score_weights", key=str(e), keys=list(scores.keys()) + ) + ) scores["speed"] = wps return weighted_score, scores @@ -578,15 +570,25 @@ def setup_printer(training, nlp): ] except KeyError as e: raise KeyError( - Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) + Errors.E983.format( + dict_name="scores (losses)", + key=str(e), + keys=list(info["losses"].keys()), + ) + ) try: scores = [ - "{0:.2f}".format(float(info["other_scores"][col])) - for col in score_cols + "{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols ] except KeyError as e: - raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) + raise KeyError( + Errors.E983.format( + dict_name="scores (other)", + key=str(e), + keys=list(info["other_scores"].keys()), + ) + ) data = ( [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] ) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index c4944407f..7d6bfbc12 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,4 +1,3 @@ -from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .errors import Errors from .lookups import Lookups from .parts_of_speech import NAMES as UPOS_NAMES @@ -51,7 +50,13 @@ class Lemmatizer(object): index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) - if not any((index_table.get(univ_pos), exc_table.get(univ_pos), 
rules_table.get(univ_pos))): + if not any( + ( + index_table.get(univ_pos), + exc_table.get(univ_pos), + rules_table.get(univ_pos), + ) + ): if univ_pos == "propn": return [string] else: diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index aed4fa323..c382d915b 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1 +1 @@ -from .models import * +from .models import * # noqa: F401, F403 diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py index 28339089a..77a2a6a77 100644 --- a/spacy/ml/_biluo.py +++ b/spacy/ml/_biluo.py @@ -1,11 +1,8 @@ """Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import List, Tuple, Dict, Optional +from typing import Dict, Optional import numpy -from thinc.api import Ops, Model, with_array, softmax_activation, padded2list -from thinc.api import to_numpy -from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d - -from ..tokens import Doc +from thinc.api import Model +from thinc.types import Padded, Floats3d def BILUO() -> Model[Padded, Padded]: @@ -14,11 +11,11 @@ def BILUO() -> Model[Padded, Padded]: forward, init=init, dims={"nO": None}, - attrs={"get_num_actions": get_num_actions} + attrs={"get_num_actions": get_num_actions}, ) -def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): +def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): if X is not None and Y is not None: if X.data.shape != Y.data.shape: # TODO: Fix error @@ -49,12 +46,12 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): masks = model.ops.alloc3f(*Y.shape) max_value = Xp.data.max() for t in range(Xp.data.shape[0]): - is_last = (Xp.lengths < (t+2)).astype("i") + is_last = (Xp.lengths < (t + 2)).astype("i") masks[t] = valid_transitions[is_last, prev_actions] # Don't train the out-of-bounds sequences. 
- masks[t, Xp.size_at_t[t]:] = 0 + masks[t, Xp.size_at_t[t] :] = 0 # Valid actions get 0*10e8, invalid get large negative value - Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10) + Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10) prev_actions = Y[t].argmax(axis=-1) def backprop_biluo(dY: Padded) -> Padded: @@ -83,13 +80,13 @@ def _get_transition_table( B_start, B_end = (0, n_labels) I_start, I_end = (B_end, B_end + n_labels) L_start, L_end = (I_end, I_end + n_labels) - U_start, U_end = (L_end, L_end + n_labels) + U_start, U_end = (L_end, L_end + n_labels) # noqa: F841 # Using ranges allows us to set specific cells, which is necessary to express # that only actions of the same label are valid continuations. B_range = numpy.arange(B_start, B_end) I_range = numpy.arange(I_start, I_end) L_range = numpy.arange(L_start, L_end) - O_action = U_end + O_action = U_end # noqa: F841 # If this is the last token and the previous action was B or I, only L # of that label is valid table[1, B_range, L_range] = 1 diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py index 0ce9a71e6..9f385ec0d 100644 --- a/spacy/ml/_iob.py +++ b/spacy/ml/_iob.py @@ -1,9 +1,7 @@ """Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import List, Tuple, Dict, Optional -from thinc.api import Ops, Model, with_array, softmax_activation, padded2list -from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d - -from ..tokens import Doc +from typing import Dict, Optional +from thinc.api import Ops, Model +from thinc.types import Padded, Floats3d def IOB() -> Model[Padded, Padded]: @@ -12,11 +10,11 @@ def IOB() -> Model[Padded, Padded]: forward, init=init, dims={"nO": None}, - attrs={"get_num_actions": get_num_actions} + attrs={"get_num_actions": get_num_actions}, ) -def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): +def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): if X is not None and Y is not None: if X.data.shape != 
Y.data.shape: # TODO: Fix error @@ -48,14 +46,14 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): for t in range(Xp.data.shape[0]): masks[t] = valid_transitions[prev_actions] # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t]:] = 0 + masks[t, Xp.size_at_t[t] :] = 0 # Valid actions get 0*10e8, invalid get -1*10e8 - Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8) + Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8) prev_actions = Y[t].argmax(axis=-1) def backprop_biluo(dY: Padded) -> Padded: # Masking the gradient seems to do poorly here. But why? - #dY.data *= masks + # dY.data *= masks return dY return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo @@ -83,10 +81,10 @@ def _get_transition_table( B_range = ops.xp.arange(B_start, B_end) I_range = ops.xp.arange(I_start, I_end) # B and O are always valid - table[:, B_start : B_end] = 1 + table[:, B_start:B_end] = 1 table[:, O_action] = 1 # I can only follow a matching B table[B_range, I_range] = 1 - + _cache[n_actions] = table return table diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index f4b5b16fe..215cdeda1 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -84,7 +84,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids): # # (ids < 0).T @ dY mask = model.ops.asarray(ids < 0, dtype="f") - d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True) + d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True) return d_pad.reshape((1, nF, nO, nP)) diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index 40cde2437..dd58dab00 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,6 +1,6 @@ from .entity_linker import * # noqa from .parser import * # noqa -from .simple_ner import * +from .simple_ner import * # noqa from .tagger import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git 
a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 4a360a9e6..b3a9e0815 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): softmax = Softmax(nO=nO, nI=token_vector_width * 2) model = chain( tok2vec, - Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0), + Maxout( + nO=token_vector_width * 2, + nI=token_vector_width, + nP=maxout_pieces, + dropout=0.0, + ), LayerNorm(token_vector_width * 2), softmax, ) @@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None): # nO = vocab.vectors.data.shape[1] output_layer = chain( Maxout( - nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0 + nO=nO, + nI=tok2vec.get_dim("nO"), + nP=maxout_pieces, + normalize=True, + dropout=0.0, ), Linear(nO=nO, nI=nO, init_W=zero_init), ) @@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): def mlm_forward(model, docs, is_train): mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) - output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop + output, backprop = model.get_ref("wrapped-model").begin_update( + docs + ) # drop=drop def mlm_backward(d_output): d_output *= 1 - mask diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index bdcd709b1..47c94cfa1 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -16,18 +16,14 @@ def build_tb_parser_model( nO=None, ): t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain( - tok2vec, - with_array(Linear(hidden_width, t2v_width)), - list2array(), - ) + tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),) tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( nO=hidden_width if 
use_upper else nO, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), - nP=maxout_pieces + nP=maxout_pieces, ) if use_upper: with use_ops("numpy"): diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py index 01661f55b..1fb5a71c0 100644 --- a/spacy/ml/models/simple_ner.py +++ b/spacy/ml/models/simple_ner.py @@ -1,9 +1,8 @@ -import functools -from typing import List, Tuple, Dict, Optional -from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list +from typing import List +from thinc.api import Model, Linear, with_array, softmax_activation, padded2list from thinc.api import chain, list2padded, configure_normal_init from thinc.api import Dropout -from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d +from thinc.types import Floats2d from ...tokens import Doc from .._biluo import BILUO @@ -12,12 +11,12 @@ from ...util import registry @registry.architectures.register("spacy.BiluoTagger.v1") -def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: +def BiluoTagger( + tok2vec: Model[List[Doc], List[Floats2d]] +) -> Model[List[Doc], List[Floats2d]]: biluo = BILUO() linear = Linear( - nO=None, - nI=tok2vec.get_dim("nO"), - init_W=configure_normal_init(mean=0.02) + nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02) ) model = chain( tok2vec, @@ -25,7 +24,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L with_array(chain(Dropout(0.1), linear)), biluo, with_array(softmax_activation()), - padded2list() + padded2list(), ) return Model( @@ -35,11 +34,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L layers=[model, linear], refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, ) + @registry.architectures.register("spacy.IOBTagger.v1") 
-def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: +def IOBTagger( + tok2vec: Model[List[Doc], List[Floats2d]] +) -> Model[List[Doc], List[Floats2d]]: biluo = IOB() linear = Linear(nO=None, nI=tok2vec.get_dim("nO")) model = chain( @@ -48,7 +50,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis with_array(linear), biluo, with_array(softmax_activation()), - padded2list() + padded2list(), ) return Model( @@ -58,11 +60,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis layers=[model], refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, ) - def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: if model.get_dim("nO") is None and Y: model.set_dim("nO", Y[0].shape[1]) diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 00e268ede..7fe417321 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -1,5 +1,4 @@ -from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout -from thinc.api import glorot_uniform_init +from thinc.api import zero_init, with_array, Softmax, chain, Model from ...util import registry diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index a02e1a5a1..9db6f982f 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,11 +1,12 @@ -from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention -from thinc.api import chain, concatenate, clone, Dropout -from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window -from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor +from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic +from thinc.api import 
ParametricAttention, chain, concatenate, clone, Dropout +from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout +from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed +from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor from ..spacy_vectors import SpacyVectors from ... import util -from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER +from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams @@ -50,14 +51,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCat.v1") -def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, - window_size, conv_depth, dropout, nO=None): +def build_text_classifier( + width, + embed_size, + pretrained_vectors, + exclusive_classes, + ngram_size, + window_size, + conv_depth, + dropout, + nO=None, +): cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) - prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) - suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) - shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) + lower = HashEmbed( + nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout + ) + prefix = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout + ) + suffix = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout + ) + shape = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout + ) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = 
FeatureExtractor(cols) >> with_array( @@ -83,30 +101,38 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class vectors_width = width tok2vec = vector_layer >> with_array( Maxout(width, vectors_width, normalize=True) - >> residual((expand_window(window_size=window_size) - >> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth, + >> residual( + ( + expand_window(window_size=window_size) + >> Maxout( + nO=width, nI=width * ((window_size * 2) + 1), normalize=True + ) + ) + ) + ** conv_depth, pad=conv_depth, ) cnn_model = ( - tok2vec - >> list2ragged() - >> ParametricAttention(width) - >> reduce_sum() - >> residual(Maxout(nO=width, nI=width)) - >> Linear(nO=nO, nI=width) - >> Dropout(0.0) + tok2vec + >> list2ragged() + >> ParametricAttention(width) + >> reduce_sum() + >> residual(Maxout(nO=width, nI=width)) + >> Linear(nO=nO, nI=width) + >> Dropout(0.0) ) linear_model = build_bow_text_classifier( - nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False + nO=nO, + ngram_size=ngram_size, + exclusive_classes=exclusive_classes, + no_output_layer=False, ) - nO_double = nO*2 if nO else None + nO_double = nO * 2 if nO else None if exclusive_classes: output_layer = Softmax(nO=nO, nI=nO_double) else: - output_layer = ( - Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() - ) + output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() model = (linear_model | cnn_model) >> output_layer model.set_ref("tok2vec", tok2vec) if model.has_dim("nO") is not False: diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 53798e57c..b1bed1ea1 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -99,7 +99,13 @@ def hash_charembed_cnn( @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout + pretrained_vectors, + 
width, + depth, + embed_size, + subword_features, + maxout_pieces, + dropout, ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1( @registry.architectures.register("spacy.LayerNormalizedMaxout.v1") def LayerNormalizedMaxout(width, maxout_pieces): - return Maxout( - nO=width, - nP=maxout_pieces, - dropout=0.0, - normalize=True, - ) + return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,) @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): +def MultiHashEmbed( + columns, width, rows, use_subwords, pretrained_vectors, mix, dropout +): norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) if use_subwords: - prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) - suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) - shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) + prefix = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout + ) + suffix = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout + ) + shape = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout + ) if pretrained_vectors: glove = StaticVectors( @@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): cnn = chain( expand_window(window_size=window_size), - Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True), + Maxout( + nO=width, + nI=width * ((window_size * 2) + 1), + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), ) model = clone(residual(cnn), depth) model.set_dim("nO", width) @@ -247,11 +262,19 @@ def 
build_Tok2Vec_model( subword_features = False cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout) + norm = HashEmbed( + nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout + ) if subword_features: - prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) - suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) - shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) + prefix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout + ) + suffix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout + ) + shape = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout + ) else: prefix, suffix, shape = (None, None, None) if pretrained_vectors is not None: diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 251189389..69b40cbcf 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): attrs={ "has_upper": has_upper, "unseen_classes": set(unseen_classes), - "resize_output": resize_output - } + "resize_output": resize_output, + }, ) @@ -31,14 +31,14 @@ def forward(model, X, is_train): model.layers, unseen_classes=model.attrs["unseen_classes"], train=is_train, - has_upper=model.attrs["has_upper"] + has_upper=model.attrs["has_upper"], ) return step_model, step_model.finish_steps def init(model, X=None, Y=None): - tok2vec = model.get_ref("tok2vec").initialize(X=X) + tok2vec = model.get_ref("tok2vec").initialize(X=X) # noqa: F841 lower = model.get_ref("lower").initialize() if model.attrs["has_upper"]: statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) @@ -46,7 +46,7 @@ def init(model, X=None, 
Y=None): def resize_output(model, new_nO): - tok2vec = model.get_ref("tok2vec") + tok2vec = model.get_ref("tok2vec") # noqa: F841 lower = model.get_ref("lower") upper = model.get_ref("upper") if not model.attrs["has_upper"]: @@ -62,7 +62,7 @@ def resize_output(model, new_nO): nI = None if smaller.has_dim("nI"): nI = smaller.get_dim("nI") - with use_ops('numpy'): + with use_ops("numpy"): larger = Linear(nO=new_nO, nI=nI) larger.init = smaller.init # it could be that the model is not initialized yet, then skip this bit @@ -74,8 +74,8 @@ def resize_output(model, new_nO): # Weights are stored in (nr_out, nr_in) format, so we're basically # just adding rows here. if smaller.has_dim("nO"): - larger_W[:smaller.get_dim("nO")] = smaller_W - larger_b[:smaller.get_dim("nO")] = smaller_b + larger_W[: smaller.get_dim("nO")] = smaller_W + larger_b[: smaller.get_dim("nO")] = smaller_b for i in range(smaller.get_dim("nO"), new_nO): model.attrs["unseen_classes"].add(i) diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index c674046af..58f647b67 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -21,9 +21,7 @@ class SimpleNER(Pipe): self.model = model self.cfg = {"labels": []} self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), - normalize=True, - missing_value=None + names=self.get_tag_names(), normalize=True, missing_value=None ) assert self.model is not None @@ -38,21 +36,21 @@ class SimpleNER(Pipe): def add_label(self, label): if label not in self.cfg["labels"]: self.cfg["labels"].append(label) - + def get_tag_names(self): if self.is_biluo: return ( - [f"B-{label}" for label in self.labels] + - [f"I-{label}" for label in self.labels] + - [f"L-{label}" for label in self.labels] + - [f"U-{label}" for label in self.labels] + - ["O"] + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + [f"L-{label}" for label in self.labels] + + [f"U-{label}" for label in self.labels] 
+ + ["O"] ) else: return ( - [f"B-{label}" for label in self.labels] + - [f"I-{label}" for label in self.labels] + - ["O"] + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + ["O"] ) def predict(self, docs: List[Doc]) -> List[Floats2d]: @@ -108,7 +106,7 @@ class SimpleNER(Pipe): def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): self.cfg.update(kwargs) - if not hasattr(get_examples, '__call__'): + if not hasattr(get_examples, "__call__"): gold_tuples = get_examples get_examples = lambda: gold_tuples labels = _get_labels(get_examples()) @@ -117,14 +115,12 @@ class SimpleNER(Pipe): labels = self.labels n_actions = self.model.attrs["get_num_actions"](len(labels)) self.model.set_dim("nO", n_actions) - self.model.initialize() + self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) link_vectors_to_models(self.vocab) self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), - normalize=True, - missing_value=None + names=self.get_tag_names(), normalize=True, missing_value=None ) return sgd @@ -135,7 +131,7 @@ class SimpleNER(Pipe): def _has_ner(eg): for ner_tag in eg.gold.ner: - if ner_tag != "-" and ner_tag != None: + if ner_tag != "-" and ner_tag is not None: return True else: return False @@ -145,7 +141,7 @@ def _get_labels(examples): labels = set() for eg in examples: for ner_tag in eg.token_annotation.entities: - if ner_tag != 'O' and ner_tag != '-': - _, label = ner_tag.split('-', 1) + if ner_tag != "O" and ner_tag != "-": + _, label = ner_tag.split("-", 1) labels.add(label) return list(sorted(labels)) diff --git a/spacy/scorer.py b/spacy/scorer.py index 288da23aa..af74db80e 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -98,7 +98,9 @@ class Scorer(object): for name, component in pipeline: if name == "textcat": self.textcat_multilabel = component.model.attrs["multi_label"] - self.textcat_positive_label = 
component.cfg.get("positive_label", None) + self.textcat_positive_label = component.cfg.get( + "positive_label", None + ) for label in component.cfg.get("labels", []): self.textcat_auc_per_cat[label] = ROCAUCScore() self.textcat_f_per_cat[label] = PRFScore() @@ -119,19 +121,19 @@ class Scorer(object): @property def morphs_acc(self): - """RETURNS (float): Morph tag accuracy (morphological features, + """RETURNS (float): Morph tag accuracy (morphological features, i.e. `Token.morph`). """ - return self.morphs.fscore * 100 + return self.morphs.fscore * 100 @property def morphs_per_type(self): - """RETURNS (dict): Scores per dependency label. + """RETURNS (dict): Scores per dependency label. """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.morphs_per_feat.items() - } + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.morphs_per_feat.items() + } @property def sent_p(self): @@ -302,7 +304,15 @@ class Scorer(object): gold_morphs_per_feat = {} gold_sent_starts = set() gold_ents = set(tags_to_entities(orig.entities)) - for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts): + for id_, tag, pos, morph, head, dep, sent_start in zip( + orig.ids, + orig.tags, + orig.pos, + orig.morphs, + orig.heads, + orig.deps, + orig.sent_starts, + ): gold_tags.add((id_, tag)) gold_pos.add((id_, pos)) gold_morphs.add((id_, morph)) @@ -400,7 +410,10 @@ class Scorer(object): self.pos.score_set(cand_pos, gold_pos) self.morphs.score_set(cand_morphs, gold_morphs) for field in self.morphs_per_feat: - self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set())) + self.morphs_per_feat[field].score_set( + cand_morphs_per_feat.get(field, set()), + gold_morphs_per_feat.get(field, set()), + ) self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) 
self.labelled.score_set(cand_deps, gold_deps) for dep in self.labelled_per_dep: @@ -412,7 +425,9 @@ class Scorer(object): ) if ( len(gold.cats) > 0 - and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats) + and set(self.textcat_f_per_cat) + == set(self.textcat_auc_per_cat) + == set(gold.cats) and set(gold.cats) == set(doc.cats) ): goldcat = max(gold.cats, key=gold.cats.get) @@ -424,10 +439,10 @@ class Scorer(object): ) for label in set(gold.cats): self.textcat_auc_per_cat[label].score_set( - doc.cats[label], gold.cats[label] + doc.cats[label], gold.cats[label] ) self.textcat_f_per_cat[label].score_set( - set([label]) & set([candcat]), set([label]) & set([goldcat]) + set([label]) & set([candcat]), set([label]) & set([goldcat]) ) elif len(self.textcat_f_per_cat) > 0: model_labels = set(self.textcat_f_per_cat) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 879334056..b9c230516 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(en_vocab, default_ner(), **config) ner.begin_training([]) ner(doc) @@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(en_vocab, default_ner(), **config) ner.begin_training([]) 
ner(doc) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index f9663ba32..893465b45 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,9 +1,8 @@ import pytest -from thinc.api import Adam, NumpyOps +from thinc.api import Adam from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab - from spacy.pipeline.defaults import default_parser, default_ner from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer @@ -17,7 +16,12 @@ def vocab(): @pytest.fixture def parser(vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(vocab, default_parser(), **config) return parser @@ -58,7 +62,12 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner1 = EntityRecognizer(Vocab(), default_ner(), **config) ner1.add_label("C") ner1.add_label("B") diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 5d265261f..42b62251e 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -138,7 +138,12 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(doc.vocab, default_parser(), **config) 
parser.moves.add_action(0, "") parser.moves.add_action(1, "") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index b0a8109dc..e82de03bf 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -138,7 +138,12 @@ def test_accept_blocked_token(): # 1. test normal behaviour nlp1 = English() doc1 = nlp1("I live in New York") - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config) assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] @@ -157,7 +162,12 @@ def test_accept_blocked_token(): # 2. test blocking behaviour nlp2 = English() doc2 = nlp2("I live in New York") - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config) # set "New York" to a blocked entity @@ -215,7 +225,12 @@ def test_overwrite_token(): assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] # Check that a new ner can overwrite O - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(doc.vocab, default_ner(), **config) ner2.moves.add_action(5, "") ner2.add_label("GPE") diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 7f3e981ea..d88517fb5 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -28,7 +28,12 @@ def tok2vec(): @pytest.fixture def 
parser(vocab, arc_eager): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } return Parser(vocab, model=default_parser(), moves=arc_eager, **config) diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index fa5d59f9e..841eb058c 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -94,7 +94,12 @@ def test_beam_advance_too_few_scores(beam, scores): def test_beam_parse(): nlp = Language() - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser") nlp.parser.add_label("nsubj") nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index ccf7d3ba3..37a9136aa 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -16,7 +16,12 @@ def vocab(): @pytest.fixture def parser(vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(vocab, default_parser(), **config) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 32b434e04..62c7fbf17 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -264,11 +264,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] def test_overfitting_IO(): # Simple 
test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() - nlp.add_pipe(nlp.create_pipe('sentencizer')) + nlp.add_pipe(nlp.create_pipe("sentencizer")) # Add a custom component to recognize "Russ Cochran" as an entity for the example training data ruler = EntityRuler(nlp) - patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}] + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} + ] ruler.add_patterns(patterns) nlp.add_pipe(ruler) @@ -285,7 +287,11 @@ def test_overfitting_IO(): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb}) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index f9307afc2..f052c4380 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -15,8 +15,17 @@ def test_label_types(): TRAIN_DATA = [ - ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}), - ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}), + ( + "I like green eggs", + { + "morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], + "pos": ["NOUN", "VERB", "ADJ", "NOUN"], + }, + ), + ( + "Eat blue ham", + {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}, + ), ] @@ -38,7 +47,12 @@ def test_overfitting_IO(): # test the trained model test_text = "I like blue eggs" doc 
= nlp(test_text) - gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"] + gold_morphs = [ + "Feat=N|POS=NOUN", + "Feat=V|POS=VERB", + "Feat=J|POS=ADJ", + "Feat=N|POS=NOUN", + ] assert gold_morphs == [t.morph_ for t in doc] # Also test the results are still the same after IO diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py index 9d4acf2fd..024d7bd26 100644 --- a/spacy/tests/pipeline/test_simple_ner.py +++ b/spacy/tests/pipeline/test_simple_ner.py @@ -1,30 +1,31 @@ import pytest from collections import namedtuple - from thinc.api import NumpyOps from spacy.ml._biluo import BILUO, _get_transition_table -from spacy.pipeline.simple_ner import SimpleNER -import spacy -@pytest.fixture(params=[ - ["PER", "ORG", "LOC", "MISC"], - ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"] -]) +@pytest.fixture( + params=[ + ["PER", "ORG", "LOC", "MISC"], + ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"], + ] +) def labels(request): return request.param + @pytest.fixture def ops(): return NumpyOps() + def _get_actions(labels): action_names = ( - [f"B{label}" for label in labels] + \ - [f"I{label}" for label in labels] + \ - [f"L{label}" for label in labels] + \ - [f"U{label}" for label in labels] + \ - ["O"] + [f"B{label}" for label in labels] + + [f"I{label}" for label in labels] + + [f"L{label}" for label in labels] + + [f"U{label}" for label in labels] + + ["O"] ) A = namedtuple("actions", action_names) return A(**{name: i for i, name in enumerate(action_names)}) @@ -228,7 +229,7 @@ def test_transition_table(ops): assert table[0, a.O, a.Uloc] == 1 assert table[0, a.O, a.Uorg] == 1 assert table[0, a.O, a.O] == 1 - + # Last token, prev action was B assert table[1, a.Bper, a.Bper] == 0 assert table[1, a.Bper, a.Bloc] == 0 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 177b6bb3d..6a2d16733 100644 --- 
a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(Vocab(), default_ner(), **config) example = Example(doc=None) example.set_token_annotation( diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 6df437b3c..a37707379 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -196,7 +196,12 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(doc.vocab, default_ner(), **config) # Add the OUT action. I wouldn't have thought this would be necessary... 
ner.moves.add_action(5, "") diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py index 15632bdf8..06b7893a7 100644 --- a/spacy/tests/regression/test_issue3830.py +++ b/spacy/tests/regression/test_issue3830.py @@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels @@ -16,7 +21,12 @@ def test_issue3830_no_subtok(): def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" - config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": True, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py index 4978aba44..f47290b92 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -74,7 +74,12 @@ def test_issue4042_bug2(): output_dir.mkdir() ner1.to_disk(output_dir) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(vocab, default_ner(), **config) ner2.from_disk(output_dir) assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py 
index 946316d85..5e2764618 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -12,7 +12,12 @@ def test_issue4313(): beam_width = 16 beam_density = 0.0001 nlp = English() - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(nlp.vocab, default_ner(), **config) ner.add_label("SOME_LABEL") ner.begin_training([]) diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py index b240f6d4a..10c7868a0 100644 --- a/spacy/tests/regression/test_issue4924.py +++ b/spacy/tests/regression/test_issue4924.py @@ -1,4 +1,3 @@ -import pytest from spacy.language import Language diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 870a980f2..cfb9d7381 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -112,7 +112,7 @@ def test_serialize_custom_nlp(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") + tok2vec = model.get_ref("tok2vec") # noqa: F841 upper = model.get_ref("upper") # check that we have the correct settings, not the default ones @@ -132,7 +132,7 @@ def test_serialize_parser(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") + tok2vec = model.get_ref("tok2vec") # noqa: F841 upper = model.get_ref("upper") # check that we have the correct settings, not the default ones diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 9c4e1f61e..abb5ccb27 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -12,7 +12,12 @@ test_parsers = 
[DependencyParser, EntityRecognizer] @pytest.fixture def parser(en_vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(en_vocab, default_parser(), **config) parser.add_label("nsubj") return parser diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index d3e82296e..e570b1025 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -35,8 +35,10 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE - assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings)) + assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE + assert sorted([s for s in new_vocab1.strings]) == sorted( + strings1 + list(default_strings) + ) @pytest.mark.parametrize("strings1,strings2", test_strings) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index d750a8202..2e1cf2730 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -40,6 +40,7 @@ test_ner_apple = [ ] ] + @pytest.fixture def tagged_doc(): text = "Sarah's sister flew to Silicon Valley via London." 
@@ -184,7 +185,7 @@ def test_tag_score(tagged_doc): tagged_doc, tags=[t.tag_ for t in tagged_doc], pos=[t.pos_ for t in tagged_doc], - morphs=[t.morph_ for t in tagged_doc] + morphs=[t.morph_ for t in tagged_doc], ) scorer.score((tagged_doc, gold)) results = scorer.scores diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 1410755db..a7258449d 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -13,7 +13,7 @@ from spacy.util import minibatch_by_words ([400, 400, 199, 3], [4]), ([400, 400, 199, 3, 200], [3, 2]), ([400, 400, 199, 3, 1], [5]), - ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded + ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded ([400, 400, 199, 3, 1, 200], [3, 3]), ([400, 400, 199, 3, 1, 999], [3, 3]), ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), @@ -28,7 +28,11 @@ def test_util_minibatch(doc_sizes, expected_batches): examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 - batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True)) + batches = list( + minibatch_by_words( + examples=examples, size=batch_size, tolerance=tol, discard_oversize=True + ) + ) assert [len(batch) for batch in batches] == expected_batches max_size = batch_size + batch_size * tol @@ -53,7 +57,9 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches): examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 - batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False)) + batches = list( + minibatch_by_words( + examples=examples, size=batch_size, tolerance=tol, discard_oversize=False + ) + ) assert [len(batch) for batch in batches] == expected_batches - - diff --git a/spacy/util.py b/spacy/util.py index d2d87bef9..ad3dc3635 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -697,7 +697,9 @@ def decaying(start, stop, decay): curr -= decay -def minibatch_by_words(examples, 
size, count_words=len, tolerance=0.2, discard_oversize=False): +def minibatch_by_words( + examples, size, count_words=len, tolerance=0.2, discard_oversize=False +): """Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by themselves, or be discarded if discard_oversize=True.""" From f91e9e8c8437020505c8af07ff9e123ef5324293 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 20 Jun 2020 14:47:17 +0200 Subject: [PATCH 187/187] Remove F841 [ci skip] --- spacy/ml/_biluo.py | 4 ++-- spacy/ml/tb_framework.py | 4 ++-- spacy/tests/serialize/test_serialize_config.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py index 77a2a6a77..ab2bd9e10 100644 --- a/spacy/ml/_biluo.py +++ b/spacy/ml/_biluo.py @@ -80,13 +80,13 @@ def _get_transition_table( B_start, B_end = (0, n_labels) I_start, I_end = (B_end, B_end + n_labels) L_start, L_end = (I_end, I_end + n_labels) - U_start, U_end = (L_end, L_end + n_labels) # noqa: F841 + U_start, U_end = (L_end, L_end + n_labels) # Using ranges allows us to set specific cells, which is necessary to express # that only actions of the same label are valid continuations. 
B_range = numpy.arange(B_start, B_end) I_range = numpy.arange(I_start, I_end) L_range = numpy.arange(L_start, L_end) - O_action = U_end # noqa: F841 + O_action = U_end # If this is the last token and the previous action was B or I, only L # of that label is valid table[1, B_range, L_range] = 1 diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 69b40cbcf..f7dad565e 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -38,7 +38,7 @@ def forward(model, X, is_train): def init(model, X=None, Y=None): - tok2vec = model.get_ref("tok2vec").initialize(X=X) # noqa: F841 + tok2vec = model.get_ref("tok2vec").initialize(X=X) lower = model.get_ref("lower").initialize() if model.attrs["has_upper"]: statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) @@ -46,7 +46,7 @@ def init(model, X=None, Y=None): def resize_output(model, new_nO): - tok2vec = model.get_ref("tok2vec") # noqa: F841 + tok2vec = model.get_ref("tok2vec") lower = model.get_ref("lower") upper = model.get_ref("upper") if not model.attrs["has_upper"]: diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index cfb9d7381..870a980f2 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -112,7 +112,7 @@ def test_serialize_custom_nlp(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") # noqa: F841 + tok2vec = model.get_ref("tok2vec") upper = model.get_ref("upper") # check that we have the correct settings, not the default ones @@ -132,7 +132,7 @@ def test_serialize_parser(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") # noqa: F841 + tok2vec = model.get_ref("tok2vec") upper = model.get_ref("upper") # check that we have the correct settings, not the default ones