From c705a284382fe7fba5cc367ef20adff36ae00cb7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jun 2020 11:22:24 +0200 Subject: [PATCH 01/49] add links to to_dict --- spacy/gold/example.pyx | 10 +++++++++- spacy/tests/test_gold.py | 7 +++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 402228994..b5d1b1402 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -126,7 +126,7 @@ cdef class Example: "doc_annotation": { "cats": dict(self.reference.cats), "entities": biluo_tags_from_doc(self.reference), - "links": [], # TODO + "links": self._links_to_dict() }, "token_annotation": { "ids": [t.i+1 for t in self.reference], @@ -141,6 +141,14 @@ cdef class Example: } } + def _links_to_dict(self): + links = {} + for ent in self.reference.ents: + if ent.kb_id_: + links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0} + return links + + def split_sents(self): """ Split the token annotations into multiple Examples based on sent_starts and return a list of the new Examples""" diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index d98a93f2f..9e63f8a98 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -200,13 +200,16 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): words = ["I flew", "to", "San Francisco", "Valley", "."] spaces = [True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - links = {(len("I flew to "), len("I flew to San Francisco Valley")): {"Q816843": 1.0}} + offset_start = len("I flew to ") + offset_end = len("I flew to San Francisco Valley") + entities = [(offset_start, offset_end, "LOC")] + links = {(offset_start, offset_end): {"Q816843": 1.0}} gold_words = ["I", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links}) assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2] assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""] assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""] + assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0} # additional whitespace tokens in GoldParse words words, spaces = get_words_and_spaces( From 25b0674320c7fcb49921b484129c7e6d4bece272 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jun 2020 11:31:01 +0200 Subject: [PATCH 02/49] clean up --- spacy/language.py | 14 -------------- spacy/syntax/arc_eager.pyx | 6 +++--- spacy/syntax/nonproj.pyx | 4 ++-- spacy/tests/test_gold.py | 2 ++ 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index c168afeea..b9a84e1bb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -646,20 +646,6 @@ class Language(object): sgd(W, dW, key=key) return losses - def preprocess_gold(self, examples): - """Can be called before training to pre-process gold data. By default, - it handles nonprojectivity and adds missing tags to the tag map. - - examples (iterable): `Example` objects. - YIELDS (tuple): `Example` objects. - """ - # TODO: This is deprecated right? 
- for name, proc in self.pipeline: - if hasattr(proc, "preprocess_gold"): - examples = proc.preprocess_gold(examples) - for eg in examples: - yield eg - def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg): """Allocate models, pre-process training data and acquire a trainer and optimizer. Used as a contextmanager. diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 960f9f2c2..1c4484c33 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -459,9 +459,9 @@ cdef class ArcEager(TransitionSystem): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 for example in kwargs.get('gold_parses', []): - heads, labels = nonproj.projectivize(example.token_annotation.heads, - example.token_annotation.deps) - for child, head, label in zip(example.token_annotation.ids, heads, labels): + heads, labels = nonproj.projectivize(example.get_aligned("HEAD"), + example.get_aligned("DEP")) + for child, head, label in zip(example.get_aligned("ID"), heads, labels): if label.upper() == 'ROOT' : label = 'ROOT' if head == child: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 5b1f57d2b..eded53fac 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -78,8 +78,8 @@ def is_decorated(label): def count_decorated_labels(gold_data): freqs = {} for example in gold_data: - proj_heads, deco_deps = projectivize(example.token_annotation.heads, - example.token_annotation.deps) + proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"), + example.get_aligned("DEP")) # set the label to ROOT for each root dependent deco_deps = ['ROOT' if head == i else deco_deps[i] for i, head in enumerate(proj_heads)] diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 9e63f8a98..f76b0c1e1 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -497,6 +497,8 @@ def test_split_sents(merged_dict): Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]), merged_dict ) + assert example.text == "Hi there everyone It is just me" + assert len(get_parses_from_example( example, merge=False, From e30ec9b2a8beebe988e0ecce944ac40a8918c4f9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jun 2020 14:05:35 +0200 Subject: [PATCH 03/49] fix test checking for variants --- examples/experiments/onto-joint/defaults.cfg | 1 - examples/experiments/onto-joint/pretrain.cfg | 1 - .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 1 - .../ptb-joint-pos-dep/defaults.cfg | 1 - spacy/cli/__init__.py | 2 +- spacy/cli/{train_from_config.py => train.py} | 1 - spacy/gold/augment.py | 29 ++++++----------- spacy/gold/corpus.py | 32 ++++++++----------- spacy/tests/test_gold.py | 18 ++--------- 9 files changed, 27 insertions(+), 59 deletions(-) rename spacy/cli/{train_from_config.py => train.py} (99%) diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index f76336d84..337fe0379 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -9,7 +9,6 @@ max_length = 0 limit = 0 # Data augmentation orth_variant_level = 0.0 -noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. 
patience = 1600 diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 40885b6e8..83991f888 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -9,7 +9,6 @@ max_length = 0 limit = 0 # Data augmentation orth_variant_level = 0.0 -noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 905b5b4e0..f1b702a4e 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -6,7 +6,6 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 -noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = 0 diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 7383116e7..1c946ac60 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -6,7 +6,6 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 -noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = -1 diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 2ffbe2d0c..6f09c6884 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,7 +4,7 @@ from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 -from .train_from_config import train_cli # noqa: F401 +from .train import train_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train.py similarity index 99% rename from spacy/cli/train_from_config.py rename to spacy/cli/train.py index 3a4d28356..fb4347158 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train.py @@ -371,7 +371,6 @@ def create_train_batches(nlp, corpus, cfg): train_examples = list( corpus.train_dataset( nlp, - noise_level=cfg["noise_level"], # I think this is deprecated? 
orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py index a129793c8..dda51cda6 100644 --- a/spacy/gold/augment.py +++ b/spacy/gold/augment.py @@ -2,6 +2,15 @@ import random import itertools +def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming + raw_text = example.text + orig_dict = example.to_dict() + variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level) + doc = nlp.make_doc(variant_text) + orig_dict["token_annotation"] = variant_token_annot + return example.from_dict(doc, orig_dict) + + def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): if random.random() >= orth_variant_level: return raw_text, orig_token_dict @@ -98,23 +107,3 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): raw_idx += 1 raw = variant_raw return raw, token_dict - - -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return "".join(_corrupt(c, noise_level) for c in orig) - - -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c in [".", "'", "!", "?", ","]: - return "\n" - else: - return c.lower() diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index d55845fb8..c84f8355f 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -8,7 +8,7 @@ from ..tokens import Doc from .. import util from ..errors import Errors, AlignmentError from .gold_io import read_json_file, json_to_annotations -from .augment import make_orth_variants, add_noise +from .augment import make_orth_variants from .example import Example @@ -148,7 +148,6 @@ class GoldCorpus(object): nlp, gold_preproc=False, max_length=None, - noise_level=0.0, orth_variant_level=0.0, ignore_misaligned=False, ): @@ -160,7 +159,6 @@ class GoldCorpus(object): train_annotations, gold_preproc, max_length=max_length, - noise_level=noise_level, orth_variant_level=orth_variant_level, make_projective=True, ignore_misaligned=ignore_misaligned, @@ -194,33 +192,31 @@ class GoldCorpus(object): annotations, gold_preproc, max_length=None, - noise_level=0.0, orth_variant_level=0.0, make_projective=False, ignore_misaligned=False, ): """ Setting gold_preproc will result in creating a doc per sentence """ for eg_dict in annotations: + token_annot = eg_dict.get("token_annotation", {}) if eg_dict["text"]: - example = Example.from_dict( - nlp.make_doc(eg_dict["text"]), - eg_dict - ) + doc = nlp.make_doc(eg_dict["text"]) + elif "words" in token_annot: + doc = Doc(nlp.vocab, words=token_annot["words"]) else: - example = Example.from_dict( - Doc(nlp.vocab, words=eg_dict["words"]), - eg_dict - ) + raise ValueError("Expecting either 'text' or token_annotation.words annotation") + if gold_preproc: - # TODO: Data augmentation + variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level) + doc = nlp.make_doc(variant_text) + eg_dict["token_annotation"] = variant_token_annot + example = Example.from_dict(doc, eg_dict) examples = example.split_sents() + else: + example = Example.from_dict(doc, eg_dict) examples = [example] + for eg in examples: if (not max_length) or len(eg.predicted) < max_length: - if ignore_misaligned: - try: - _ = 
eg._deprecated_get_gold() - except AlignmentError: - continue yield eg diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index f76b0c1e1..726492138 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -11,6 +11,7 @@ import pytest import srsly from .util import make_tempdir +from ..gold.augment import make_orth_variants_example @pytest.fixture @@ -387,8 +388,8 @@ def test_make_orth_variants(doc): goldcorpus = GoldCorpus(str(json_file), str(json_file)) # due to randomness, test only that this runs with no errors for now - train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) - train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] + train_example = next(goldcorpus.train_dataset(nlp)) + variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) @pytest.mark.parametrize( @@ -499,19 +500,6 @@ def test_split_sents(merged_dict): ) assert example.text == "Hi there everyone It is just me" - assert len(get_parses_from_example( - example, - merge=False, - vocab=nlp.vocab, - make_projective=False) - ) == 2 - assert len(get_parses_from_example( - example, - merge=True, - vocab=nlp.vocab, - make_projective=False - )) == 1 - split_examples = example.split_sents() assert len(split_examples) == 2 assert split_examples[0].text == "Hi there everyone " From 161d8439fab3f2635f20bfc3fb1efa491a359722 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:19:40 +0200 Subject: [PATCH 04/49] Start updating converters --- spacy/cli/converters/__init__.py | 4 +- spacy/cli/converters/conll_ner2json.py | 59 ++++++++++++++------------ spacy/cli/converters/iob2json.py | 40 ++++++----------- spacy/cli/converters/jsonl2json.py | 9 ++-- 4 files changed, 51 insertions(+), 61 deletions(-) diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py index 9dcbf5b13..e44ad407d 100644 --- a/spacy/cli/converters/__init__.py +++ b/spacy/cli/converters/__init__.py @@ -1,4 +1,4 @@ from .conllu2json import conllu2json # noqa: F401 -from .iob2json import iob2json # noqa: F401 +from .iob2json import iob2docs # noqa: F401 from .conll_ner2json import conll_ner2json # noqa: F401 -from .jsonl2json import ner_jsonl2json # noqa: F401 +from .jsonl2docs import ner_jsonl2json # noqa: F401 diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py index b607d5913..8d4139bde 100644 --- a/spacy/cli/converters/conll_ner2json.py +++ b/spacy/cli/converters/conll_ner2json.py @@ -3,15 +3,16 @@ from wasabi import Printer from ...gold import iob_to_biluo from ...lang.xx import MultiLanguage from ...tokens.doc import Doc +from ...vocab import Vocab from ...util import load_model -def conll_ner2json( +def conll_ner2doc( input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs ): """ Convert files in the CoNLL-2003 NER format and similar - whitespace-separated columns into JSON format for use with train cli. + whitespace-separated columns into Doc objects. The first column is the tokens, the final column is the IOB tags. If an additional second column is present, the second column is the tags. @@ -81,17 +82,25 @@ def conll_ner2json( "No document delimiters found. Use `-n` to automatically group " "sentences into documents." 
) + + if model: + nlp = load_model(model) + else: + nlp = MultiLanguage() output_docs = [] - for doc in input_data.strip().split(doc_delimiter): - doc = doc.strip() - if not doc: + for conll_doc in input_data.strip().split(doc_delimiter): + conll_doc = conll_doc.strip() + if not conll_doc: continue - output_doc = [] - for sent in doc.split("\n\n"): - sent = sent.strip() + words = [] + sent_starts = [] + pos_tags = [] + biluo_tags = [] + for conll_sent in conll_doc.split("\n\n"): + conll_sent = conll_sent.strip() if not sent: continue - lines = [line.strip() for line in sent.split("\n") if line.strip()] + lines = [line.strip() for line in conll_sent.split("\n") if line.strip()] cols = list(zip(*[line.split() for line in lines])) if len(cols) < 2: raise ValueError( @@ -99,25 +108,19 @@ def conll_ner2json( "Try checking whitespace and delimiters. See " "https://spacy.io/api/cli#convert" ) - words = cols[0] - iob_ents = cols[-1] - if len(cols) > 2: - tags = cols[1] - else: - tags = ["-"] * len(words) - biluo_ents = iob_to_biluo(iob_ents) - output_doc.append( - { - "tokens": [ - {"orth": w, "tag": tag, "ner": ent} - for (w, tag, ent) in zip(words, tags, biluo_ents) - ] - } - ) - output_docs.append( - {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]} - ) - output_doc = [] + length = len(cols[0]) + words.extend(cols[0]) + sent_stats.extend([True] + [False] * (length - 1)) + biluo_tags.extend(iob_to_biluo(cols[-1])) + pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length) + + doc = Doc(nlp.vocab, words=words) + for i, token in enumerate(doc): + token.tag_ = pos_tags[i] + token.is_sent_start = sent_starts[i] + entities = tags_to_entities(biluo_tags) + doc.ents = [Span(doc, start=s, end=e+1, label=L) for L, s, e in entities] + output_docs.append(doc) return output_docs diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index b6ac234fc..2addc1af4 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,14 +1,15 @@ from wasabi import Printer -from ...gold import iob_to_biluo +from ...gold import iob_to_biluo, tags_to_entities from ...util import minibatch +from .util import merge_sentences from .conll_ner2json import n_sents_info -def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs): +def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs): """ Convert IOB files with one sentence per line and tags separated with '|' - into JSON format for use with train cli. IOB and IOB2 are accepted. + into Doc objects so they can be saved. IOB and IOB2 are accepted. Sample formats: @@ -26,40 +27,25 @@ def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs): def read_iob(raw_sents): - sentences = [] + docs = [] for line in raw_sents: if not line.strip(): continue tokens = [t.split("|") for t in line.split()] if len(tokens[0]) == 3: - words, pos, iob = zip(*tokens) + words, tags, iob = zip(*tokens) elif len(tokens[0]) == 2: words, iob = zip(*tokens) - pos = ["-"] * len(words) + tags = ["-"] * len(words) else: raise ValueError( "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. 
See https://spacy.io/api/cli#convert" ) + doc = Doc(vocab, words=words) + for i, tag in enumerate(pos): + doc[i].tag_ = tag biluo = iob_to_biluo(iob) - sentences.append( - [ - {"orth": w, "tag": p, "ner": ent} - for (w, p, ent) in zip(words, pos, biluo) - ] - ) - sentences = [{"tokens": sent} for sent in sentences] - paragraphs = [{"sentences": [sent]} for sent in sentences] - docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)] + entities = biluo_tags_to_entities(biluo) + doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities] + docs.append(doc) return docs - - -def merge_sentences(docs, n_sents): - merged = [] - for group in minibatch(docs, size=n_sents): - group = list(group) - first = group.pop(0) - to_extend = first["paragraphs"][0]["sentences"] - for sent in group: - to_extend.extend(sent["paragraphs"][0]["sentences"]) - merged.append(first) - return merged diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py index 525063b22..8639a11b9 100644 --- a/spacy/cli/converters/jsonl2json.py +++ b/spacy/cli/converters/jsonl2json.py @@ -4,15 +4,17 @@ from ...gold import docs_to_json from ...util import get_lang_class, minibatch -def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_): +def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_): if lang is None: raise ValueError("No --lang specified, but tokenization required") - json_docs = [] + docs = [] input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")] nlp = get_lang_class(lang)() sentencizer = nlp.create_pipe("sentencizer") for i, batch in enumerate(minibatch(input_examples, size=n_sents)): docs = [] + # TODO: Should we be merging these? We're disrespecting the n_sents + # currently. 
for record in batch: raw_text = record["text"] if "entities" in record: @@ -25,8 +27,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_) spans = [doc.char_span(s, e, label=L) for s, e, L in ents] doc.ents = _cleanup_spans(spans) docs.append(doc) - json_docs.append(docs_to_json(docs, id=i)) - return json_docs + return docs def _cleanup_spans(spans): From c630cfdb5e28a8dbb1126e8e90e0574516fe177b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:20:34 +0200 Subject: [PATCH 05/49] Move converters under spacy.gold --- spacy/{cli => gold}/converters/__init__.py | 0 spacy/{cli => gold}/converters/conll_ner2json.py | 0 spacy/{cli => gold}/converters/conllu2json.py | 0 spacy/{cli => gold}/converters/iob2json.py | 0 spacy/{cli => gold}/converters/jsonl2json.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename spacy/{cli => gold}/converters/__init__.py (100%) rename spacy/{cli => gold}/converters/conll_ner2json.py (100%) rename spacy/{cli => gold}/converters/conllu2json.py (100%) rename spacy/{cli => gold}/converters/iob2json.py (100%) rename spacy/{cli => gold}/converters/jsonl2json.py (100%) diff --git a/spacy/cli/converters/__init__.py b/spacy/gold/converters/__init__.py similarity index 100% rename from spacy/cli/converters/__init__.py rename to spacy/gold/converters/__init__.py diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/gold/converters/conll_ner2json.py similarity index 100% rename from spacy/cli/converters/conll_ner2json.py rename to spacy/gold/converters/conll_ner2json.py diff --git a/spacy/cli/converters/conllu2json.py b/spacy/gold/converters/conllu2json.py similarity index 100% rename from spacy/cli/converters/conllu2json.py rename to spacy/gold/converters/conllu2json.py diff --git a/spacy/cli/converters/iob2json.py b/spacy/gold/converters/iob2json.py similarity index 100% rename from spacy/cli/converters/iob2json.py rename to spacy/gold/converters/iob2json.py diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/gold/converters/jsonl2json.py similarity index 100% rename from spacy/cli/converters/jsonl2json.py rename to spacy/gold/converters/jsonl2json.py From f61d5e3ac354df372cc6320482626856ea027135 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:23:58 +0200 Subject: [PATCH 06/49] Move things around --- spacy/gold/converters/__init__.py | 8 ++++---- .../converters/{conll_ner2json.py => conll_ner2doc.py} | 0 spacy/gold/converters/{iob2json.py => iob2doc.py} | 0 spacy/gold/converters/{jsonl2json.py => jsonl2docs.py} | 0 spacy/gold/converters/util.py | 5 +++++ 5 files changed, 9 insertions(+), 4 deletions(-) rename spacy/gold/converters/{conll_ner2json.py => conll_ner2doc.py} (100%) rename spacy/gold/converters/{iob2json.py => iob2doc.py} (100%) rename spacy/gold/converters/{jsonl2json.py => jsonl2docs.py} (100%) create mode 100644 spacy/gold/converters/util.py diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index e44ad407d..a046466fc 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,4 +1,4 @@ -from .conllu2json import conllu2json # noqa: F401 -from .iob2json import iob2docs # noqa: F401 -from .conll_ner2json import conll_ner2json # noqa: F401 -from .jsonl2docs import ner_jsonl2json # noqa: F401 +from .conllu2docs import conllu2docs # noqa: F401 +from .iob2docs import iob2docs # noqa: F401 +from .conll_ner2docs import conll_ner2docs # noqa: F401 +from .jsonl2docs import ner_jsonl2docs # noqa: F401 diff 
--git a/spacy/gold/converters/conll_ner2json.py b/spacy/gold/converters/conll_ner2doc.py similarity index 100% rename from spacy/gold/converters/conll_ner2json.py rename to spacy/gold/converters/conll_ner2doc.py diff --git a/spacy/gold/converters/iob2json.py b/spacy/gold/converters/iob2doc.py similarity index 100% rename from spacy/gold/converters/iob2json.py rename to spacy/gold/converters/iob2doc.py diff --git a/spacy/gold/converters/jsonl2json.py b/spacy/gold/converters/jsonl2docs.py similarity index 100% rename from spacy/gold/converters/jsonl2json.py rename to spacy/gold/converters/jsonl2docs.py diff --git a/spacy/gold/converters/util.py b/spacy/gold/converters/util.py new file mode 100644 index 000000000..ed9c84203 --- /dev/null +++ b/spacy/gold/converters/util.py @@ -0,0 +1,5 @@ +def merge_sentences(docs, n_sents): + merged = [] + for group in minibatch(docs, size=n_sents): + raise NotImplementedError + return merged From e20a7808672816e8c7c936a3ace63c126c95ff41 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:24:49 +0200 Subject: [PATCH 07/49] Fix naming --- spacy/gold/converters/{conll_ner2doc.py => conll_ner2docs.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/gold/converters/{conll_ner2doc.py => conll_ner2docs.py} (100%) diff --git a/spacy/gold/converters/conll_ner2doc.py b/spacy/gold/converters/conll_ner2docs.py similarity index 100% rename from spacy/gold/converters/conll_ner2doc.py rename to spacy/gold/converters/conll_ner2docs.py From d9a8fdf4b74cf65ad31f28f9a8ee1f20de6fb2fa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:26:36 +0200 Subject: [PATCH 08/49] Fix name --- spacy/gold/converters/{iob2doc.py => iob2docs.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/gold/converters/{iob2doc.py => iob2docs.py} (100%) diff --git a/spacy/gold/converters/iob2doc.py b/spacy/gold/converters/iob2docs.py similarity index 100% rename from spacy/gold/converters/iob2doc.py rename to spacy/gold/converters/iob2docs.py From 3a73d95dccba3d9f04323000ceb438bad0471ea4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:50:13 +0200 Subject: [PATCH 09/49] Update converter to produce DocBin --- spacy/cli/convert.py | 65 ++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 2ffbeb458..e2b6efc33 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -2,6 +2,7 @@ from pathlib import Path from wasabi import Printer import srsly import re +import sys from .converters import conllu2json, iob2json, conll_ner2json from .converters import ner_jsonl2json @@ -11,15 +12,29 @@ from .converters import ner_jsonl2json # matched by file extension and content. To add a converter, add a new # entry to this dict with the file extension mapped to the converter function # imported from /converters. 
-CONVERTERS = { - "conllubio": conllu2json, - "conllu": conllu2json, - "conll": conllu2json, - "ner": conll_ner2json, - "iob": iob2json, - "jsonl": ner_jsonl2json, + +DOC_CONVERTERS = { + "conllubio": conllu2doc, + "conllu": conllu2doc, + "conll": conllu2doc, + "ner": conll_ner2doc, + "iob": iob2doc, + "jsonl": ner_jsonl2doc, + "json": json2docs, } + +ALL_ATTRS = [ + "ORTH", + "TAG", + "HEAD", + "DEP", + "SENT_START", + "ENT_IOB", + "ENT_TYPE", + "LEMMA", + "MORPH", +] # File types FILE_TYPES = ("json", "jsonl", "msg") FILE_TYPES_STDOUT = ("json", "jsonl") @@ -82,7 +97,7 @@ def convert( ner_map = srsly.read_json(ner_map_path) # Use converter function to convert data func = CONVERTERS[converter] - data = func( + docs = func( input_data, n_sents=n_sents, seg_sents=seg_sents, @@ -93,23 +108,27 @@ def convert( no_print=no_print, ner_map=ner_map, ) - if output_dir != "-": - # Export data to a file - suffix = f".{file_type}" - output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) - if file_type == "json": - srsly.write_json(output_file, data) - elif file_type == "jsonl": - srsly.write_jsonl(output_file, data) - elif file_type == "msg": - srsly.write_msgpack(output_file, data) - msg.good(f"Generated output file ({len(data)} documents): {output_file}") + if write_json: + data = docs2json(docs) else: - # Print to stdout - if file_type == "json": + data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes() + + if output_dir == "-": + if write_json: srsly.write_json("-", data) - elif file_type == "jsonl": - srsly.write_jsonl("-", data) + else: + sys.stdout.write(data) + else: + # Export data to a file + if write_json: + suffix = f".{file_type}" + output_file = output_dir / input_path.parts[-1].with_suffix(suffix) + srsly.write_json(output_file, data) + else: + output_file = output_dir / input_path.parts[-1].with_suffix("spacy") + with output_file.open("wb") as file_: + file_.write(data) + msg.good(f"Generated output file ({len(data)} documents): {output_file}") def autodetect_ner_format(input_data): From 95df02875827d9abd93664e164bdb4bed5468a73 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:50:23 +0200 Subject: [PATCH 10/49] Update converters --- spacy/gold/converters/__init__.py | 4 +++- spacy/gold/converters/conll_ner2docs.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index a046466fc..6ccc0f8f5 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,4 +1,6 @@ -from .conllu2docs import conllu2docs # noqa: F401 from .iob2docs import iob2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401 from .jsonl2docs import ner_jsonl2docs # noqa: F401 + +# TODO: Update this one +#from .conllu2docs import conllu2docs # noqa: F401 diff --git a/spacy/gold/converters/conll_ner2docs.py b/spacy/gold/converters/conll_ner2docs.py index 8d4139bde..7042bd7d6 100644 --- a/spacy/gold/converters/conll_ner2docs.py +++ b/spacy/gold/converters/conll_ner2docs.py @@ -7,7 +7,7 @@ from ...vocab import Vocab from ...util import load_model -def conll_ner2doc( +def conll_ner2docs( input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs ): """ From 0d22c6e006e27b34351b6b7ff361f367628fade2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:50:36 +0200 Subject: [PATCH 11/49] Allow DocBin to take list of Doc objects. 
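A rough sketch of the usage this change is meant to enable (illustrative only; the blank pipeline and example texts below are placeholders, not part of the patch):

    import spacy
    from spacy.tokens import DocBin

    nlp = spacy.blank("en")
    docs = [nlp.make_doc("San Francisco considers banning delivery robots"),
            nlp.make_doc("Berlin is a city in Germany")]
    # Seed the DocBin directly from existing Doc objects via the new `docs`
    # keyword, then round-trip through bytes and rebuild the Docs against a
    # shared vocab.
    doc_bin = DocBin(docs=docs)
    data = doc_bin.to_bytes()
    restored = list(DocBin().from_bytes(data).get_docs(nlp.vocab))
    assert len(restored) == len(docs)

Accepting the docs at construction time keeps callers such as the converters to a single call instead of a manual loop of add() calls.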
--- spacy/tokens/_serialize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index d3f49550c..7bf3faab3 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -39,7 +39,7 @@ class DocBin(object): document from the DocBin. """ - def __init__(self, attrs=None, store_user_data=False): + def __init__(self, attrs=None, store_user_data=False, docs=[]): """Create a DocBin object to hold serialized annotations. attrs (list): List of attributes to serialize. 'orth' and 'spacy' are @@ -59,6 +59,8 @@ class DocBin(object): self.user_data = [] self.strings = set() self.store_user_data = store_user_data + for doc in docs: + self.add(docs) def __len__(self): """RETURNS: The number of Doc objects added to the DocBin.""" From 7a846921a36706b58a3ceea1a89e58407956b68b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:55:35 +0200 Subject: [PATCH 12/49] Make spacy convert output docbin --- spacy/cli/convert.py | 223 ++++++++++++++++++++++++++++--------------- 1 file changed, 144 insertions(+), 79 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e2b6efc33..4cf960379 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -4,8 +4,9 @@ import srsly import re import sys -from .converters import conllu2json, iob2json, conll_ner2json -from .converters import ner_jsonl2json +from ..tokens import DocBin +from ..gold.converters import iob2docs, conll_ner2docs, json2docs +from ..gold.converters import ner_jsonl2docs # Converters are matched by file extension except for ner/iob, which are @@ -13,13 +14,13 @@ from .converters import ner_jsonl2json # entry to this dict with the file extension mapped to the converter function # imported from /converters. -DOC_CONVERTERS = { - "conllubio": conllu2doc, - "conllu": conllu2doc, - "conll": conllu2doc, - "ner": conll_ner2doc, - "iob": iob2doc, - "jsonl": ner_jsonl2doc, +CONVERTERS = { + #"conllubio": conllu2docs, TODO + #"conllu": conllu2docs, TODO + #"conll": conllu2docs, TODO + "ner": conll_ner2docs, + "iob": iob2docs, + "jsonl": ner_jsonl2docs, "json": json2docs, } @@ -42,93 +43,58 @@ FILE_TYPES_STDOUT = ("json", "jsonl") def convert( # fmt: off - input_file: ("Input file", "positional", None, str), - output_dir: ("Output directory. 
'-' for stdout.", "positional", None, str) = "-", - file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json", + input_path: ("Input file or directory", "positional", None, Path), + output_dir: ("Output directory.", "positional", None, Path), + file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "spacy", n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1, seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False, converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", - ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, + ner_map: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, lang: ("Language (if tokenizer required)", "option", "l", str) = None, # fmt: on ): """ - Convert files into JSON format for use with train command and other - experiment management functions. If no output_dir is specified, the data - is written to stdout, so you can pipe them forward to a JSON file: - $ spacy convert some_file.conllu > some_file.json + Convert files into json or DocBin format for use with train command and other + experiment management functions. """ + cli_args = locals() no_print = output_dir == "-" + output_dir = Path(output_dir) if output_dir != "-" else "-" msg = Printer(no_print=no_print) - input_path = Path(input_file) - if file_type not in FILE_TYPES_STDOUT and output_dir == "-": - # TODO: support msgpack via stdout in srsly? - msg.fail( - f"Can't write .{file_type} data to stdout", - "Please specify an output directory.", - exits=1, + verify_cli_args(msg, **cli_args) + converter = _get_converter(msg, converter, input_path) + ner_map = srsly.read_json(ner_map) if ner_map is not None else None + for input_loc in walk_directory(input_path): + input_data = input_loc.open("r", encoding="utf-8").read() + # Use converter function to convert data + func = CONVERTERS[converter] + docs = func( + input_data, + n_sents=n_sents, + seg_sents=seg_sents, + append_morphology=morphology, + merge_subtokens=merge_subtokens, + lang=lang, + model=model, + no_print=no_print, + ner_map=ner_map, ) - if not input_path.exists(): - msg.fail("Input file not found", input_path, exits=1) - if output_dir != "-" and not Path(output_dir).exists(): - msg.fail("Output directory not found", output_dir, exits=1) - input_data = input_path.open("r", encoding="utf-8").read() - if converter == "auto": - converter = input_path.suffix[1:] - if converter == "ner" or converter == "iob": - converter_autodetect = autodetect_ner_format(input_data) - if converter_autodetect == "ner": - msg.info("Auto-detected token-per-line NER format") - converter = converter_autodetect - elif converter_autodetect == "iob": - msg.info("Auto-detected sentence-per-line NER format") - converter = converter_autodetect + suffix = f".{file_type}" + subpath = input_loc.relative_to(input_path) + output_file = (output_dir / subpath).with_suffix(suffix) + if not output_file.parent.exists(): + output_file.parent.mkdir(parents=True) + if file_type == "json": + data = docs2json(docs) + srsly.write_json(output_file, docs2json(docs)) else: - msg.warn( - "Can't automatically detect NER format. 
Conversion may not succeed. See https://spacy.io/api/cli#convert" - ) - if converter not in CONVERTERS: - msg.fail(f"Can't find converter for {converter}", exits=1) - ner_map = None - if ner_map_path is not None: - ner_map = srsly.read_json(ner_map_path) - # Use converter function to convert data - func = CONVERTERS[converter] - docs = func( - input_data, - n_sents=n_sents, - seg_sents=seg_sents, - append_morphology=morphology, - merge_subtokens=merge_subtokens, - lang=lang, - model=model, - no_print=no_print, - ner_map=ner_map, - ) - if write_json: - data = docs2json(docs) - else: - data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes() - - if output_dir == "-": - if write_json: - srsly.write_json("-", data) - else: - sys.stdout.write(data) - else: - # Export data to a file - if write_json: - suffix = f".{file_type}" - output_file = output_dir / input_path.parts[-1].with_suffix(suffix) - srsly.write_json(output_file, data) - else: - output_file = output_dir / input_path.parts[-1].with_suffix("spacy") + data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes() with output_file.open("wb") as file_: file_.write(data) - msg.good(f"Generated output file ({len(data)} documents): {output_file}") + msg.good(f"Generated output file ({len(docs)} documents): {output_file}") def autodetect_ner_format(input_data): @@ -148,3 +114,102 @@ def autodetect_ner_format(input_data): if format_guesses["ner"] == 0 and format_guesses["iob"] > 0: return "iob" return None + + +def walk_directory(path): + if not path.is_dir(): + return [path] + paths = [path] + locs = [] + seen = set() + for path in paths: + if str(path) in seen: + continue + seen.add(str(path)) + if path.parts[-1].startswith("."): + continue + elif path.is_dir(): + paths.extend(path.iterdir()) + else: + locs.append(path) + return locs + + +def verify_cli_args( + msg, + input_path, + output_dir, + file_type, + n_sents, + seg_sents, + model, + morphology, + merge_subtokens, + converter, + ner_map, + lang +): + if converter == "ner" or converter == "iob": + input_data = input_path.open("r", encoding="utf-8").read() + converter_autodetect = autodetect_ner_format(input_data) + if converter_autodetect == "ner": + msg.info("Auto-detected token-per-line NER format") + converter = converter_autodetect + elif converter_autodetect == "iob": + msg.info("Auto-detected sentence-per-line NER format") + converter = converter_autodetect + else: + msg.warn( + "Can't automatically detect NER format. Conversion may not", + "succeed. See https://spacy.io/api/cli#convert" + ) + if file_type not in FILE_TYPES_STDOUT and output_dir == "-": + # TODO: support msgpack via stdout in srsly? 
+ msg.fail( + f"Can't write .{file_type} data to stdout", + "Please specify an output directory.", + exits=1, + ) + if not input_path.exists(): + msg.fail("Input file not found", input_path, exits=1) + if output_dir != "-" and not Path(output_dir).exists(): + msg.fail("Output directory not found", output_dir, exits=1) + if input_path.is_dir(): + input_locs = walk_directory(input_path) + if len(input_locs) == 0: + msg.fail("No input files in directory", input_path, exits=1) + file_types = list(set([loc.suffix[1:] for loc in input_locs])) + if len(file_types) >= 2: + file_types = ",".join(file_types) + msg.fail("All input files must be same type", file_types, exits=1) + if converter == "auto": + converter = file_types[0] + else: + converter = input_path.suffix[1:] + if converter not in CONVERTERS: + msg.fail(f"Can't find converter for {converter}", exits=1) + return converter + + +def _get_converter(msg, converter, input_path): + if input_path.is_dir(): + input_path = walk_directory(input_path)[0] + if converter == "auto": + converter = input_path.suffix[1:] + if converter == "ner" or converter == "iob": + with input_path.open() as file_: + input_data = file_.read() + converter_autodetect = autodetect_ner_format(input_data) + if converter_autodetect == "ner": + msg.info("Auto-detected token-per-line NER format") + converter = converter_autodetect + elif converter_autodetect == "iob": + msg.info("Auto-detected sentence-per-line NER format") + converter = converter_autodetect + else: + msg.warn( + "Can't automatically detect NER format. " + "Conversion may not succeed. " + "See https://spacy.io/api/cli#convert" + ) + return converter From 476bcd4c5312ded0dfe06fe0d69687201c318124 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:55:57 +0200 Subject: [PATCH 13/49] Fix import --- spacy/gold/converters/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index 6ccc0f8f5..c1b4b1566 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,6 +1,7 @@ from .iob2docs import iob2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401 from .jsonl2docs import ner_jsonl2docs # noqa: F401 +from .json2docs import json2docs # TODO: Update this one #from .conllu2docs import conllu2docs # noqa: F401 From 91fa2f112671e728706bb009cc0ceb9faaa06d96 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:56:05 +0200 Subject: [PATCH 14/49] Fix docbin --- spacy/tokens/_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 7bf3faab3..8f3e942e3 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -60,7 +60,7 @@ class DocBin(object): self.strings = set() self.store_user_data = store_user_data for doc in docs: - self.add(docs) + self.add(doc) def __len__(self): """RETURNS: The number of Doc objects added to the DocBin.""" From b7a366b435328b7f0e87cbfb11d5780a12980cd8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:56:16 +0200 Subject: [PATCH 15/49] Fix compile in ArcEager --- spacy/syntax/arc_eager.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 1512955a5..0dfcbf885 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -513,7 +513,6 @@ cdef class ArcEager(TransitionSystem): keeps = [i for i, s in enumerate(states) if not 
s.is_final()] states = [states[i] for i in keeps] golds = [ArcEagerGold(self, states[i], examples[i]) for i in keeps] - cdef StateClass s n_steps = sum([len(s.queue) * 4 for s in states]) return states, golds, n_steps From 3241acbe0b8a60c4cddd57f5f19bae20a19a31c3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:56:28 +0200 Subject: [PATCH 16/49] Fix import --- spacy/gold/converters/iob2docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold/converters/iob2docs.py b/spacy/gold/converters/iob2docs.py index 2addc1af4..7901569fa 100644 --- a/spacy/gold/converters/iob2docs.py +++ b/spacy/gold/converters/iob2docs.py @@ -3,7 +3,7 @@ from wasabi import Printer from ...gold import iob_to_biluo, tags_to_entities from ...util import minibatch from .util import merge_sentences -from .conll_ner2json import n_sents_info +from .conll_ner2docs import n_sents_info def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs): From f5780cb160d1787d900bc1ca5f8795958a0474fb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:59:39 +0200 Subject: [PATCH 17/49] Serialize all attrs by default --- spacy/tokens/_serialize.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 8f3e942e3..3072787ae 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -9,6 +9,19 @@ from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors +ALL_ATTRS = ( + "ORTH", + "TAG", + "HEAD", + "DEP", + "SENT_START", + "ENT_IOB", + "ENT_TYPE", + "LEMMA", + "MORPH" +) + + class DocBin(object): """Pack Doc objects for binary serialization. @@ -39,7 +52,7 @@ class DocBin(object): document from the DocBin. """ - def __init__(self, attrs=None, store_user_data=False, docs=[]): + def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]): """Create a DocBin object to hold serialized annotations. attrs (list): List of attributes to serialize. 
'orth' and 'spacy' are @@ -49,7 +62,6 @@ class DocBin(object): DOCS: https://spacy.io/api/docbin#init """ - attrs = attrs or [] attrs = sorted([intify_attr(attr) for attr in attrs]) self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] From 5d89b1840ec9c3556d55bfcfedbf77bfe4ebb249 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 16:00:14 +0200 Subject: [PATCH 18/49] Update converter --- spacy/cli/convert.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 4cf960379..3b3aa0b91 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -25,17 +25,6 @@ CONVERTERS = { } -ALL_ATTRS = [ - "ORTH", - "TAG", - "HEAD", - "DEP", - "SENT_START", - "ENT_IOB", - "ENT_TYPE", - "LEMMA", - "MORPH", -] # File types FILE_TYPES = ("json", "jsonl", "msg") FILE_TYPES_STDOUT = ("json", "jsonl") @@ -91,7 +80,7 @@ def convert( data = docs2json(docs) srsly.write_json(output_file, docs2json(docs)) else: - data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes() + data = DocBin(docs=docs).to_bytes() with output_file.open("wb") as file_: file_.write(data) msg.good(f"Generated output file ({len(docs)} documents): {output_file}") From f1756a6a222c99939d9433c64574a648df701edb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 16:02:40 +0200 Subject: [PATCH 19/49] Remove jsonl converter --- spacy/cli/convert.py | 2 -- spacy/gold/converters/__init__.py | 1 - spacy/gold/converters/jsonl2docs.py | 51 ----------------------------- 3 files changed, 54 deletions(-) delete mode 100644 spacy/gold/converters/jsonl2docs.py diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 3b3aa0b91..f4bddac39 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -6,7 +6,6 @@ import sys from ..tokens import DocBin from ..gold.converters import iob2docs, conll_ner2docs, json2docs -from ..gold.converters import ner_jsonl2docs # Converters are matched by file extension except for ner/iob, which are @@ -20,7 +19,6 @@ CONVERTERS = { #"conll": conllu2docs, TODO "ner": conll_ner2docs, "iob": iob2docs, - "jsonl": ner_jsonl2docs, "json": json2docs, } diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index c1b4b1566..0a1242fb4 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,6 +1,5 @@ from .iob2docs import iob2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401 -from .jsonl2docs import ner_jsonl2docs # noqa: F401 from .json2docs import json2docs # TODO: Update this one diff --git a/spacy/gold/converters/jsonl2docs.py b/spacy/gold/converters/jsonl2docs.py deleted file mode 100644 index 8639a11b9..000000000 --- a/spacy/gold/converters/jsonl2docs.py +++ /dev/null @@ -1,51 +0,0 @@ -import srsly - -from ...gold import docs_to_json -from ...util import get_lang_class, minibatch - - -def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_): - if lang is None: - raise ValueError("No --lang specified, but tokenization required") - docs = [] - input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")] - nlp = get_lang_class(lang)() - sentencizer = nlp.create_pipe("sentencizer") - for i, batch in enumerate(minibatch(input_examples, size=n_sents)): - docs = [] - # TODO: Should we be merging these? We're disrespecting the n_sents - # currently. 
- for record in batch: - raw_text = record["text"] - if "entities" in record: - ents = record["entities"] - else: - ents = record["spans"] - ents = [(e["start"], e["end"], e["label"]) for e in ents] - doc = nlp.make_doc(raw_text) - sentencizer(doc) - spans = [doc.char_span(s, e, label=L) for s, e, L in ents] - doc.ents = _cleanup_spans(spans) - docs.append(doc) - return docs - - -def _cleanup_spans(spans): - output = [] - seen = set() - for span in spans: - if span is not None: - # Trim whitespace - while len(span) and span[0].is_space: - span = span[1:] - while len(span) and span[-1].is_space: - span = span[:-1] - if not len(span): - continue - for i in range(span.start, span.end): - if i in seen: - break - else: - output.append(span) - seen.update(range(span.start, span.end)) - return output From 7360d3db72e6663dd56c02f3dcdbf3874ebdc872 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 16:02:53 +0200 Subject: [PATCH 20/49] Add json2docs converter --- spacy/gold/converters/json2docs.py | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 spacy/gold/converters/json2docs.py diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py new file mode 100644 index 000000000..98219bb04 --- /dev/null +++ b/spacy/gold/converters/json2docs.py @@ -0,0 +1,38 @@ +import tempfile +import contextlib +import shutil +from pathlib import Path +from ..gold_io import read_json_file +from ..example import annotations2doc +from ..example import _fix_legacy_dict_data, _parse_example_dict_data +from ...util import load_model +from ...lang.xx import MultiLanguage + +@contextlib.contextmanager +def make_tempdir(): + d = Path(tempfile.mkdtemp()) + yield d + shutil.rmtree(str(d)) + + +def json2docs( + input_data, + model=None, + **kwargs +): + nlp = load_model(model) if model is not None else MultiLanguage() + docs = [] + with make_tempdir() as tmp_dir: + json_path = Path(tmp_dir) / "data.json" + with (json_path).open("w") as file_: + file_.write(input_data) + for json_annot in read_json_file(json_path): + example_dict = _fix_legacy_dict_data(json_annot) + tok_dict, doc_dict = _parse_example_dict_data(example_dict) + doc = annotations2doc( + nlp.vocab, + tok_dict, + doc_dict + ) + docs.append(doc) + return docs From 0de361cd00f7a841b112457f07800a110073bf77 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 18:31:07 +0200 Subject: [PATCH 21/49] Draft Corpus class for DocBin --- spacy/gold/corpus_docbin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus_docbin.py index 3ebaa7376..a9562944c 100644 --- a/spacy/gold/corpus_docbin.py +++ b/spacy/gold/corpus_docbin.py @@ -5,7 +5,7 @@ from .example import Example from ..tokens import DocBin -class GoldCorpus(object): +class Corpus: """An annotated corpus, using the JSON file format. Manages annotations for tagging, dependency parsing and NER. 
@@ -38,7 +38,7 @@ class GoldCorpus(object): continue elif path.is_dir(): paths.extend(path.iterdir()) - elif path.parts[-1].endswith(".spacy") + elif path.parts[-1].endswith(".spacy"): locs.append(path) return locs From 11fa0658f739b31effadfef5c2f277674fc1a7b8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 20:12:19 +0200 Subject: [PATCH 22/49] Work on train script --- spacy/cli/train.py | 151 ++++++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 85 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index fb4347158..64eb89d13 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -12,7 +12,7 @@ import thinc.schedules from thinc.api import Model, use_pytorch_for_gpu_memory import random -from ..gold import GoldCorpus +from ..gold.corpus_docbin import Corpus from ..lookups import Lookups from .. import util from ..errors import Errors @@ -148,26 +148,8 @@ def train_cli( command. """ util.set_env_log(verbose) + verify_cli_args(**locals()) - # Make sure all files and paths exists if they are needed - if not config_path or not config_path.exists(): - msg.fail("Config file not found", config_path, exits=1) - if not train_path or not train_path.exists(): - msg.fail("Training data not found", train_path, exits=1) - if not dev_path or not dev_path.exists(): - msg.fail("Development data not found", dev_path, exits=1) - if output_path is not None: - if not output_path.exists(): - output_path.mkdir() - msg.good(f"Created output directory: {output_path}") - elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: - msg.warn( - "Output directory is not empty.", - "This can lead to unintended side effects when saving the model. " - "Please use an empty directory or a different path instead. If " - "the specified output path doesn't exist, the directory will be " - "created for you.", - ) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) tag_map = {} @@ -176,9 +158,7 @@ def train_cli( weights_data = None if init_tok2vec is not None: - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) - with init_tok2vec.open("rb") as file_: + with init_tok2vec.open("rb") as file_: weights_data = file_.read() if use_gpu >= 0: @@ -198,6 +178,7 @@ def train_cli( ) + def train( config_path, data_paths, @@ -221,60 +202,9 @@ def train( nlp = util.load_model_from_config(nlp_config) optimizer = training["optimizer"] limit = training["limit"] - msg.info("Loading training corpus") - corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) - # verify textcat config + corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit) if "textcat" in nlp_config["pipeline"]: - textcat_labels = set(nlp.get_pipe("textcat").labels) - textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] - - # check whether the setting 'exclusive_classes' corresponds to the provided training data - if textcat_multilabel: - multilabel_found = False - for eg in corpus.train_annotations: - cats = eg.reference.cats - textcat_labels.update(cats.keys()) - if list(cats.values()).count(1.0) != 1: - multilabel_found = True - if not multilabel_found: - msg.warn( - "The textcat training instances look like they have " - "mutually exclusive classes. Set 'exclusive_classes' " - "to 'true' in the config to train a classifier with " - "mutually exclusive classes more accurately." 
- ) - else: - for eg in corpus.train_annotations: - cats = eg.reference.cats - textcat_labels.update(cats.keys()) - if list(cats.values()).count(1.0) != 1: - msg.fail( - "Some textcat training instances do not have exactly " - "one positive label. Set 'exclusive_classes' " - "to 'false' in the config to train a classifier with classes " - "that are not mutually exclusive." - ) - msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") - nlp.get_pipe("textcat").labels = tuple(textcat_labels) - - # if 'positive_label' is provided: double check whether it's in the data and the task is binary - if nlp_config["pipeline"]["textcat"].get("positive_label", None): - textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) - pos_label = nlp_config["pipeline"]["textcat"]["positive_label"] - if pos_label not in textcat_labels: - msg.fail( - f"The textcat's 'positive_label' config setting '{pos_label}' " - f"does not match any label in the training data.", - exits=1, - ) - if len(textcat_labels) != 2: - msg.fail( - f"A textcat 'positive_label' '{pos_label}' was " - f"provided for training data that does not appear to be a " - f"binary classification problem with two labels.", - exits=1, - ) - + verify_textcat_config(nlp, nlp_config) if training.get("resume", False): msg.info("Resuming training") nlp.resume_training() @@ -312,6 +242,7 @@ def train( ) tok2vec.from_bytes(weights_data) + msg.info("Loading training corpus") train_batches = create_train_batches(nlp, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) @@ -368,15 +299,7 @@ def train( def create_train_batches(nlp, corpus, cfg): epochs_todo = cfg.get("max_epochs", 0) while True: - train_examples = list( - corpus.train_dataset( - nlp, - orth_variant_level=cfg["orth_variant_level"], - gold_preproc=cfg["gold_preproc"], - max_length=cfg["max_length"], - ignore_misaligned=True, - ) - ) + train_examples = list(corpus.train_dataset(nlp)) if len(train_examples) == 0: raise ValueError(Errors.E988) @@ -598,3 +521,61 @@ def update_meta(training, nlp, info): nlp.meta["performance"][metric] = info["other_scores"][metric] for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] + + +def verify_cli_args( + train_path, + dev_path, + config_path, + output_path=None, + init_tok2vec=None, + raw_text=None, + verbose=False, + use_gpu=-1, + tag_map_path=None, + omit_extra_lookups=False, +): + # Make sure all files and paths exists if they are needed + if not config_path or not config_path.exists(): + msg.fail("Config file not found", config_path, exits=1) + if not train_path or not train_path.exists(): + msg.fail("Training data not found", train_path, exits=1) + if not dev_path or not dev_path.exists(): + msg.fail("Development data not found", dev_path, exits=1) + if output_path is not None: + if not output_path.exists(): + output_path.mkdir() + msg.good(f"Created output directory: {output_path}") + elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: + msg.warn( + "Output directory is not empty.", + "This can lead to unintended side effects when saving the model. " + "Please use an empty directory or a different path instead. 
If " + "the specified output path doesn't exist, the directory will be " + "created for you.", + ) + if init_tok2vec is not None and not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + + +def verify_textcat_config(nlp, nlp_config): + msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") + nlp.get_pipe("textcat").labels = tuple(textcat_labels) + # if 'positive_label' is provided: double check whether it's in the data and + # the task is binary + if nlp_config["pipeline"]["textcat"].get("positive_label", None): + textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) + pos_label = nlp_config["pipeline"]["textcat"]["positive_label"] + if pos_label not in textcat_labels: + msg.fail( + f"The textcat's 'positive_label' config setting '{pos_label}' " + f"does not match any label in the training data.", + exits=1, + ) + if len(textcat_labels) != 2: + msg.fail( + f"A textcat 'positive_label' '{pos_label}' was " + f"provided for training data that does not appear to be a " + f"binary classification problem with two labels.", + exits=1, + ) From 0a8b6631a26da1bf0959bd67d623b955e985dcec Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 20:12:31 +0200 Subject: [PATCH 23/49] Update Corpus --- spacy/gold/corpus_docbin.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus_docbin.py index a9562944c..8ee1e9a6c 100644 --- a/spacy/gold/corpus_docbin.py +++ b/spacy/gold/corpus_docbin.py @@ -1,5 +1,6 @@ import srsly from pathlib import Path +import random from .. import util from .example import Example from ..tokens import DocBin @@ -11,14 +12,13 @@ class Corpus: DOCS: https://spacy.io/api/goldcorpus """ - def __init__(self, vocab, train_loc, dev_loc, limit=0): + def __init__(self, train_loc, dev_loc, limit=0): """Create a GoldCorpus. train (str / Path): File or directory of training data. dev (str / Path): File or directory of development data. RETURNS (GoldCorpus): The newly created object. """ - self.vocab = vocab self.train_loc = train_loc self.dev_loc = dev_loc @@ -42,7 +42,12 @@ class Corpus: locs.append(path) return locs - def read_docbin(self, locs, limit=0): + def make_examples(self, nlp, reference_docs, **kwargs): + for reference in reference_docs: + predicted = nlp.make_doc(reference.text) + yield Example(predicted, reference) + + def read_docbin(self, vocab, locs, limit=0): """ Yield training examples as example dicts """ i = 0 for loc in locs: @@ -50,31 +55,26 @@ class Corpus: if loc.parts[-1].endswith(".spacy"): with loc.open("rb") as file_: doc_bin = DocBin().from_bytes(file_.read()) - docs = list(doc_bin.get_docs(self.vocab)) - assert len(docs) % 2 == 0 - # Pair up the docs into the (predicted, reference) pairs. 
-                for i in range(0, len(docs), 2):
-                    predicted = docs[i]
-                    reference = docs[i+1]
-                    yield Example(predicted, reference)
+                yield from doc_bin.get_docs(vocab)
 
-    def count_train(self):
+    def count_train(self, nlp):
         """Returns count of words in train examples"""
         n = 0
         i = 0
-        for example in self.train_dataset():
+        for example in self.train_dataset(nlp):
             n += len(example.predicted)
             if self.limit and i >= self.limit:
                 break
             i += 1
         return n
 
-    def train_dataset(self):
-        examples = self.read_docbin(self.walk_corpus(self.train_loc))
+    def train_dataset(self, nlp, **kwargs):
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
+        examples = list(self.make_examples(nlp, ref_docs, **kwargs))
         random.shuffle(examples)
         yield from examples
 
-    def dev_dataset(self):
-        examples = self.read_docbin(self.walk_corpus(self.dev_loc))
-        random.shuffle(examples)
+    def dev_dataset(self, nlp):
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
+        examples = self.make_examples(nlp, ref_docs)
         yield from examples

From 652f31d3ee1021f528b9b543de1d82b5c59b1262 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 20:12:54 +0200
Subject: [PATCH 24/49] Update DocBin

---
 spacy/tokens/_serialize.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 3072787ae..febfbd670 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -14,7 +14,6 @@ ALL_ATTRS = (
     "TAG",
     "HEAD",
     "DEP",
-    "SENT_START",
     "ENT_IOB",
     "ENT_TYPE",
     "LEMMA",
@@ -112,8 +111,7 @@ class DocBin(object):
         for i in range(len(self.tokens)):
             tokens = self.tokens[i]
             spaces = self.spaces[i]
-            words = [vocab.strings[orth] for orth in tokens[:, orth_col]]
-            doc = Doc(vocab, words=words, spaces=spaces)
+            doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
             if self.store_user_data:

From fa86aa581d67900929d2bcbb09efa93eb5ea7abb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 20:15:21 +0200
Subject: [PATCH 25/49] Allocate Doc before starting to add words

---
 spacy/tokens/doc.pyx | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f9e7c97dd..686f3be54 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -3,6 +3,7 @@ cimport cython
 cimport numpy as np
 from libc.string cimport memcpy, memset
 from libc.math cimport sqrt
+from libc.stdint cimport int32_t, uint64_t
 
 from collections import Counter
 import numpy
@@ -186,7 +187,7 @@ cdef class Doc:
         DOCS: https://spacy.io/api/doc#init
         """
         self.vocab = vocab
-        size = 20
+        size = max(20, (len(words) if words is not None else 0))
         self.mem = Pool()
         # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
         # However, we need to remember the true starting places, so that we can
@@ -211,7 +212,6 @@ cdef class Doc:
         self.user_data = {} if user_data is None else user_data
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
-        cdef unicode orth
         cdef bint has_space
         if orths_and_spaces is None and words is not None:
             if spaces is None:
@@ -219,19 +219,22 @@ cdef class Doc:
             elif len(spaces) != len(words):
                 raise ValueError(Errors.E027)
             orths_and_spaces = zip(words, spaces)
+        cdef const LexemeC* lexeme
         if orths_and_spaces is not None:
+            orths_and_spaces = list(orths_and_spaces)
             for orth_space in orths_and_spaces:
                 if isinstance(orth_space, unicode):
-                    orth = orth_space
+ lexeme = self.vocab.get(self.mem, orth_space) has_space = True elif isinstance(orth_space, bytes): raise ValueError(Errors.E028.format(value=orth_space)) + elif isinstance(orth_space[0], unicode): + lexeme = self.vocab.get(self.mem, orth_space[0]) + has_space = orth_space[1] else: - orth, has_space = orth_space - # Note that we pass self.mem here --- we have ownership, if LexemeC - # must be created. - self.push_back( - self.vocab.get(self.mem, orth), has_space) + lexeme = self.vocab.get_by_orth(self.mem, orth_space[0]) + has_space = orth_space[1] + self.push_back(lexeme, has_space) # Tough to decide on policy for this. Is an empty doc tagged and parsed? # There's no information we'd like to add to it, so I guess so? if self.length == 0: @@ -753,6 +756,8 @@ cdef class Doc: return dict(counts) def _realloc(self, new_size): + if new_size < self.max_length: + return self.max_length = new_size n = new_size + (PADDING * 2) # What we're storing is a "padded" array. We've jumped forward PADDING From 6d821b2e5559151f28880da0ff4a90e391e87657 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 20:17:13 +0200 Subject: [PATCH 26/49] Make doc.from_array several times faster --- spacy/tokens/doc.pyx | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 686f3be54..72a16b854 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -806,12 +806,14 @@ cdef class Doc: if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) - cdef int i, col, abs_head_index + cdef int i, col + cdef int32_t abs_head_index cdef attr_id_t attr_id cdef TokenC* tokens = self.c cdef int length = len(array) if length != len(self): raise ValueError("Cannot set array values longer than the document.") + # Get set up for fast loading cdef Pool mem = Pool() cdef int n_attrs = len(attrs) @@ -822,33 +824,52 @@ cdef class Doc: attr_ids[i] = attr_id if len(array.shape) == 1: array = array.reshape((array.size, 1)) + cdef np.ndarray transposed_array = numpy.ascontiguousarray(array.T) + values = transposed_array.data + stride = transposed_array.shape[1] # Check that all heads are within the document bounds if HEAD in attrs: col = attrs.index(HEAD) for i in range(length): # cast index to signed int - abs_head_index = numpy.int32(array[i, col]) + i + abs_head_index = values[col * stride + i] + abs_head_index += i if abs_head_index < 0 or abs_head_index >= length: - raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col]))) + raise ValueError( + Errors.E190.format( + index=i, + value=array[i, col], + rel_head_index=abs_head_index-i + ) + ) # Do TAG first. 
This lets subsequent loop override stuff like POS, LEMMA if TAG in attrs: col = attrs.index(TAG) for i in range(length): - if array[i, col] != 0: - self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) + value = values[col * stride + i] + if value != 0: + self.vocab.morphology.assign_tag(&tokens[i], value) # Verify ENT_IOB are proper integers if ENT_IOB in attrs: iob_strings = Token.iob_strings() col = attrs.index(ENT_IOB) + n_iob_strings = len(iob_strings) for i in range(length): - if array[i, col] not in range(0, len(iob_strings)): - raise ValueError(Errors.E982.format(values=iob_strings, value=array[i, col])) + value = values[col * stride + i] + if value < 0 or value >= n_iob_strings: + raise ValueError( + Errors.E982.format( + values=iob_strings, + value=value + ) + ) # Now load the data for i in range(length): token = &self.c[i] for j in range(n_attrs): if attr_ids[j] != TAG: - Token.set_struct_attr(token, attr_ids[j], array[i, j]) + value = values[j * stride + i] + Token.set_struct_attr(token, attr_ids[j], value) # Set flags self.is_parsed = bool(self.is_parsed or HEAD in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) From 450c6fe39c6e3b32bc00cd20b844e37dd0adee5a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:49:06 +0200 Subject: [PATCH 27/49] Update train.py --- spacy/cli/train.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 64eb89d13..3420c96fa 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -210,7 +210,8 @@ def train( nlp.resume_training() else: msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") - nlp.begin_training(lambda: corpus.train_dataset(nlp)) + train_examples = list(corpus.train_dataset(nlp, shuffle=False)) + nlp.begin_training(lambda: train_examples) # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) @@ -280,11 +281,14 @@ def train( eg.reference = None eg.predicted = None except Exception as e: - msg.warn( - f"Aborting and saving the final best model. " - f"Encountered exception: {str(e)}", - exits=1, - ) + if output_path is not None: + msg.warn( + f"Aborting and saving the final best model. 
" + f"Encountered exception: {str(e)}", + exits=1, + ) + else: + raise e finally: if output_path is not None: final_model_path = output_path / "model-final" @@ -300,7 +304,6 @@ def create_train_batches(nlp, corpus, cfg): epochs_todo = cfg.get("max_epochs", 0) while True: train_examples = list(corpus.train_dataset(nlp)) - if len(train_examples) == 0: raise ValueError(Errors.E988) random.shuffle(train_examples) From 396dd60b3a9ef62f27bf406aff82b167ed8c63a3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:49:15 +0200 Subject: [PATCH 28/49] Fix Corpus --- spacy/gold/corpus_docbin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus_docbin.py index 8ee1e9a6c..750217c8c 100644 --- a/spacy/gold/corpus_docbin.py +++ b/spacy/gold/corpus_docbin.py @@ -68,10 +68,12 @@ class Corpus: i += 1 return n - def train_dataset(self, nlp, **kwargs): + def train_dataset(self, nlp, shuffle=True, **kwargs): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) - examples = list(self.make_examples(nlp, ref_docs, **kwargs)) - random.shuffle(examples) + examples = self.make_examples(nlp, ref_docs, **kwargs) + if shuffle: + examples = list(examples) + random.shuffle(examples) yield from examples def dev_dataset(self, nlp): From 2bcb5881d70d550afb09fe7dcc9cda8e260ca53a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:49:31 +0200 Subject: [PATCH 29/49] Fix parser model --- spacy/syntax/_parser_model.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index eef5723f3..d3093d60d 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -261,7 +261,7 @@ class ParserStepModel(Model): def get_token_ids(self, states): cdef StateClass state - states = [state for state in states() if not state.is_final()] + states = [state for state in states if not state.is_final()] cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF), dtype='i', order='C') ids.fill(-1) From 0c10831b14edddd9c6491c0edfd7ab81bcdc7e98 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:49:46 +0200 Subject: [PATCH 30/49] Start debugging arc_eager oracle --- spacy/syntax/arc_eager.pyx | 48 +++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 0dfcbf885..b0fedd6c4 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -76,18 +76,27 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp gs.n_kids_in_stack = mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0])) cand_to_gold = example.alignment.cand_to_gold + gold_to_cand = example.alignment.cand_to_gold cdef TokenC ref_tok for cand_i in range(example.x.length): gold_i = cand_to_gold[cand_i] - if cand_i is not None: # Alignment found + if gold_i is not None: # Alignment found ref_tok = example.y.c[gold_i] - gs.heads[cand_i] = ref_tok.head - gs.labels[cand_i] = ref_tok.dep - gs.state_bits[cand_i] = set_state_flag( - gs.state_bits[cand_i], - HEAD_UNKNOWN, - 0 - ) + gold_head = gold_to_cand[ref_tok.head + gold_i] + if gold_head is not None: + gs.heads[cand_i] = gold_head + gs.labels[cand_i] = ref_tok.dep + gs.state_bits[cand_i] = set_state_flag( + gs.state_bits[cand_i], + HEAD_UNKNOWN, + 0 + ) + else: + gs.state_bits[cand_i] = set_state_flag( + gs.state_bits[cand_i], + HEAD_UNKNOWN, + 1 
+ ) else: gs.state_bits[cand_i] = set_state_flag( gs.state_bits[cand_i], @@ -135,6 +144,8 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp cdef class ArcEagerGold: cdef GoldParseStateC c + cdef Pool mem + def __init__(self, ArcEager moves, StateClass stcls, Example example): self.mem = Pool() self.c = create_gold_state(self.mem, stcls, example) @@ -610,9 +621,8 @@ cdef class ArcEager(TransitionSystem): output[i] = is_valid[self.c[i].move] cdef int set_costs(self, int* is_valid, weight_t* costs, - StateClass stcls, Example example) except -1: - cdef Pool mem = Pool() - gold_state = create_gold_state(mem, stcls, example) + StateClass stcls, gold) except -1: + gold_state = (gold).c cdef int i, move cdef attr_t label cdef label_cost_func_t[N_MOVES] label_cost_funcs @@ -643,16 +653,16 @@ cdef class ArcEager(TransitionSystem): label = self.c[i].label if move_costs[move] == 9000: move_costs[move] = move_cost_funcs[move](stcls, &gold_state) - costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold_state, label) + move_cost = move_costs[move] + label_cost = label_cost_funcs[move](stcls, &gold_state, label) + costs[i] = move_cost + label_cost n_gold += costs[i] <= 0 + print(move, label, costs[i]) else: is_valid[i] = False costs[i] = 9000 if n_gold < 1: - # Check projectivity --- leading cause - if is_nonproj_tree(example.get_field("HEAD")): - raise ValueError(Errors.E020) - else: - failure_state = stcls.print_state([t.text for t in example]) - raise ValueError(Errors.E021.format(n_actions=self.n_moves, - state=failure_state)) + raise ValueError + #failure_state = stcls.print_state([t.text for t in example]) + #raise ValueError( + # Errors.E021.format(n_actions=self.n_moves, state=failure_state)) From 52edb24f075de6c413e752a7cea712817c2b730a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:50:06 +0200 Subject: [PATCH 31/49] Update header --- spacy/syntax/transition_system.pxd | 2 +- spacy/syntax/transition_system.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 21752b15f..836c08168 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -52,4 +52,4 @@ cdef class TransitionSystem: cdef int set_valid(self, int* output, const StateC* st) nogil cdef int set_costs(self, int* is_valid, weight_t* costs, - StateClass state, Example example) except -1 + StateClass state, gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 687c234d0..319550161 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -115,7 +115,7 @@ cdef class TransitionSystem: is_valid[i] = self.c[i].is_valid(st, self.c[i].label) cdef int set_costs(self, int* is_valid, weight_t* costs, - StateClass stcls, Example example) except -1: + StateClass stcls, gold) except -1: raise NotImplementedError def get_class_name(self, int clas): From 6af99f2f2d156181fe7b210ffe74fc9679e436be Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:50:17 +0200 Subject: [PATCH 32/49] Fix parser declaration --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 22e0e7995..f36b10bcc 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -273,6 +273,7 @@ cdef class Parser: [eg.predicted for eg in examples]) states, golds, max_steps 
= self.moves.init_gold_batch(examples) all_states = list(states) + states_golds = zip(states, golds) for _ in range(max_steps): if not states_golds: break @@ -353,7 +354,6 @@ cdef class Parser: def get_batch_loss(self, states, golds, float[:, ::1] scores, losses): cdef StateClass state - cdef Example example cdef Pool mem = Pool() cdef int i From 0b23fd3891e14ff8d6d0f071ca3dc1d0a50a47e4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:52:57 +0200 Subject: [PATCH 33/49] Xfail some tests --- spacy/tests/test_cli.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 132f7ac9f..4b244a3ce 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,10 +1,13 @@ import pytest from spacy.lang.en import English -from spacy.cli.converters import conllu2json, iob2json, conll_ner2json +from spacy.gold.converters import iob2docs, conll_ner2docs from spacy.cli.pretrain import make_docs +# TODO +# from spacy.gold.converters import conllu2docs +@pytest.mark.xfail def test_cli_converters_conllu2json(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ @@ -29,6 +32,7 @@ def test_cli_converters_conllu2json(): assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] +@pytest.mark.xfail @pytest.mark.parametrize( "lines", [ @@ -66,6 +70,7 @@ def test_cli_converters_conllu2json_name_ner_map(lines): assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] +@pytest.mark.xfail def test_cli_converters_conllu2json_subtokens(): # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu lines = [ @@ -109,6 +114,7 @@ def test_cli_converters_conllu2json_subtokens(): assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] +@pytest.mark.xfail def test_cli_converters_iob2json(): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", @@ -132,6 +138,7 @@ def test_cli_converters_iob2json(): # fmt: on +@pytest.mark.xfail def test_cli_converters_conll_ner2json(): lines = [ "-DOCSTART- -X- O O", From 095710e40e96c06996bc2798b9d0a1cfba09f979 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:02:32 +0200 Subject: [PATCH 34/49] Skip tests that cause crashes --- spacy/tests/parser/test_add_label.py | 7 ++++++- spacy/tests/parser/test_parse.py | 7 +++++-- spacy/tests/parser/test_preset_sbd.py | 4 ++++ spacy/tests/regression/test_issue4665.py | 5 ++++- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 7d8063242..093d4e266 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -44,6 +44,8 @@ def _train_parser(parser): return parser +# Segfaulting due to refactor. Need to fix. +@pytest.mark.skip def test_add_label(parser): parser = _train_parser(parser) parser.add_label("right") @@ -62,6 +64,8 @@ def test_add_label(parser): assert doc[2].dep_ == "left" +# segfaulting due to refactor. need to fix. 
+@pytest.mark.skip def test_add_label_deserializes_correctly(): config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} ner1 = EntityRecognizer(Vocab(), default_ner(), **config) @@ -78,7 +82,8 @@ def test_add_label_deserializes_correctly(): for i in range(ner1.moves.n_moves): assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i) - +# segfaulting due to refactor. need to fix. +@pytest.mark.skip @pytest.mark.parametrize( "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())], diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 6e13d3044..ab9228533 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -46,7 +46,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): assert doc[0].dep != 0 -@pytest.mark.xfail +@pytest.mark.skip # Segfault def test_parser_initial(en_tokenizer, en_parser): text = "I ate the pizza with anchovies." # heads = [1, 0, 1, -2, -3, -1, -5] @@ -59,6 +59,7 @@ def test_parser_initial(en_tokenizer, en_parser): assert tokens[3].head.i == 3 +@pytest.mark.skip # Segfault def test_parser_parse_subtrees(en_tokenizer, en_parser): text = "The four wheels on the bus turned quickly" heads = [2, 1, 4, -1, 1, -2, 0, -1] @@ -73,6 +74,7 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser): assert len(list(doc[2].subtree)) == 6 +@pytest.mark.skip # Segfault def test_parser_merge_pp(en_tokenizer): text = "A phrase with another phrase occurs" heads = [1, 4, -1, 1, -2, 0] @@ -91,7 +93,7 @@ def test_parser_merge_pp(en_tokenizer): assert doc[3].text == "occurs" -@pytest.mark.xfail +@pytest.mark.skip # Segfault def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): text = "a b c d e" @@ -166,6 +168,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): assert tokens[4].head.i == 4 +@pytest.mark.skip # Segfault def test_parser_set_sent_starts(en_vocab): # fmt: off words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n'] diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 5a29d84f4..9a2e1cfe8 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -33,12 +33,14 @@ def parser(vocab): return parser +@pytest.mark.skip # Segfaults def test_no_sentences(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert len(list(doc.sents)) >= 1 +@pytest.mark.skip # Segfaults def test_sents_1(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[2].sent_start = True @@ -52,6 +54,7 @@ def test_sents_1(parser): assert len(list(doc.sents)) == 2 +@pytest.mark.skip # Segfaults def test_sents_1_2(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True @@ -60,6 +63,7 @@ def test_sents_1_2(parser): assert len(list(doc.sents)) >= 3 +@pytest.mark.skip # Segfaults def test_sents_1_3(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True diff --git a/spacy/tests/regression/test_issue4665.py b/spacy/tests/regression/test_issue4665.py index 721ec0098..cb9279250 100644 --- a/spacy/tests/regression/test_issue4665.py +++ 
b/spacy/tests/regression/test_issue4665.py @@ -1,4 +1,6 @@ -from spacy.cli.converters.conllu2json import conllu2json +import pytest +# TODO +#from spacy.gold.converters.conllu2docs import conllu2docs input_data = """ 1 [ _ PUNCT -LRB- _ _ punct _ _ @@ -22,6 +24,7 @@ input_data = """ """ +@pytest.mark.xfail def test_issue4665(): """ conllu2json should not raise an exception if the HEAD column contains an From fd83551eb592b39fe97c0abee68d7e4b51dd53d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:11:27 +0200 Subject: [PATCH 35/49] Skip test causing segfault --- spacy/tests/parser/test_parse.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index ab9228533..80d91e7ae 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -22,6 +22,7 @@ TRAIN_DATA = [ ] +@pytest.mark.skip # Segfault def test_parser_root(en_tokenizer): text = "i don't have other assistance" heads = [3, 2, 1, 0, 1, -2] @@ -32,8 +33,9 @@ def test_parser_root(en_tokenizer): assert t.dep != 0, t.text -@pytest.mark.xfail -@pytest.mark.parametrize("text", ["Hello"]) +#@pytest.mark.xfail +#@pytest.mark.parametrize("text", ["Hello"]) +@pytest.mark.skip # Segfault def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): tokens = en_tokenizer(text) doc = get_doc( @@ -185,7 +187,7 @@ def test_parser_set_sent_starts(en_vocab): for token in sent: assert token.head in sent - +@pytest.mark.skip def test_overfitting_IO(): # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly nlp = English() From cfd024536db3a81592aac2343071c5272b62907d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:13:37 +0200 Subject: [PATCH 36/49] Remove GoldCorpus --- spacy/gold/corpus.py | 222 ------------------------------------------- 1 file changed, 222 deletions(-) delete mode 100644 spacy/gold/corpus.py diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py deleted file mode 100644 index c84f8355f..000000000 --- a/spacy/gold/corpus.py +++ /dev/null @@ -1,222 +0,0 @@ -import random -import shutil -import tempfile -import srsly -from pathlib import Path -import itertools -from ..tokens import Doc -from .. import util -from ..errors import Errors, AlignmentError -from .gold_io import read_json_file, json_to_annotations -from .augment import make_orth_variants -from .example import Example - - -class GoldCorpus(object): - """An annotated corpus, using the JSON file format. Manages - annotations for tagging, dependency parsing and NER. - - DOCS: https://spacy.io/api/goldcorpus - """ - - def __init__(self, train, dev, gold_preproc=False, limit=None): - """Create a GoldCorpus. - - train (str / Path): File or directory of training data. - dev (str / Path): File or directory of development data. - RETURNS (GoldCorpus): The newly created object. 
- """ - self.limit = limit - if isinstance(train, str) or isinstance(train, Path): - train = self.read_annotations(self.walk_corpus(train)) - dev = self.read_annotations(self.walk_corpus(dev)) - # Write temp directory with one doc per file, so we can shuffle and stream - self.tmp_dir = Path(tempfile.mkdtemp()) - self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) - self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) - - def __del__(self): - shutil.rmtree(self.tmp_dir) - - @staticmethod - def write_msgpack(directory, examples, limit=0): - if not directory.exists(): - directory.mkdir() - n = 0 - for i, ex_dict in enumerate(examples): - text = ex_dict["text"] - srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) - n += 1 - if limit and n >= limit: - break - - @staticmethod - def walk_corpus(path): - path = util.ensure_path(path) - if not path.is_dir(): - return [path] - paths = [path] - locs = [] - seen = set() - for path in paths: - if str(path) in seen: - continue - seen.add(str(path)) - if path.parts[-1].startswith("."): - continue - elif path.is_dir(): - paths.extend(path.iterdir()) - elif path.parts[-1].endswith((".json", ".jsonl")): - locs.append(path) - return locs - - @staticmethod - def read_annotations(locs, limit=0): - """ Yield training examples as example dicts """ - i = 0 - for loc in locs: - loc = util.ensure_path(loc) - file_name = loc.parts[-1] - if file_name.endswith("json"): - examples = read_json_file(loc) - elif file_name.endswith("jsonl"): - gold_tuples = srsly.read_jsonl(loc) - first_gold_tuple = next(gold_tuples) - gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) - # TODO: proper format checks with schemas - if isinstance(first_gold_tuple, dict): - if first_gold_tuple.get("paragraphs", None): - examples = [] - for json_doc in gold_tuples: - examples.extend(json_to_annotations(json_doc)) - elif first_gold_tuple.get("doc_annotation", None): - examples = [] - for ex_dict in gold_tuples: - doc = ex_dict.get("doc", None) - if doc is None: - doc = ex_dict.get("text", None) - if not ( - doc is None - or isinstance(doc, Doc) - or isinstance(doc, str) - ): - raise ValueError(Errors.E987.format(type=type(doc))) - examples.append(ex_dict) - - elif file_name.endswith("msg"): - text, ex_dict = srsly.read_msgpack(loc) - examples = [ex_dict] - else: - supported = ("json", "jsonl", "msg") - raise ValueError(Errors.E124.format(path=loc, formats=supported)) - try: - for example in examples: - yield example - i += 1 - if limit and i >= limit: - return - except KeyError as e: - msg = "Missing key {}".format(e) - raise KeyError(Errors.E996.format(file=file_name, msg=msg)) - except UnboundLocalError as e: - msg = "Unexpected document structure" - raise ValueError(Errors.E996.format(file=file_name, msg=msg)) - - @property - def dev_annotations(self): - locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_annotations(locs, limit=self.limit) - - @property - def train_annotations(self): - locs = (self.tmp_dir / "train").iterdir() - yield from self.read_annotations(locs, limit=self.limit) - - def count_train(self): - """Returns count of words in train examples""" - n = 0 - i = 0 - for eg_dict in self.train_annotations: - n += len(eg_dict["token_annotation"]["words"]) - if self.limit and i >= self.limit: - break - i += 1 - return n - - def train_dataset( - self, - nlp, - gold_preproc=False, - max_length=None, - orth_variant_level=0.0, - ignore_misaligned=False, - ): - locs = list((self.tmp_dir / "train").iterdir()) - 
random.shuffle(locs) - train_annotations = self.read_annotations(locs, limit=self.limit) - examples = self.iter_examples( - nlp, - train_annotations, - gold_preproc, - max_length=max_length, - orth_variant_level=orth_variant_level, - make_projective=True, - ignore_misaligned=ignore_misaligned, - ) - yield from examples - - def train_dataset_without_preprocessing( - self, nlp, gold_preproc=False, ignore_misaligned=False - ): - examples = self.iter_examples( - nlp, - self.train_annotations, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned, - ) - yield from examples - - def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_examples( - nlp, - self.dev_annotations, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned, - ) - yield from examples - - @classmethod - def iter_examples( - cls, - nlp, - annotations, - gold_preproc, - max_length=None, - orth_variant_level=0.0, - make_projective=False, - ignore_misaligned=False, - ): - """ Setting gold_preproc will result in creating a doc per sentence """ - for eg_dict in annotations: - token_annot = eg_dict.get("token_annotation", {}) - if eg_dict["text"]: - doc = nlp.make_doc(eg_dict["text"]) - elif "words" in token_annot: - doc = Doc(nlp.vocab, words=token_annot["words"]) - else: - raise ValueError("Expecting either 'text' or token_annotation.words annotation") - - if gold_preproc: - variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level) - doc = nlp.make_doc(variant_text) - eg_dict["token_annotation"] = variant_token_annot - example = Example.from_dict(doc, eg_dict) - examples = example.split_sents() - - else: - example = Example.from_dict(doc, eg_dict) - examples = [example] - - for eg in examples: - if (not max_length) or len(eg.predicted) < max_length: - yield eg From 64d00520e2ee45a8b11446c7df9edd9046dd544a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:21:08 +0200 Subject: [PATCH 37/49] Update imports --- spacy/cli/debug_data.py | 4 ++-- spacy/cli/evaluate.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index c86408170..e0a6cba2e 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -4,7 +4,7 @@ import sys import srsly from wasabi import Printer, MESSAGES -from ..gold import GoldCorpus +from ..gold import Corpus from ..syntax import nonproj from ..util import load_model, get_lang_class @@ -68,7 +68,7 @@ def debug_data( loading_train_error_message = "" loading_dev_error_message = "" with msg.loading("Loading corpus..."): - corpus = GoldCorpus(train_path, dev_path) + corpus = Corpus(train_path, dev_path) try: train_dataset = list(corpus.train_dataset(nlp)) train_dataset_unpreprocessed = list( diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index bae252b1c..09ce7c1b5 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,7 +1,7 @@ from timeit import default_timer as timer from wasabi import msg -from ..gold import GoldCorpus +from ..gold import Corpus from .. import util from .. 
import displacy @@ -31,7 +31,7 @@ def evaluate( msg.fail("Evaluation data not found", data_path, exits=1) if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) - corpus = GoldCorpus(data_path, data_path) + corpus = Corpus(data_path, data_path) if model.startswith("blank:"): nlp = util.get_lang_class(model.replace("blank:", ""))() else: From 4bbc2777584808da383e4b79b98e174fed6563a0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:21:24 +0200 Subject: [PATCH 38/49] Update after removing GoldCorpus --- spacy/about.py | 2 +- spacy/gold/__init__.py | 2 +- spacy/tests/regression/test_issue4402.py | 4 ++-- spacy/tests/test_gold.py | 12 ++++++------ 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 04a660ad1..14ea60c8c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev9" +__version__ = "3.0.0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py index 22530a757..c2d237f84 100644 --- a/spacy/gold/__init__.py +++ b/spacy/gold/__init__.py @@ -1,4 +1,4 @@ -from .corpus import GoldCorpus +from .corpus_docbin import Corpus from .example import Example from .align import align diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index 80d37b1e6..71ed7ec14 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -1,5 +1,5 @@ import srsly -from spacy.gold import GoldCorpus +from spacy.gold import Corpus from spacy.lang.en import English from ..util import make_tempdir @@ -11,7 +11,7 @@ def test_issue4402(): json_path = tmpdir / "test4402.json" srsly.write_json(json_path, json_data) - corpus = GoldCorpus(str(json_path), str(json_path)) + corpus = Corpus(str(json_path), str(json_path)) train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0)) # assert that the data got split into 4 sentences diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 726492138..7af62accb 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,7 +1,7 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align -from spacy.gold import GoldCorpus, docs_to_json +from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree @@ -299,7 +299,7 @@ def test_roundtrip_docs_to_json(doc): with make_tempdir() as tmpdir: json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) + goldcorpus = Corpus(train=str(json_file), dev=str(json_file)) reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) assert len(doc) == goldcorpus.count_train() @@ -328,7 +328,7 @@ def test_projective_train_vs_nonprojective_dev(doc): json_file = tmpdir / "test.json" # write to JSON train dicts srsly.write_json(json_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(json_file), str(json_file)) + goldcorpus = Corpus(str(json_file), str(json_file)) train_reloaded_example = 
next(goldcorpus.train_dataset(nlp))
     train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
 
@@ -360,7 +360,7 @@ def test_ignore_misaligned(doc):
     data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
     # write to JSON train dicts
     srsly.write_json(json_file, data)
-    goldcorpus = GoldCorpus(str(json_file), str(json_file))
+    goldcorpus = Corpus(str(json_file), str(json_file))
 
     with pytest.raises(AlignmentError):
         train_reloaded_example = next(goldcorpus.train_dataset(nlp))
@@ -371,7 +371,7 @@ def test_ignore_misaligned(doc):
     data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
     # write to JSON train dicts
     srsly.write_json(json_file, data)
-    goldcorpus = GoldCorpus(str(json_file), str(json_file))
+    goldcorpus = Corpus(str(json_file), str(json_file))
 
     # doesn't raise an AlignmentError, but there is nothing to iterate over
     # because the only example can't be aligned
@@ -385,7 +385,7 @@ def test_make_orth_variants(doc):
         json_file = tmpdir / "test.json"
         # write to JSON train dicts
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
         # due to randomness, test only that this runs with no errors for now
         train_example = next(goldcorpus.train_dataset(nlp))

From 2791c1c0dc69eeb756d8c69b3c0ddafc288dc00c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 22:22:14 +0200
Subject: [PATCH 39/49] Fix module name of corpus

---
 spacy/gold/__init__.py                     | 2 +-
 spacy/gold/{corpus_docbin.py => corpus.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename spacy/gold/{corpus_docbin.py => corpus.py} (100%)

diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py
index c2d237f84..9416bdd81 100644
--- a/spacy/gold/__init__.py
+++ b/spacy/gold/__init__.py
@@ -1,4 +1,4 @@
-from .corpus_docbin import Corpus
+from .corpus import Corpus
 from .example import Example
 from .align import align

diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus.py
similarity index 100%
rename from spacy/gold/corpus_docbin.py
rename to spacy/gold/corpus.py

From 914924a68b3dbd2698a2dc7176e7d6f5d8562422 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 22:22:40 +0200
Subject: [PATCH 40/49] Fix import

---
 spacy/cli/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3420c96fa..6a1d74934 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -12,7 +12,7 @@ import thinc.schedules
 from thinc.api import Model, use_pytorch_for_gpu_memory
 import random
 
-from ..gold.corpus_docbin import Corpus
+from ..gold import Corpus
 from ..lookups import Lookups
 from ..
import util from ..errors import Errors From c58deb354632bfe417a1821c171ce1d6eeae77a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:01:09 +0200 Subject: [PATCH 41/49] Work on parser oracle --- spacy/syntax/arc_eager.pyx | 129 +++++++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 47 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index b0fedd6c4..b8baab49a 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -82,7 +82,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp gold_i = cand_to_gold[cand_i] if gold_i is not None: # Alignment found ref_tok = example.y.c[gold_i] - gold_head = gold_to_cand[ref_tok.head + gold_i] + gold_head = gold_to_cand[gold_i + ref_tok.head] if gold_head is not None: gs.heads[cand_i] = gold_head gs.labels[cand_i] = ref_tok.dep @@ -106,17 +106,17 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp stack_words = set() for i in range(stcls.stack_depth()): s_i = stcls.S(i) - head = s_i + gs.heads[s_i] + head = gs.heads[s_i] gs.n_kids_in_stack[head] += 1 stack_words.add(s_i) buffer_words = set() for i in range(stcls.buffer_length()): b_i = stcls.B(i) - head = b_i + gs.heads[b_i] + head = gs.heads[b_i] gs.n_kids_in_buffer[head] += 1 buffer_words.add(b_i) for i in range(gs.length): - head = i + gs.heads[i] + head = gs.heads[i] if head in stack_words: gs.state_bits[i] = set_state_flag( gs.state_bits[i], @@ -142,6 +142,58 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp return gs +cdef void update_gold_state(GoldParseStateC* gs, StateClass stcls) except *: + for i in range(gs.length): + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_BUFFER, + 0 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_STACK, + 0 + ) + gs.n_kids_in_stack[i] = 0 + gs.n_kids_in_buffer[i] = 0 + stack_words = set() + for i in range(stcls.stack_depth()): + s_i = stcls.S(i) + head = gs.heads[s_i] + gs.n_kids_in_stack[head] += 1 + stack_words.add(s_i) + buffer_words = set() + for i in range(stcls.buffer_length()): + b_i = stcls.B(i) + head = gs.heads[b_i] + gs.n_kids_in_buffer[head] += 1 + buffer_words.add(b_i) + for i in range(gs.length): + head = gs.heads[i] + if head in stack_words: + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_STACK, + 1 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_BUFFER, + 0 + ) + elif head in buffer_words: + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_STACK, + 0 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_BUFFER, + 1 + ) + + cdef class ArcEagerGold: cdef GoldParseStateC c cdef Pool mem @@ -150,6 +202,9 @@ cdef class ArcEagerGold: self.mem = Pool() self.c = create_gold_state(self.mem, stcls, example) + def update(self, StateClass stcls): + update_gold_state(&self.c, stcls) + cdef int check_state_gold(char state_bits, char flag) nogil: @@ -319,22 +374,27 @@ cdef class LeftArc: @staticmethod cdef inline weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: gold = _gold - return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) + return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) @staticmethod - cdef inline weight_t move_cost(StateClass s, const void* _gold) nogil: - gold = _gold - if arc_is_gold(gold, s.S(0), s.B(0)): - return 0 - elif s.c.shifted[s.B(0)]: - return push_cost(s, gold, s.B(0)) + cdef inline 
weight_t move_cost(StateClass s, const GoldParseStateC* gold) nogil: + cdef weight_t cost = 0 + s0 = s.S(0) + b0 = s.B(0) + if arc_is_gold(gold, b0, s0): + # Have a negative cost if we 'recover' from the wrong dependency + return 0 if not s.has_head(s0) else -1 else: - return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) + # Account for deps we might lose between S0 and stack + if not s.has_head(s0): + cost += gold.n_kids_in_stack[s0] + if is_head_in_buffer(gold, s0): + cost += 1 + return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) @staticmethod - cdef weight_t label_cost(StateClass s, const void* _gold, attr_t label) nogil: - gold = _gold - return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) + cdef inline weight_t label_cost(StateClass s, const GoldParseStateC* gold, attr_t label) nogil: + return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) cdef class RightArc: @@ -622,42 +682,17 @@ cdef class ArcEager(TransitionSystem): cdef int set_costs(self, int* is_valid, weight_t* costs, StateClass stcls, gold) except -1: - gold_state = (gold).c - cdef int i, move - cdef attr_t label - cdef label_cost_func_t[N_MOVES] label_cost_funcs - cdef move_cost_func_t[N_MOVES] move_cost_funcs - cdef weight_t[N_MOVES] move_costs - for i in range(N_MOVES): - move_costs[i] = 9000 - move_cost_funcs[SHIFT] = Shift.move_cost - move_cost_funcs[REDUCE] = Reduce.move_cost - move_cost_funcs[LEFT] = LeftArc.move_cost - move_cost_funcs[RIGHT] = RightArc.move_cost - move_cost_funcs[BREAK] = Break.move_cost - - label_cost_funcs[SHIFT] = Shift.label_cost - label_cost_funcs[REDUCE] = Reduce.label_cost - label_cost_funcs[LEFT] = LeftArc.label_cost - label_cost_funcs[RIGHT] = RightArc.label_cost - label_cost_funcs[BREAK] = Break.label_cost - - cdef attr_t* labels = gold_state.labels - cdef int32_t* heads = gold_state.heads - + if not isinstance(gold, ArcEagerGold): + raise TypeError("Expected ArcEagerGold") + cdef ArcEagerGold gold_ = gold + gold_.update(stcls) + gold_state = gold_.c n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls.c, self.c[i].label): is_valid[i] = True - move = self.c[i].move - label = self.c[i].label - if move_costs[move] == 9000: - move_costs[move] = move_cost_funcs[move](stcls, &gold_state) - move_cost = move_costs[move] - label_cost = label_cost_funcs[move](stcls, &gold_state, label) - costs[i] = move_cost + label_cost - n_gold += costs[i] <= 0 - print(move, label, costs[i]) + costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) + n_gold += 1 else: is_valid[i] = False costs[i] = 9000 From e90341810c3dfac5d912f695fc9e235a6e119120 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:04:02 +0200 Subject: [PATCH 42/49] Update arc_eager oracle --- spacy/syntax/arc_eager.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index b8baab49a..13879d898 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -238,7 +238,7 @@ cdef weight_t push_cost(StateClass stcls, const void* _gold, int target) nogil: cdef weight_t cost = 0 if is_head_in_stack(gold, target): cost += 1 - cost += gold.n_kids_in_buffer[target] + cost += gold.n_kids_in_stack[target] if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0: cost += 1 return cost From 318a046fb094d42e4490c05d8a723696f878c30b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 
2020 01:11:08 +0200 Subject: [PATCH 43/49] Restore ArcEager.get_cost function --- spacy/syntax/arc_eager.pyx | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 13879d898..c7ecbceea 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -562,9 +562,6 @@ cdef class ArcEager(TransitionSystem): def action_types(self): return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) - def get_cost(self, StateClass state, Example gold, action): - raise NotImplementedError - def transition(self, StateClass state, action): cdef Transition t = self.lookup_transition(action) t.do(state.c, t.label) @@ -679,6 +676,18 @@ cdef class ArcEager(TransitionSystem): output[i] = self.c[i].is_valid(st, self.c[i].label) else: output[i] = is_valid[self.c[i].move] + + def get_cost(self, StateClass stcls, gold, int i): + if not isinstance(gold, ArcEagerGold): + raise TypeError("Expected ArcEagerGold") + cdef ArcEagerGold gold_ = gold + gold_state = gold_.c + n_gold = 0 + if self.c[i].is_valid(stcls.c, self.c[i].label): + cost = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) + else: + cost = 9000 + return cost cdef int set_costs(self, int* is_valid, weight_t* costs, StateClass stcls, gold) except -1: From 7544c21f5bff440e60938a0d33c1d73a30b4918e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:12:05 +0200 Subject: [PATCH 44/49] Update transition system --- spacy/syntax/transition_system.pyx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 319550161..46e438e4c 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True +from __future__ import print_function from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool @@ -67,11 +68,13 @@ cdef class TransitionSystem: costs = mem.alloc(self.n_moves, sizeof(float)) is_valid = mem.alloc(self.n_moves, sizeof(int)) - cdef StateClass state = StateClass(example.predicted, offset=0) - self.initialize_state(state.c) + cdef StateClass state + states, golds, n_steps = self.init_gold_batch([example]) + state = states[0] + gold = golds[0] history = [] while not state.is_final(): - self.set_costs(is_valid, costs, state, example) + self.set_costs(is_valid, costs, state, gold) for i in range(self.n_moves): if is_valid[i] and costs[i] <= 0: action = self.c[i] From 9db66ddd4867c0d5db0967193e7adb249460c31d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:12:28 +0200 Subject: [PATCH 45/49] Update test_arc_eager_oracle --- spacy/tests/parser/test_arc_eager_oracle.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 39f682a34..c2ab94500 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -13,8 +13,9 @@ from spacy.syntax.arc_eager import ArcEager def get_sequence_costs(M, words, heads, deps, transitions): doc = Doc(Vocab(), words=words) example = Example.from_dict(doc, {"heads": heads, "deps": deps}) - state = StateClass(doc) - M.preprocess_gold(example) + states, golds, _ = M.init_gold_batch([example]) + state = states[0] + gold = golds[0] cost_history = [] for gold_action in transitions: state_costs = {} @@ -23,6 +24,7 @@ def get_sequence_costs(M, words, heads, deps, transitions): 
state_costs[name] = M.get_cost(state, gold, i) M.transition(state, gold_action) cost_history.append(state_costs) + gold.update(state) return state, cost_history @@ -59,7 +61,6 @@ def gold(doc, words): raise NotImplementedError -@pytest.mark.xfail def test_oracle_four_words(arc_eager, vocab): words = ["a", "b", "c", "d"] heads = [1, 1, 3, 3] @@ -144,12 +145,11 @@ def test_get_oracle_actions(): parser.moves.add_action(1, "") parser.moves.add_action(1, "") parser.moves.add_action(4, "ROOT") + heads, deps = projectivize(heads, deps) for i, (head, dep) in enumerate(zip(heads, deps)): if head > i: parser.moves.add_action(2, dep) elif head < i: parser.moves.add_action(3, dep) - heads, deps = projectivize(heads, deps) example = Example.from_dict(doc, {"words": words, "tags": tags, "heads": heads, "deps": deps}) - parser.moves.preprocess_gold(example) parser.moves.get_oracle_sequence(example) From 192b94f0a1a605b7f8239d48921cef1b4365efd0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:15:12 +0200 Subject: [PATCH 46/49] Remove beam test --- spacy/tests/parser/test_nn_beam.py | 100 ----------------------------- 1 file changed, 100 deletions(-) delete mode 100644 spacy/tests/parser/test_nn_beam.py diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py deleted file mode 100644 index 30e0264f4..000000000 --- a/spacy/tests/parser/test_nn_beam.py +++ /dev/null @@ -1,100 +0,0 @@ -import pytest -import numpy -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.pipeline.defaults import default_parser -from spacy.pipeline import DependencyParser -from spacy.syntax.arc_eager import ArcEager -from spacy.tokens import Doc -from spacy.syntax.stateclass import StateClass - - -@pytest.fixture -def vocab(): - return Vocab() - - -@pytest.fixture -def moves(vocab): - aeager = ArcEager(vocab.strings, {}) - aeager.add_action(2, "nsubj") - aeager.add_action(3, "dobj") - aeager.add_action(2, "aux") - return aeager - - -@pytest.fixture -def docs(vocab): - return [Doc(vocab, words=["Rats", "bite", "things"])] - - -@pytest.fixture -def states(docs): - return [StateClass(doc) for doc in docs] - - -@pytest.fixture -def tokvecs(docs, vector_size): - output = [] - for doc in docs: - vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size)) - output.append(numpy.asarray(vec)) - return output - - -@pytest.fixture -def batch_size(docs): - return len(docs) - - -@pytest.fixture -def beam_width(): - return 4 - - -@pytest.fixture -def vector_size(): - return 6 - - -@pytest.fixture -def beam(moves, states, golds, beam_width): - return ParserBeam(moves, states, golds, width=beam_width, density=0.0) - - -@pytest.fixture -def scores(moves, batch_size, beam_width): - return [ - numpy.asarray( - numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), dtype="f" - ) - for _ in range(batch_size) - ] - - -# All tests below are skipped after removing Beam stuff during the Example/GoldParse refactor -@pytest.mark.skip -def test_create_beam(beam): - pass - - -@pytest.mark.skip -def test_beam_advance(beam, scores): - beam.advance(scores) - - -@pytest.mark.skip -def test_beam_advance_too_few_scores(beam, scores): - with pytest.raises(IndexError): - beam.advance(scores[:-1]) - - -@pytest.mark.skip -def test_beam_parse(): - nlp = Language() - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} - nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser") - nlp.parser.add_label("nsubj") - 
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) - doc = nlp.make_doc("Australia is a country") - nlp.parser(doc, beam_width=2) From 2b180ea03343dbc328cb1d81a62b2a719dd512b9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:15:41 +0200 Subject: [PATCH 47/49] Update test --- spacy/tests/parser/test_parse.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 80d91e7ae..0d9e257b9 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -22,7 +22,6 @@ TRAIN_DATA = [ ] -@pytest.mark.skip # Segfault def test_parser_root(en_tokenizer): text = "i don't have other assistance" heads = [3, 2, 1, 0, 1, -2] @@ -33,9 +32,8 @@ def test_parser_root(en_tokenizer): assert t.dep != 0, t.text -#@pytest.mark.xfail +@pytest.mark.xfail #@pytest.mark.parametrize("text", ["Hello"]) -@pytest.mark.skip # Segfault def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): tokens = en_tokenizer(text) doc = get_doc( @@ -48,7 +46,6 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): assert doc[0].dep != 0 -@pytest.mark.skip # Segfault def test_parser_initial(en_tokenizer, en_parser): text = "I ate the pizza with anchovies." # heads = [1, 0, 1, -2, -3, -1, -5] @@ -61,7 +58,6 @@ def test_parser_initial(en_tokenizer, en_parser): assert tokens[3].head.i == 3 -@pytest.mark.skip # Segfault def test_parser_parse_subtrees(en_tokenizer, en_parser): text = "The four wheels on the bus turned quickly" heads = [2, 1, 4, -1, 1, -2, 0, -1] @@ -76,7 +72,6 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser): assert len(list(doc[2].subtree)) == 6 -@pytest.mark.skip # Segfault def test_parser_merge_pp(en_tokenizer): text = "A phrase with another phrase occurs" heads = [1, 4, -1, 1, -2, 0] @@ -95,7 +90,6 @@ def test_parser_merge_pp(en_tokenizer): assert doc[3].text == "occurs" -@pytest.mark.skip # Segfault def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): text = "a b c d e" @@ -170,7 +164,6 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): assert tokens[4].head.i == 4 -@pytest.mark.skip # Segfault def test_parser_set_sent_starts(en_vocab): # fmt: off words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n'] @@ -187,7 +180,6 @@ def test_parser_set_sent_starts(en_vocab): for token in sent: assert token.head in sent -@pytest.mark.skip def test_overfitting_IO(): # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly nlp = English() From 90d9f04e0b268dc9dd288e129d65928432b9ddf8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:16:33 +0200 Subject: [PATCH 48/49] Unskip --- spacy/tests/parser/test_add_label.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 093d4e266..4afa11963 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -44,8 +44,6 @@ def _train_parser(parser): return parser -# Segfaulting due to refactor. Need to fix. 
-@pytest.mark.skip def test_add_label(parser): parser = _train_parser(parser) parser.add_label("right") @@ -64,8 +62,6 @@ def test_add_label(parser): assert doc[2].dep_ == "left" -# segfaulting due to refactor. need to fix. -@pytest.mark.skip def test_add_label_deserializes_correctly(): config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} ner1 = EntityRecognizer(Vocab(), default_ner(), **config) @@ -82,8 +78,6 @@ def test_add_label_deserializes_correctly(): for i in range(ner1.moves.n_moves): assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i) -# segfaulting due to refactor. need to fix. -@pytest.mark.skip @pytest.mark.parametrize( "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())], From 6670c443904e2a29da0cb0096804eb4507d5f2d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:17:52 +0200 Subject: [PATCH 49/49] Unskip tests --- spacy/tests/parser/test_preset_sbd.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 9a2e1cfe8..5a29d84f4 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -33,14 +33,12 @@ def parser(vocab): return parser -@pytest.mark.skip # Segfaults def test_no_sentences(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert len(list(doc.sents)) >= 1 -@pytest.mark.skip # Segfaults def test_sents_1(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[2].sent_start = True @@ -54,7 +52,6 @@ def test_sents_1(parser): assert len(list(doc.sents)) == 2 -@pytest.mark.skip # Segfaults def test_sents_1_2(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True @@ -63,7 +60,6 @@ def test_sents_1_2(parser): assert len(list(doc.sents)) >= 3 -@pytest.mark.skip # Segfaults def test_sents_1_3(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True