diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 6026c4b52..712bc7914 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -118,7 +118,9 @@ def debug_data(
     # Create all gold data here to avoid iterating over the train_dataset constantly
     gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
-    gold_train_unpreprocessed_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=False)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_dataset, pipeline, nlp, make_proj=False
+    )
     gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)

     train_texts = gold_train_data["texts"]
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index d0d876aed..5cfde43e0 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -229,7 +229,9 @@ def add_vectors(
     else:
         if vectors_loc:
             with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors)
+                vectors_data, vector_keys = read_vectors(
+                    msg, vectors_loc, truncate_vectors
+                )
             msg.good(f"Loaded vectors from {vectors_loc}")
         else:
             vectors_data, vector_keys = (None, None)
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index e58d2529d..5b021aabc 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -406,5 +406,5 @@ def verify_cli_args(
     if not config["nlp"]["vectors"]:
         msg.fail(
             "Must specify nlp.vectors if pretraining.objective.type is vectors",
-            exits=True
+            exits=True,
         )
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index c053e624c..92fd8c20a 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -202,11 +202,11 @@ def train(
         nlp.resume_training()
     else:
         msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
-        train_examples = list(corpus.train_dataset(
-            nlp,
-            shuffle=False,
-            gold_preproc=training["gold_preproc"]
-        ))
+        train_examples = list(
+            corpus.train_dataset(
+                nlp, shuffle=False, gold_preproc=training["gold_preproc"]
+            )
+        )
         nlp.begin_training(lambda: train_examples)

     # Update tag map with provided mapping
@@ -293,12 +293,14 @@

 def create_train_batches(nlp, corpus, cfg):
     max_epochs = cfg.get("max_epochs", 0)
-    train_examples = list(corpus.train_dataset(
-        nlp,
-        shuffle=True,
-        gold_preproc=cfg["gold_preproc"],
-        max_length=cfg["max_length"]
-    ))
+    train_examples = list(
+        corpus.train_dataset(
+            nlp,
+            shuffle=True,
+            gold_preproc=cfg["gold_preproc"],
+            max_length=cfg["max_length"],
+        )
+    )

     epoch = 0
     while True:
@@ -520,7 +522,10 @@ def setup_printer(training, nlp):
             )
         )
         data = (
-            [info["epoch"], info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
+            [info["epoch"], info["step"]]
+            + losses
+            + scores
+            + ["{0:.2f}".format(float(info["score"]))]
         )
         msg.row(data, widths=table_widths, aligns=table_aligns)
diff --git a/spacy/gold/converters/iob2docs.py b/spacy/gold/converters/iob2docs.py
index 51321a470..c7e243397 100644
--- a/spacy/gold/converters/iob2docs.py
+++ b/spacy/gold/converters/iob2docs.py
@@ -59,6 +59,6 @@ def read_iob(raw_sents, vocab, n_sents):
             doc[i].is_sent_start = sent_start
         biluo = iob_to_biluo(iob)
         entities = tags_to_entities(biluo)
-        doc.ents = [Span(doc, start=s, end=e+1, label=L) for (L, s, e) in entities]
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities]
         docs.append(doc)
     return docs
diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py
index cd606fecf..08751cfd4 100644
--- a/spacy/gold/iob_utils.py
+++ b/spacy/gold/iob_utils.py
@@ -92,7 +92,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
     # Handle entity cases
     for start_char, end_char, label in entities:
         if not label:
-            for s in starts: # account for many-to-one
+            for s in starts:  # account for many-to-one
                 if s >= start_char and s < end_char:
                     biluo[starts[s]] = "O"
         else:
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index d436b1cf6..c1e530d4a 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -17,11 +17,7 @@ def build_tb_parser_model(
     nO=None,
 ):
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(
-        tok2vec,
-        list2array(),
-        Linear(hidden_width, t2v_width),
-    )
+    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
     tok2vec.set_dim("nO", hidden_width)

     lower = PrecomputableAffine(
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 38e6114de..e2b6adf43 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -179,22 +179,9 @@ def test_doc_api_right_edge(en_tokenizer):
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
     assert doc[6].text == "for"
     subtree = [w.text for w in doc[6].subtree]
-    assert subtree == [
-        "for",
-        "the",
-        "sake",
-        "of",
-        "such",
-        "as",
-        "live",
-        "under",
-        "the",
-        "government",
-        "of",
-        "the",
-        "Romans",
-        ",",
-    ]
+    # fmt: off
+    assert subtree == ["for", "the", "sake", "of", "such", "as", "live", "under", "the", "government", "of", "the", "Romans", ","]
+    # fmt: on
     assert doc[6].right_edge.text == ","
@@ -307,9 +294,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_texts = ["Merging the docs is fun.", "They don't think alike."]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
-    docs_idx = en_texts[0].index('docs')
+    docs_idx = en_texts[0].index("docs")
     de_doc = de_tokenizer(de_text)
-    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None)
+    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (
+        True,
+        None,
+        None,
+        None,
+    )

     assert Doc.from_docs([]) is None
@@ -323,15 +315,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert len(en_docs) == len(list(m_doc.sents))
     assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
     assert str(m_doc) == " ".join(en_texts)
-    p_token = m_doc[len(en_docs[0])-1]
+    p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
     assert len(m_doc) == len(en_docs_tokens)
-    think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
+    think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
     assert m_doc[9].idx == think_idx
     with pytest.raises(AttributeError):
-        not_available = m_doc[2]._.is_ambiguous # not callable, because it was not set via set_extension
-    assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
+        # not callable, because it was not set via set_extension
+        m_doc[2]._.is_ambiguous
+    assert len(m_doc.user_data) == len(en_docs[0].user_data)  # but it's there

     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_docs) == len(list(m_doc.sents))
@@ -341,19 +334,21 @@
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
     assert len(m_doc) == len(en_docs_tokens)
-    think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think')
+    think_idx = len(en_texts[0]) + 0 + en_texts[1].index("think")
     assert m_doc[9].idx == think_idx

-    m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos'])
-    with pytest.raises(ValueError):  # important attributes from sentenziser or parser are missing
+    m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
+    with pytest.raises(ValueError):
+        # important attributes from sentenziser or parser are missing
         assert list(m_doc.sents)
     assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts)  # space delimiter considered, although spacy attribute was missing
+    # space delimiter considered, although spacy attribute was missing
+    assert str(m_doc) == " ".join(en_texts)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
     assert len(m_doc) == len(en_docs_tokens)
-    think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
+    think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
     assert m_doc[9].idx == think_idx
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 6528a4223..81484c083 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -118,6 +118,7 @@ def test_oracle_moves_missing_B(en_vocab):
         moves.add_action(move_types.index("U"), label)
     moves.get_oracle_sequence(example)

+
 # We can't easily represent this on a Doc object. Not sure what the best solution
 # would be, but I don't think it's an important use case?
 @pytest.mark.xfail(reason="No longer supported")
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index f13b7e847..c54088f56 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -91,6 +91,7 @@ def test_parser_merge_pp(en_tokenizer):
     assert doc[2].text == "another phrase"
     assert doc[3].text == "occurs"

+
 # We removed the step_through API a while ago. we should bring it back though
 @pytest.mark.xfail(reason="Unsupported")
 def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py
index fc05444d5..9c596aaf6 100644
--- a/spacy/tests/regression/test_issue4402.py
+++ b/spacy/tests/regression/test_issue4402.py
@@ -8,10 +8,11 @@ from ...tokens import DocBin

 def test_issue4402():
     nlp = English()
+    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "test4402.spacy"
         docs = json2docs([json_data])
-        data = DocBin(docs=docs, attrs =["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
+        data = DocBin(docs=docs, attrs=attrs).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
         corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
@@ -25,74 +26,73 @@ def test_issue4402():
     assert len(split_train_data) == 4


-json_data =\
-    {
-        "id": 0,
-        "paragraphs": [
-            {
-                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
-                "sentences": [
-                    {
-                        "tokens": [
-                            {"id": 0, "orth": "How", "ner": "O"},
-                            {"id": 1, "orth": "should", "ner": "O"},
-                            {"id": 2, "orth": "I", "ner": "O"},
-                            {"id": 3, "orth": "cook", "ner": "O"},
-                            {"id": 4, "orth": "bacon", "ner": "O"},
-                            {"id": 5, "orth": "in", "ner": "O"},
-                            {"id": 6, "orth": "an", "ner": "O"},
-                            {"id": 7, "orth": "oven", "ner": "O"},
-                            {"id": 8, "orth": "?", "ner": "O"},
-                        ],
-                        "brackets": [],
-                    },
-                    {
-                        "tokens": [
-                            {"id": 9, "orth": "\n", "ner": "O"},
-                            {"id": 10, "orth": "I", "ner": "O"},
-                            {"id": 11, "orth": "'ve", "ner": "O"},
-                            {"id": 12, "orth": "heard", "ner": "O"},
-                            {"id": 13, "orth": "of", "ner": "O"},
-                            {"id": 14, "orth": "people", "ner": "O"},
-                            {"id": 15, "orth": "cooking", "ner": "O"},
-                            {"id": 16, "orth": "bacon", "ner": "O"},
-                            {"id": 17, "orth": "in", "ner": "O"},
-                            {"id": 18, "orth": "an", "ner": "O"},
-                            {"id": 19, "orth": "oven", "ner": "O"},
-                            {"id": 20, "orth": ".", "ner": "O"},
-                        ],
-                        "brackets": [],
-                    },
-                ],
-                "cats": [
-                    {"label": "baking", "value": 1.0},
-                    {"label": "not_baking", "value": 0.0},
-                ],
-            },
-            {
-                "raw": "What is the difference between white and brown eggs?\n",
-                "sentences": [
-                    {
-                        "tokens": [
-                            {"id": 0, "orth": "What", "ner": "O"},
-                            {"id": 1, "orth": "is", "ner": "O"},
-                            {"id": 2, "orth": "the", "ner": "O"},
-                            {"id": 3, "orth": "difference", "ner": "O"},
-                            {"id": 4, "orth": "between", "ner": "O"},
-                            {"id": 5, "orth": "white", "ner": "O"},
-                            {"id": 6, "orth": "and", "ner": "O"},
-                            {"id": 7, "orth": "brown", "ner": "O"},
-                            {"id": 8, "orth": "eggs", "ner": "O"},
-                            {"id": 9, "orth": "?", "ner": "O"},
-                        ],
-                        "brackets": [],
-                    },
-                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
-                ],
-                "cats": [
-                    {"label": "baking", "value": 0.0},
-                    {"label": "not_baking", "value": 1.0},
-                ],
-            },
-        ],
-    }
+json_data = {
+    "id": 0,
+    "paragraphs": [
+        {
+            "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
+            "sentences": [
+                {
+                    "tokens": [
+                        {"id": 0, "orth": "How", "ner": "O"},
+                        {"id": 1, "orth": "should", "ner": "O"},
+                        {"id": 2, "orth": "I", "ner": "O"},
+                        {"id": 3, "orth": "cook", "ner": "O"},
+                        {"id": 4, "orth": "bacon", "ner": "O"},
+                        {"id": 5, "orth": "in", "ner": "O"},
+                        {"id": 6, "orth": "an", "ner": "O"},
+                        {"id": 7, "orth": "oven", "ner": "O"},
+                        {"id": 8, "orth": "?", "ner": "O"},
+                    ],
+                    "brackets": [],
+                },
+                {
+                    "tokens": [
+                        {"id": 9, "orth": "\n", "ner": "O"},
+                        {"id": 10, "orth": "I", "ner": "O"},
"ner": "O"}, + {"id": 11, "orth": "'ve", "ner": "O"}, + {"id": 12, "orth": "heard", "ner": "O"}, + {"id": 13, "orth": "of", "ner": "O"}, + {"id": 14, "orth": "people", "ner": "O"}, + {"id": 15, "orth": "cooking", "ner": "O"}, + {"id": 16, "orth": "bacon", "ner": "O"}, + {"id": 17, "orth": "in", "ner": "O"}, + {"id": 18, "orth": "an", "ner": "O"}, + {"id": 19, "orth": "oven", "ner": "O"}, + {"id": 20, "orth": ".", "ner": "O"}, + ], + "brackets": [], + }, + ], + "cats": [ + {"label": "baking", "value": 1.0}, + {"label": "not_baking", "value": 0.0}, + ], + }, + { + "raw": "What is the difference between white and brown eggs?\n", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "What", "ner": "O"}, + {"id": 1, "orth": "is", "ner": "O"}, + {"id": 2, "orth": "the", "ner": "O"}, + {"id": 3, "orth": "difference", "ner": "O"}, + {"id": 4, "orth": "between", "ner": "O"}, + {"id": 5, "orth": "white", "ner": "O"}, + {"id": 6, "orth": "and", "ner": "O"}, + {"id": 7, "orth": "brown", "ner": "O"}, + {"id": 8, "orth": "eggs", "ner": "O"}, + {"id": 9, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, + ], + "cats": [ + {"label": "baking", "value": 0.0}, + {"label": "not_baking", "value": 1.0}, + ], + }, + ], +} diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index e8928f33a..35ca47268 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -28,7 +28,9 @@ def test_cli_converters_conllu2json(): assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"] assert [t["head"] for t in tokens] == [1, 2, -1, 0] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"] - ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + ent_offsets = [ + (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] + ] biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PER", "L-PER", "O"] @@ -54,7 +56,9 @@ def test_cli_converters_conllu2json(): ) def test_cli_converters_conllu2json_name_ner_map(lines): input_data = "\n".join(lines) - converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) + converted_docs = conllu2docs( + input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""} + ) assert len(converted_docs) == 1 converted = [docs_to_json(converted_docs)] assert converted[0]["id"] == 0 @@ -68,7 +72,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines): assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] - ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + ent_offsets = [ + (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] + ] biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"] @@ -115,7 +121,9 @@ def test_cli_converters_conllu2json_subtokens(): assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] assert [t["head"] for t in tokens] == [1, 1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] - ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + ent_offsets = [ + (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] + ] biluo_tags = 
     biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
     assert biluo_tags == ["O", "U-PER", "O", "O"]

@@ -138,11 +146,11 @@ def test_cli_converters_iob2json(en_vocab):
         sent = converted["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
-        # fmt: off
-        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
+        expected = ["I", "like", "London", "and", "New", "York", "City", "."]
+        assert [t["orth"] for t in tokens] == expected
     assert len(converted_docs[0].ents) == 8
     for ent in converted_docs[0].ents:
-        assert(ent.text in ["New York City", "London"])
+        assert ent.text in ["New York City", "London"]


 def test_cli_converters_conll_ner2json():
@@ -210,7 +218,7 @@ def test_cli_converters_conll_ner2json():
     # fmt: on
     assert len(converted_docs[0].ents) == 10
     for ent in converted_docs[0].ents:
-        assert (ent.text in ["New York City", "London"])
+        assert ent.text in ["New York City", "London"]


 def test_pretrain_make_docs():
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 96acb8982..a5e11ea28 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -161,65 +161,54 @@ def test_example_from_dict_no_ner(en_vocab):
     ner_tags = example.get_aligned_ner()
     assert ner_tags == [None, None, None, None]

+
 def test_example_from_dict_some_ner(en_vocab):
     words = ["a", "b", "c", "d"]
     spaces = [True, True, False, True]
     predicted = Doc(en_vocab, words=words, spaces=spaces)
     example = Example.from_dict(
-        predicted,
-        {
-            "words": words,
-            "entities": ["U-LOC", None, None, None]
-        }
+        predicted, {"words": words, "entities": ["U-LOC", None, None, None]}
     )
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["U-LOC", None, None, None]


 def test_json2docs_no_ner(en_vocab):
-    data = [{
-        "id":1,
-        "paragraphs":[
-            {
-                "sentences":[
-                    {
-                        "tokens":[
-                            {
-                                "dep":"nn",
-                                "head":1,
-                                "tag":"NNP",
-                                "orth":"Ms."
-                            },
-                            {
-                                "dep":"nsubj",
-                                "head":1,
-                                "tag":"NNP",
-                                "orth":"Haag"
-                            },
-                            {
-                                "dep":"ROOT",
-                                "head":0,
-                                "tag":"VBZ",
-                                "orth":"plays"
-                            },
-                            {
-                                "dep":"dobj",
-                                "head":-1,
-                                "tag":"NNP",
-                                "orth":"Elianti"
-                            },
-                            {
-                                "dep":"punct",
-                                "head":-2,
-                                "tag":".",
-                                "orth":"."
-                            }
+    data = [
+        {
+            "id": 1,
+            "paragraphs": [
+                {
+                    "sentences": [
+                        {
+                            "tokens": [
+                                {"dep": "nn", "head": 1, "tag": "NNP", "orth": "Ms."},
+                                {
+                                    "dep": "nsubj",
+                                    "head": 1,
+                                    "tag": "NNP",
+                                    "orth": "Haag",
+                                },
+                                {
+                                    "dep": "ROOT",
+                                    "head": 0,
+                                    "tag": "VBZ",
+                                    "orth": "plays",
+                                },
+                                {
+                                    "dep": "dobj",
+                                    "head": -1,
+                                    "tag": "NNP",
+                                    "orth": "Elianti",
+                                },
+                                {"dep": "punct", "head": -2, "tag": ".", "orth": "."},
+                            ]
+                        }
                     ]
-                    }
-                ]
-            }
-        ]
-    }]
+                }
+            ],
+        }
+    ]
     docs = json2docs(data)
     assert len(docs) == 1
     for doc in docs:
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index edc183e0d..f2374bdc6 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -8,8 +8,9 @@ from ..tokens import Doc
 from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors

-
+# fmt: off
 ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
+# fmt: on


 class DocBin(object):
@@ -86,9 +87,7 @@ class DocBin(object):
         assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
-        self.flags.append({
-            "has_unknown_spaces": doc.has_unknown_spaces
-        })
+        self.flags.append({"has_unknown_spaces": doc.has_unknown_spaces})
         for token in doc:
             self.strings.add(token.text)
             self.strings.add(token.tag_)