From f90482d0778f68deea9acda4ad47dd7ef6070141 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Jul 2021 15:44:56 +1000 Subject: [PATCH] Tidy up and auto-format --- spacy/__init__.py | 2 +- spacy/language.py | 2 +- spacy/tests/doc/test_creation.py | 2 +- spacy/tests/parser/test_ner.py | 4 +- spacy/tests/parser/test_parse.py | 4 +- .../pipeline/test_annotates_on_update.py | 2 +- spacy/tests/pipeline/test_lemmatizer.py | 2 +- spacy/tests/pipeline/test_pipe_methods.py | 8 +- spacy/tests/regression/test_issue5001-5500.py | 13 +- spacy/tests/regression/test_issue7001-8000.py | 281 ++++++++++++++++++ spacy/tests/regression/test_issue7019.py | 12 - spacy/tests/regression/test_issue7029.py | 66 ---- spacy/tests/regression/test_issue7055.py | 40 --- spacy/tests/regression/test_issue7056.py | 24 -- spacy/tests/regression/test_issue7062.py | 54 ---- spacy/tests/regression/test_issue7065.py | 97 ------ .../serialize/test_serialize_pipeline.py | 6 - spacy/tests/test_cli.py | 2 +- spacy/tests/test_models.py | 2 +- 19 files changed, 304 insertions(+), 319 deletions(-) create mode 100644 spacy/tests/regression/test_issue7001-8000.py delete mode 100644 spacy/tests/regression/test_issue7019.py delete mode 100644 spacy/tests/regression/test_issue7029.py delete mode 100644 spacy/tests/regression/test_issue7055.py delete mode 100644 spacy/tests/regression/test_issue7056.py delete mode 100644 spacy/tests/regression/test_issue7062.py delete mode 100644 spacy/tests/regression/test_issue7065.py diff --git a/spacy/__init__.py b/spacy/__init__.py index f20c32eb5..ca47edc94 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -5,7 +5,7 @@ import sys # set library-specific custom warning handling before doing anything else from .errors import setup_default_warnings -setup_default_warnings() +setup_default_warnings() # noqa: E402 # These are imported as part of the API from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401 diff --git a/spacy/language.py b/spacy/language.py index b60c92158..7ceb266bf 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1447,7 +1447,7 @@ class Language: ) -> Iterator[Tuple[Doc, _AnyContext]]: ... - def pipe( + def pipe( # noqa: F811 self, texts: Iterable[str], *, diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 6c9de8f07..6989b965f 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -69,4 +69,4 @@ def test_create_with_heads_and_no_deps(vocab): words = "I like ginger".split() heads = list(range(len(words))) with pytest.raises(ValueError): - doc = Doc(vocab, words=words, heads=heads) + Doc(vocab, words=words, heads=heads) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index ee9b6bf01..a30001b27 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -329,8 +329,8 @@ def test_ner_constructor(en_vocab): } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner_1 = EntityRecognizer(en_vocab, model, **config) - ner_2 = EntityRecognizer(en_vocab, model) + EntityRecognizer(en_vocab, model, **config) + EntityRecognizer(en_vocab, model) def test_ner_before_ruler(): diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 1b0d9d256..b7575d063 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -224,8 +224,8 @@ def test_parser_constructor(en_vocab): } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser_1 = DependencyParser(en_vocab, model, **config) - parser_2 = DependencyParser(en_vocab, model) + DependencyParser(en_vocab, model, **config) + DependencyParser(en_vocab, model) @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py index c5288112d..869b8b874 100644 --- a/spacy/tests/pipeline/test_annotates_on_update.py +++ b/spacy/tests/pipeline/test_annotates_on_update.py @@ -74,7 +74,7 @@ def test_annotates_on_update(): nlp.add_pipe("assert_sents") # When the pipeline runs, annotations are set - doc = nlp("This is a sentence.") + nlp("This is a sentence.") examples = [] for text in ["a a", "b b", "c c"]: diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 1bec8696c..0d2d3d6e5 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -110,4 +110,4 @@ def test_lemmatizer_serialize(nlp): assert doc2[0].lemma_ == "cope" # Make sure that lemmatizer cache can be pickled - b = pickle.dumps(lemmatizer2) + pickle.dumps(lemmatizer2) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index e530cb5c4..87fd64307 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -52,7 +52,7 @@ def test_cant_add_pipe_first_and_last(nlp): nlp.add_pipe("new_pipe", first=True, last=True) -@pytest.mark.parametrize("name", ["my_component"]) +@pytest.mark.parametrize("name", ["test_get_pipe"]) def test_get_pipe(nlp, name): with pytest.raises(KeyError): nlp.get_pipe(name) @@ -62,7 +62,7 @@ def test_get_pipe(nlp, name): @pytest.mark.parametrize( "name,replacement,invalid_replacement", - [("my_component", "other_pipe", lambda doc: doc)], + [("test_replace_pipe", "other_pipe", lambda doc: doc)], ) def test_replace_pipe(nlp, name, replacement, invalid_replacement): with pytest.raises(ValueError): @@ -435,8 +435,8 @@ def test_update_with_annotates(): return component - c1 = Language.component(f"{name}1", func=make_component(f"{name}1")) - c2 = Language.component(f"{name}2", func=make_component(f"{name}2")) + Language.component(f"{name}1", func=make_component(f"{name}1")) + Language.component(f"{name}2", func=make_component(f"{name}2")) components = set([f"{name}1", f"{name}2"]) diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py index 9eefef2e5..bc9bcb982 100644 --- a/spacy/tests/regression/test_issue5001-5500.py +++ b/spacy/tests/regression/test_issue5001-5500.py @@ -69,9 +69,12 @@ def test_issue5082(): def test_issue5137(): - @Language.factory("my_component") + factory_name = "test_issue5137" + pipe_name = "my_component" + + @Language.factory(factory_name) class MyComponent: - def __init__(self, nlp, name="my_component", categories="all_categories"): + def __init__(self, nlp, name=pipe_name, categories="all_categories"): self.nlp = nlp self.categories = categories self.name = name @@ -86,13 +89,13 @@ def test_issue5137(): pass nlp = English() - my_component = nlp.add_pipe("my_component") + my_component = nlp.add_pipe(factory_name, name=pipe_name) assert my_component.categories == "all_categories" with make_tempdir() as tmpdir: nlp.to_disk(tmpdir) - overrides = {"components": {"my_component": {"categories": "my_categories"}}} + overrides = {"components": {pipe_name: {"categories": "my_categories"}}} nlp2 = spacy.load(tmpdir, config=overrides) - assert nlp2.get_pipe("my_component").categories == "my_categories" + assert nlp2.get_pipe(pipe_name).categories == "my_categories" def test_issue5141(en_vocab): diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py new file mode 100644 index 000000000..5bb7cc08e --- /dev/null +++ b/spacy/tests/regression/test_issue7001-8000.py @@ -0,0 +1,281 @@ +from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type +from spacy.lang.en import English +from spacy.training import Example +from spacy.tokens.doc import Doc +from spacy.vocab import Vocab +from spacy.kb import KnowledgeBase +from spacy.pipeline._parser_internals.arc_eager import ArcEager +from spacy.util import load_config_from_str, load_config +from spacy.cli.init_config import fill_config +from thinc.api import Config +from wasabi import msg + +from ..util import make_tempdir + + +def test_issue7019(): + scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} + print_textcats_auc_per_cat(msg, scores) + scores = { + "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, + "LABEL_B": {"p": None, "r": None, "f": None}, + } + print_prf_per_type(msg, scores, name="foo", type="bar") + + +CONFIG_7029 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +def test_issue7029(): + """Test that an empty document doesn't mess up an entire batch.""" + TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), + ] + nlp = English.from_config(load_config_from_str(CONFIG_7029)) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] + docs1 = list(nlp.pipe(texts, batch_size=1)) + docs2 = list(nlp.pipe(texts, batch_size=4)) + assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] + + +def test_issue7055(): + """Test that fill-config doesn't turn sourced components into factories.""" + source_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, + "components": { + "tok2vec": {"factory": "tok2vec"}, + "tagger": {"factory": "tagger"}, + }, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + base_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, + "components": { + "tok2vec": {"source": str(source_path)}, + "tagger": {"source": str(source_path)}, + "ner": {"factory": "ner"}, + }, + } + base_cfg = Config(base_cfg) + base_path = dir_path / "base.cfg" + base_cfg.to_disk(base_path) + output_path = dir_path / "config.cfg" + fill_config(output_path, base_path, silent=True) + filled_cfg = load_config(output_path) + assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) + assert filled_cfg["components"]["tagger"]["source"] == str(source_path) + assert filled_cfg["components"]["ner"]["factory"] == "ner" + assert "model" in filled_cfg["components"]["ner"] + + +def test_issue7056(): + """Test that the Unshift transition works properly, and doesn't cause + sentence segmentation errors.""" + vocab = Vocab() + ae = ArcEager( + vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) + ) + doc = Doc(vocab, words="Severe pain , after trauma".split()) + state = ae.init_batch([doc])[0] + ae.apply_transition(state, "S") + ae.apply_transition(state, "L-amod") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "R-pobj") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + assert not state.eol() + + +def test_partial_links(): + # Test that having some entities on the doc without gold links, doesn't crash + TRAIN_DATA = [ + ( + "Russ Cochran his reprints include EC Comics.", + { + "links": {(0, 12): {"Q2146908": 1.0}}, + "entities": [(0, 12, "PERSON")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) + return mykb + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, + {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) + assert "PERSON" in results["ents_per_type"] + assert "PERSON" in results["nel_f_per_type"] + assert "ORG" in results["ents_per_type"] + assert "ORG" not in results["nel_f_per_type"] + + +def test_issue7065(): + text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." + nlp = English() + nlp.add_pipe("sentencizer") + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + { + "label": "THING", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + } + ] + ruler.add_patterns(patterns) + + doc = nlp(text) + sentences = [s for s in doc.sents] + assert len(sentences) == 2 + sent0 = sentences[0] + ent = doc.ents[0] + assert ent.start < sent0.end < ent.end + assert sentences.index(ent.sent) == 0 + + +def test_issue7065_b(): + # Test that the NEL doesn't crash when an entity crosses a sentence boundary + nlp = English() + vector_length = 3 + nlp.add_pipe("sentencizer") + text = "Mahler 's Symphony No. 8 was beautiful." + entities = [(0, 6, "PERSON"), (10, 24, "WORK")] + links = { + (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, + (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, + } + sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] + doc = nlp(text) + example = Example.from_dict( + doc, {"entities": entities, "links": links, "sent_starts": sent_starts} + ) + train_examples = [example] + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="No. 8", + entities=["Q270853"], + probabilities=[1.0], + ) + mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias( + alias="Mahler", + entities=["Q7304"], + probabilities=[1.0], + ) + return mykb + + # Create the Entity Linker component and add it to the pipeline + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + # train the NEL pipe + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # Add a custom rule-based component to mimick NER + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, + { + "label": "WORK", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + }, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + # test the trained model - this should not throw E148 + doc = nlp(text) + assert doc diff --git a/spacy/tests/regression/test_issue7019.py b/spacy/tests/regression/test_issue7019.py deleted file mode 100644 index 53958b594..000000000 --- a/spacy/tests/regression/test_issue7019.py +++ /dev/null @@ -1,12 +0,0 @@ -from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type -from wasabi import msg - - -def test_issue7019(): - scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} - print_textcats_auc_per_cat(msg, scores) - scores = { - "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, - "LABEL_B": {"p": None, "r": None, "f": None}, - } - print_prf_per_type(msg, scores, name="foo", type="bar") diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py deleted file mode 100644 index 8435b32e1..000000000 --- a/spacy/tests/regression/test_issue7029.py +++ /dev/null @@ -1,66 +0,0 @@ -from spacy.lang.en import English -from spacy.training import Example -from spacy.util import load_config_from_str - - -CONFIG = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), -] - - -def test_issue7029(): - """Test that an empty document doesn't mess up an entire batch.""" - nlp = English.from_config(load_config_from_str(CONFIG)) - train_examples = [] - for t in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(50): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] - docs1 = list(nlp.pipe(texts, batch_size=1)) - docs2 = list(nlp.pipe(texts, batch_size=4)) - assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] diff --git a/spacy/tests/regression/test_issue7055.py b/spacy/tests/regression/test_issue7055.py deleted file mode 100644 index c7ddb0a75..000000000 --- a/spacy/tests/regression/test_issue7055.py +++ /dev/null @@ -1,40 +0,0 @@ -from spacy.cli.init_config import fill_config -from spacy.util import load_config -from spacy.lang.en import English -from thinc.api import Config - -from ..util import make_tempdir - - -def test_issue7055(): - """Test that fill-config doesn't turn sourced components into factories.""" - source_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, - "components": { - "tok2vec": {"factory": "tok2vec"}, - "tagger": {"factory": "tagger"}, - }, - } - source_nlp = English.from_config(source_cfg) - with make_tempdir() as dir_path: - # We need to create a loadable source pipeline - source_path = dir_path / "test_model" - source_nlp.to_disk(source_path) - base_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, - "components": { - "tok2vec": {"source": str(source_path)}, - "tagger": {"source": str(source_path)}, - "ner": {"factory": "ner"}, - }, - } - base_cfg = Config(base_cfg) - base_path = dir_path / "base.cfg" - base_cfg.to_disk(base_path) - output_path = dir_path / "config.cfg" - fill_config(output_path, base_path, silent=True) - filled_cfg = load_config(output_path) - assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) - assert filled_cfg["components"]["tagger"]["source"] == str(source_path) - assert filled_cfg["components"]["ner"]["factory"] == "ner" - assert "model" in filled_cfg["components"]["ner"] diff --git a/spacy/tests/regression/test_issue7056.py b/spacy/tests/regression/test_issue7056.py deleted file mode 100644 index e94a975d4..000000000 --- a/spacy/tests/regression/test_issue7056.py +++ /dev/null @@ -1,24 +0,0 @@ -from spacy.tokens.doc import Doc -from spacy.vocab import Vocab -from spacy.pipeline._parser_internals.arc_eager import ArcEager - - -def test_issue7056(): - """Test that the Unshift transition works properly, and doesn't cause - sentence segmentation errors.""" - vocab = Vocab() - ae = ArcEager( - vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) - ) - doc = Doc(vocab, words="Severe pain , after trauma".split()) - state = ae.init_batch([doc])[0] - ae.apply_transition(state, "S") - ae.apply_transition(state, "L-amod") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "R-pobj") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - assert not state.eol() diff --git a/spacy/tests/regression/test_issue7062.py b/spacy/tests/regression/test_issue7062.py deleted file mode 100644 index 66bf09523..000000000 --- a/spacy/tests/regression/test_issue7062.py +++ /dev/null @@ -1,54 +0,0 @@ -from spacy.kb import KnowledgeBase -from spacy.training import Example -from spacy.lang.en import English - - -# fmt: off -TRAIN_DATA = [ - ("Russ Cochran his reprints include EC Comics.", - {"links": {(0, 12): {"Q2146908": 1.0}}, - "entities": [(0, 12, "PERSON")], - "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}) -] -# fmt: on - - -def test_partial_links(): - # Test that having some entities on the doc without gold links, doesn't crash - nlp = English() - vector_length = 3 - train_examples = [] - for text, annotation in TRAIN_DATA: - doc = nlp(text) - train_examples.append(Example.from_dict(doc, annotation)) - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) - return mykb - - # Create and train the Entity Linker - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # adding additional components that are required for the entity_linker - nlp.add_pipe("sentencizer", first=True) - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, - {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - - # this will run the pipeline on the examples and shouldn't crash - results = nlp.evaluate(train_examples) - assert "PERSON" in results["ents_per_type"] - assert "PERSON" in results["nel_f_per_type"] - assert "ORG" in results["ents_per_type"] - assert "ORG" not in results["nel_f_per_type"] diff --git a/spacy/tests/regression/test_issue7065.py b/spacy/tests/regression/test_issue7065.py deleted file mode 100644 index d40763c63..000000000 --- a/spacy/tests/regression/test_issue7065.py +++ /dev/null @@ -1,97 +0,0 @@ -from spacy.kb import KnowledgeBase -from spacy.lang.en import English -from spacy.training import Example - - -def test_issue7065(): - text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." - nlp = English() - nlp.add_pipe("sentencizer") - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - { - "label": "THING", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - } - ] - ruler.add_patterns(patterns) - - doc = nlp(text) - sentences = [s for s in doc.sents] - assert len(sentences) == 2 - sent0 = sentences[0] - ent = doc.ents[0] - assert ent.start < sent0.end < ent.end - assert sentences.index(ent.sent) == 0 - - -def test_issue7065_b(): - # Test that the NEL doesn't crash when an entity crosses a sentence boundary - nlp = English() - vector_length = 3 - nlp.add_pipe("sentencizer") - - text = "Mahler 's Symphony No. 8 was beautiful." - entities = [(0, 6, "PERSON"), (10, 24, "WORK")] - links = { - (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, - (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, - } - sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] - doc = nlp(text) - example = Example.from_dict( - doc, {"entities": entities, "links": links, "sent_starts": sent_starts} - ) - train_examples = [example] - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias( - alias="No. 8", - entities=["Q270853"], - probabilities=[1.0], - ) - mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias( - alias="Mahler", - entities=["Q7304"], - probabilities=[1.0], - ) - return mykb - - # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - - # train the NEL pipe - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # Add a custom rule-based component to mimick NER - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, - { - "label": "WORK", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - }, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - - # test the trained model - this should not throw E148 - doc = nlp(text) - assert doc diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 35cc22d24..c8162a690 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -60,12 +60,6 @@ def taggers(en_vocab): @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): - config = { - "update_with_oracle_cut_size": 100, - "beam_width": 1, - "beam_update_prob": 1.0, - "beam_density": 0.0, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] parser = Parser(en_vocab, model) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 03bef3528..6f0fdcfa5 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -440,7 +440,7 @@ def test_init_config(lang, pipeline, optimize, pretraining): assert isinstance(config, Config) if pretraining: config["paths"]["raw_text"] = "my_data.jsonl" - nlp = load_model_from_config(config, auto_fill=True) + load_model_from_config(config, auto_fill=True) def test_model_recommendations(): diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 33d394933..47540198a 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -211,7 +211,7 @@ def test_empty_docs(model_func, kwargs): def test_init_extract_spans(): - model = extract_spans().initialize() + extract_spans().initialize() def test_extract_spans_span_indices():