Tidy up and auto-format

Ines Montani 2021-02-13 12:55:56 +11:00
parent 06e66d4ced
commit 9ba715ed16
15 changed files with 285 additions and 286 deletions
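Most of the changes below are mechanical: STOP_WORDS definitions are rewrapped so the long word lists sit inside a multi-line set(...) call, per-issue regression tests are consolidated into a single module, and long literal word lists in tests are wrapped in "# fmt: off" / "# fmt: on" guards so the auto-formatter leaves their hand-written layout alone (black respects these markers). A minimal sketch of that guard idiom, with illustrative data only:

# fmt: off
# Lines between the markers keep their manual layout;
# everything outside them is auto-formatted as usual.
words = [
    "I", "like", "Spans", "and", "Docs",
    "in", "my", "input", ".",
]
# fmt: on
print(len(words))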

View File

@@ -4,30 +4,30 @@
STOP_WORDS = set(
"""
ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን
ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ
አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ
አሳስበዋል ላይ በተመሳሳይ አስፈላጊ ሌላ የተለያየ አስገነዘቡ ሌሎች የተለያዩ
አስገንዝበዋል ልዩ ተባለ አብራርተዋል መሆኑ ተገለጸ አስረድተዋል ተገልጿል
ማለቱ ተጨማሪ እባክህ የሚገኝ ተከናወነ እባክሽ ማድረግ ችግር አንጻር ማን
ትናንት እስኪደርስ ነበረች እንኳ ሰሞኑን ነበሩ እንኳን ሲሆን ነበር እዚሁ ሲል
ነው እንደገለጹት አለ እንደተናገሩት ቢሆን ነገር እንዳስረዱት ብለዋል ነገሮች
እንደገና ብዙ ናት ወቅት ቦታ ናቸው እንዲሁም በርካታ አሁን እንጂ እስከ
ማለት የሚሆኑት ስለማናቸውም ውስጥ ይሆናሉ ሲባል ከሆነው ስለዚሁ ከአንድ
ያልሆነ ሳለ የነበረውን ከአንዳንድ በማናቸውም በሙሉ የሆነው ያሉ በእነዚሁ
ወር መሆናቸው ከሌሎች በዋና አንዲት ወይም
በላይ እንደ በማቀድ ለሌሎች በሆኑ ቢሆንም ጊዜና ይሆኑበታል በሆነ አንዱ
ለዚህ ለሆነው ለነዚህ ከዚህ የሌላውን ሶስተኛ አንዳንድ ለማንኛውም የሆነ ከሁለት
የነገሩ ሰኣት አንደኛ እንዲሆን እንደነዚህ ማንኛውም ካልሆነ የሆኑት ጋር ቢያንስ
ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ
አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ
አሳስበዋል ላይ በተመሳሳይ አስፈላጊ ሌላ የተለያየ አስገነዘቡ ሌሎች የተለያዩ
አስገንዝበዋል ልዩ ተባለ አብራርተዋል መሆኑ ተገለጸ አስረድተዋል ተገልጿል
ማለቱ ተጨማሪ እባክህ የሚገኝ ተከናወነ እባክሽ ማድረግ ችግር አንጻር ማን
ትናንት እስኪደርስ ነበረች እንኳ ሰሞኑን ነበሩ እንኳን ሲሆን ነበር እዚሁ ሲል
ነው እንደገለጹት አለ እንደተናገሩት ቢሆን ነገር እንዳስረዱት ብለዋል ነገሮች
እንደገና ብዙ ናት ወቅት ቦታ ናቸው እንዲሁም በርካታ አሁን እንጂ እስከ
ማለት የሚሆኑት ስለማናቸውም ውስጥ ይሆናሉ ሲባል ከሆነው ስለዚሁ ከአንድ
ያልሆነ ሳለ የነበረውን ከአንዳንድ በማናቸውም በሙሉ የሆነው ያሉ በእነዚሁ
ወር መሆናቸው ከሌሎች በዋና አንዲት ወይም
በላይ እንደ በማቀድ ለሌሎች በሆኑ ቢሆንም ጊዜና ይሆኑበታል በሆነ አንዱ
ለዚህ ለሆነው ለነዚህ ከዚህ የሌላውን ሶስተኛ አንዳንድ ለማንኛውም የሆነ ከሁለት
የነገሩ ሰኣት አንደኛ እንዲሆን እንደነዚህ ማንኛውም ካልሆነ የሆኑት ጋር ቢያንስ
ይህንንም እነደሆነ እነዚህን ይኸው የማናቸውም
በሙሉም ይህችው በተለይም አንዱን የሚችለውን በነዚህ ከእነዚህ በሌላ
የዚሁ ከእነዚሁ ለዚሁ በሚገባ ለእያንዳንዱ የአንቀጹ ወደ ይህም ስለሆነ ወይ
ማናቸውንም ተብሎ እነዚህ መሆናቸውን የሆነችን ከአስር ሳይሆን ከዚያ የለውም
የማይበልጥ እንደሆነና እንዲሆኑ በሚችሉ ብቻ ብሎ ከሌላ የሌላቸውን
ለሆነ በሌሎች ሁለቱንም በቀር ይህ በታች አንደሆነ በነሱ
ይህን የሌላ እንዲህ ከሆነ ያላቸው በነዚሁ በሚል የዚህ ይህንኑ
በእንደዚህ ቁጥር ማናቸውም ሆነው ባሉ በዚህ በስተቀር ሲሆንና
በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም
ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ
በሙሉም ይህችው በተለይም አንዱን የሚችለውን በነዚህ ከእነዚህ በሌላ
የዚሁ ከእነዚሁ ለዚሁ በሚገባ ለእያንዳንዱ የአንቀጹ ወደ ይህም ስለሆነ ወይ
ማናቸውንም ተብሎ እነዚህ መሆናቸውን የሆነችን ከአስር ሳይሆን ከዚያ የለውም
የማይበልጥ እንደሆነና እንዲሆኑ በሚችሉ ብቻ ብሎ ከሌላ የሌላቸውን
ለሆነ በሌሎች ሁለቱንም በቀር ይህ በታች አንደሆነ በነሱ
ይህን የሌላ እንዲህ ከሆነ ያላቸው በነዚሁ በሚል የዚህ ይህንኑ
በእንደዚህ ቁጥር ማናቸውም ሆነው ባሉ በዚህ በስተቀር ሲሆንና
በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም
ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ
ያላቸውን እጅግ ሲሆኑ ለሆኑ ሊሆን ለማናቸውም
""".split()
)

View File

@@ -78,6 +78,7 @@ _ordinal_words = [
"bazillione",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
@@ -98,7 +99,7 @@ def like_num(text):
return True
if text_lower.endswith("th"):
if text_lower[:-2].isdigit():
return True
return True
return False

View File

@@ -16,4 +16,4 @@ _infixes = (
)
TOKENIZER_INFIXES = _infixes
TOKENIZER_INFIXES = _infixes

View File

@@ -1,18 +1,20 @@
# Stop words
STOP_WORDS = set("""
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
STOP_WORDS = set(
"""
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
sengwe fa go le jalo gongwe ba na mo tikologong
jaaka kwa morago nna gonne ka sa pele nako teng
jaaka kwa morago nna gonne ka sa pele nako teng
tlase fela ntle magareng tsona feta bobedi kgabaganya
moo gape kgatlhanong botlhe tsotlhe bokana e esi
setseng mororo dinako golo kgolo nnye wena gago
o ntse ntle tla goreng gangwe mang yotlhe gore
eo yona tseraganyo eng ne sentle re rona thata
godimo fitlha pedi masomamabedi lesomepedi mmogo
tharo tseo boraro tseno yone jaanong bobona bona
lesome tsaya tsamaiso nngwe masomethataro thataro
setseng mororo dinako golo kgolo nnye wena gago
o ntse ntle tla goreng gangwe mang yotlhe gore
eo yona tseraganyo eng ne sentle re rona thata
godimo fitlha pedi masomamabedi lesomepedi mmogo
tharo tseo boraro tseno yone jaanong bobona bona
lesome tsaya tsamaiso nngwe masomethataro thataro
tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
bonala e tshwanang bogolo tsenya tsweetswee karolo
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
tlhano lesometlhano botlalo lekgolo
""".split())
bonala e tshwanang bogolo tsenya tsweetswee karolo
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
tlhano lesometlhano botlalo lekgolo
""".split()
)

View File

@@ -76,7 +76,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
retokenizes=True,
)
def make_token_splitter(
nlp: Language, name: str, *, min_length=0, split_length=0,
nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
):
return TokenSplitter(min_length=min_length, split_length=split_length)
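For context, the token splitter above is normally added through the pipeline config rather than constructed directly. A minimal usage sketch, assuming the factory is registered as "token_splitter" with min_length and split_length as its config keys (the values here are illustrative):

import spacy

nlp = spacy.blank("en")
# Split any token of at least 20 characters into chunks of at most 5 characters.
nlp.add_pipe("token_splitter", config={"min_length": 20, "split_length": 5})
doc = nlp("Unreasonablylongtokenhere stays readable")
print([token.text for token in doc])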

View File

@@ -327,8 +327,10 @@ def test_phrase_matcher_sent_start(en_vocab, attr):
def test_span_in_phrasematcher(en_vocab):
"""Ensure that PhraseMatcher accepts Span and Doc as input"""
doc = Doc(en_vocab,
words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
# fmt: off
words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
# fmt: on
doc = Doc(en_vocab, words=words)
span = doc[:8]
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
matcher = PhraseMatcher(en_vocab)
@@ -341,10 +343,14 @@ def test_span_in_phrasematcher(en_vocab):
def test_span_v_doc_in_phrasematcher(en_vocab):
"""Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc"""
doc = Doc(en_vocab,
words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",",
"Spans", "and", "Docs", "in", "my", "matchers", ","
"and", "Spans", "and", "Docs", "everywhere" "."])
# fmt: off
words = [
"I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "Spans",
"and", "Docs", "in", "my", "matchers", "," "and", "Spans", "and", "Docs",
"everywhere", "."
]
# fmt: on
doc = Doc(en_vocab, words=words)
span = doc[9:15] # second clause
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
matcher = PhraseMatcher(en_vocab)

View File

@@ -0,0 +1,229 @@
import pytest
from spacy.lang.en import English
import numpy as np
import spacy
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher
from spacy.tokens import DocBin
from spacy.util import load_config_from_str
from spacy.training import Example
from spacy.training.initialize import init_nlp
import pickle
from ..util import make_tempdir
def test_issue6730(en_vocab):
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
from spacy.kb import KnowledgeBase
kb = KnowledgeBase(en_vocab, entity_vector_length=3)
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
with pytest.raises(ValueError):
kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
assert kb.contains_alias("") is False
kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
with make_tempdir() as tmp_dir:
kb.to_disk(tmp_dir)
kb.from_disk(tmp_dir)
assert kb.get_size_aliases() == 2
assert set(kb.get_alias_strings()) == {"x", "y"}
def test_issue6755(en_tokenizer):
doc = en_tokenizer("This is a magnificent sentence.")
span = doc[:0]
assert span.text_with_ws == ""
assert span.text == ""
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,label",
[("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
def test_issue6815_1(sentence, start_idx, end_idx, label):
nlp = English()
doc = nlp(sentence)
span = doc[:].char_span(start_idx, end_idx, label=label)
assert span.label_ == label
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
nlp = English()
doc = nlp(sentence)
span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
assert span.kb_id == kb_id
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,vector",
[("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
)
def test_issue6815_3(sentence, start_idx, end_idx, vector):
nlp = English()
doc = nlp(sentence)
span = doc[:].char_span(start_idx, end_idx, vector=vector)
assert (span.vector == vector).all()
def test_issue6839(en_vocab):
"""Ensure that PhraseMatcher accepts Span as input"""
# fmt: off
words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
# fmt: on
doc = Doc(en_vocab, words=words)
span = doc[:8]
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
matcher = PhraseMatcher(en_vocab)
matcher.add("SPACY", [pattern])
matches = matcher(span)
assert matches
CONFIG_ISSUE_6908 = """
[paths]
train = "TRAIN_PLACEHOLDER"
raw = null
init_tok2vec = null
vectors = null
[system]
seed = 0
gpu_allocator = null
[nlp]
lang = "en"
pipeline = ["textcat"]
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
[components]
[components.textcat]
factory = "TEXTCAT_PLACEHOLDER"
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
frozen_components = []
before_to_disk = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.components.textcat]
labels = ['label1', 'label2']
[initialize.tokenizer]
"""
@pytest.mark.parametrize(
"component_name", ["textcat", "textcat_multilabel"],
)
def test_issue6908(component_name):
"""Test intializing textcat with labels in a list"""
def create_data(out_file):
nlp = spacy.blank("en")
doc = nlp.make_doc("Some text")
doc.cats = {"label1": 0, "label2": 1}
out_data = DocBin(docs=[doc]).to_bytes()
with out_file.open("wb") as file_:
file_.write(out_data)
with make_tempdir() as tmp_path:
train_path = tmp_path / "train.spacy"
create_data(train_path)
config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name)
config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
config = load_config_from_str(config_str)
init_nlp(config)
CONFIG_ISSUE_6950 = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[components.ner]
factory = "ner"
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""
def test_issue6950():
"""Test that the nlp object with initialized tok2vec with listeners pickles
correctly (and doesn't have lambdas).
"""
nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
pickle.dumps(nlp)
nlp("hello")
pickle.dumps(nlp)

View File

@@ -1,23 +0,0 @@
import pytest
from ..util import make_tempdir
def test_issue6730(en_vocab):
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
from spacy.kb import KnowledgeBase
kb = KnowledgeBase(en_vocab, entity_vector_length=3)
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
with pytest.raises(ValueError):
kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
assert kb.contains_alias("") is False
kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
with make_tempdir() as tmp_dir:
kb.to_disk(tmp_dir)
kb.from_disk(tmp_dir)
assert kb.get_size_aliases() == 2
assert set(kb.get_alias_strings()) == {"x", "y"}

View File

@@ -1,5 +0,0 @@
def test_issue6755(en_tokenizer):
doc = en_tokenizer("This is a magnificent sentence.")
span = doc[:0]
assert span.text_with_ws == ""
assert span.text == ""

View File

@@ -1,35 +0,0 @@
import pytest
from spacy.lang.en import English
import numpy as np
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,label",
[("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
def test_char_span_label(sentence, start_idx, end_idx, label):
nlp = English()
doc = nlp(sentence)
span = doc[:].char_span(start_idx, end_idx, label=label)
assert span.label_ == label
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id):
nlp = English()
doc = nlp(sentence)
span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
assert span.kb_id == kb_id
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,vector",
[("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
)
def test_char_span_vector(sentence, start_idx, end_idx, vector):
nlp = English()
doc = nlp(sentence)
span = doc[:].char_span(start_idx, end_idx, vector=vector)
assert (span.vector == vector).all()

View File

@@ -1,15 +0,0 @@
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher
def test_span_in_phrasematcher(en_vocab):
"""Ensure that PhraseMatcher accepts Span as input"""
doc = Doc(en_vocab,
words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
span = doc[:8]
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
matcher = PhraseMatcher(en_vocab)
matcher.add("SPACY", [pattern])
matches = matcher(span)
assert matches

View File

@@ -1,102 +0,0 @@
import pytest
import spacy
from spacy.language import Language
from spacy.tokens import DocBin
from spacy import util
from spacy.schemas import ConfigSchemaInit
from spacy.training.initialize import init_nlp
from ..util import make_tempdir
TEXTCAT_WITH_LABELS_ARRAY_CONFIG = """
[paths]
train = "TRAIN_PLACEHOLDER"
raw = null
init_tok2vec = null
vectors = null
[system]
seed = 0
gpu_allocator = null
[nlp]
lang = "en"
pipeline = ["textcat"]
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
[components]
[components.textcat]
factory = "TEXTCAT_PLACEHOLDER"
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
frozen_components = []
before_to_disk = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.components.textcat]
labels = ['label1', 'label2']
[initialize.tokenizer]
"""
@pytest.mark.parametrize(
"component_name",
["textcat", "textcat_multilabel"],
)
def test_textcat_initialize_labels_validation(component_name):
"""Test intializing textcat with labels in a list"""
def create_data(out_file):
nlp = spacy.blank("en")
doc = nlp.make_doc("Some text")
doc.cats = {"label1": 0, "label2": 1}
out_data = DocBin(docs=[doc]).to_bytes()
with out_file.open("wb") as file_:
file_.write(out_data)
with make_tempdir() as tmp_path:
train_path = tmp_path / "train.spacy"
create_data(train_path)
config_str = TEXTCAT_WITH_LABELS_ARRAY_CONFIG.replace(
"TEXTCAT_PLACEHOLDER", component_name
)
config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
config = util.load_config_from_str(config_str)
init_nlp(config)

View File

@@ -1,59 +0,0 @@
from spacy.lang.en import English
from spacy.training import Example
from spacy.util import load_config_from_str
import pickle
CONFIG = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[components.ner]
factory = "ner"
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""
def test_issue6950():
"""Test that the nlp object with initialized tok2vec with listeners pickles
correctly (and doesn't have lambdas).
"""
nlp = English.from_config(load_config_from_str(CONFIG))
nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
pickle.dumps(nlp)
nlp("hello")
pickle.dumps(nlp)

View File

@@ -51,8 +51,7 @@ TRAIN_DATA = [
def test_issue7029():
"""Test that an empty document doesn't mess up an entire batch.
"""
"""Test that an empty document doesn't mess up an entire batch."""
nlp = English.from_config(load_config_from_str(CONFIG))
train_examples = []
for t in TRAIN_DATA:

View File

@@ -57,6 +57,7 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab):
assert en_vocab["dogs"].check_flag(is_len4) is True
en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT)
def test_vocab_lexeme_oov_rank(en_vocab):
"""Test that default rank is OOV_RANK."""
lex = en_vocab["word"]