Tidy up and auto-format

Ines Montani 2021-02-13 12:55:56 +11:00
parent 06e66d4ced
commit 9ba715ed16
15 changed files with 285 additions and 286 deletions

View File

@@ -78,6 +78,7 @@ _ordinal_words = [
     "bazillione",
 ]
 
+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
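Editor's note: for context, a hedged sketch of how a `like_num` lexical attribute of this shape typically behaves once the sign-stripping above runs; the `_num_words` list is a made-up stand-in, not taken from this file:

```python
# Minimal sketch of a spaCy-style like_num; _num_words is a hypothetical stand-in.
_num_words = ["zero", "un", "doî", "trei"]

def like_num(text):
    # Strip one leading sign so "+3" and "-3" are treated like "3".
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    return text.lower() in _num_words

assert like_num("+3") and like_num("10,000") and like_num("3/4")
```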

View File

@@ -1,5 +1,6 @@
 # Stop words
-STOP_WORDS = set("""
+STOP_WORDS = set(
+    """
 ke gareng ga selekanyo tlhwatlhwa yo mongwe se
 sengwe fa go le jalo gongwe ba na mo tikologong
 jaaka kwa morago nna gonne ka sa pele nako teng
@@ -15,4 +16,5 @@ tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
 bonala e tshwanang bogolo tsenya tsweetswee karolo
 sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
 tlhano lesometlhano botlalo lekgolo
-""".split())
+""".split()
+)
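Editor's note: as a usage sketch, here is how such a stop-word set is typically consumed; the Setswana import path `spacy.lang.tn.stop_words` is an assumption based on the content of this hunk, not confirmed by the diff:

```python
# Sketch, assuming this is the Setswana stop-word module (path assumed).
from spacy.lang.tn.stop_words import STOP_WORDS

print("ke" in STOP_WORDS)  # membership test; "ke" appears in the list above
print(len(STOP_WORDS))     # set() de-duplicates the whitespace-split tokens
```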

View File

@@ -76,7 +76,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     retokenizes=True,
 )
 def make_token_splitter(
-    nlp: Language, name: str, *, min_length=0, split_length=0,
+    nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
 ):
     return TokenSplitter(min_length=min_length, split_length=split_length)
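Editor's note: the signature above belongs to a pipeline-component factory; a hedged sketch of wiring it up via `add_pipe`, assuming the factory is registered as "token_splitter" (the decorator sits outside this hunk) and that tokens of at least `min_length` characters are split into `split_length`-character pieces:

```python
import spacy

nlp = spacy.blank("en")
# Split any token of >= 10 characters into pieces of at most 5 characters.
nlp.add_pipe("token_splitter", config={"min_length": 10, "split_length": 5})
doc = nlp("aaabbbcccddd is short")
print([t.text for t in doc])  # expected: ["aaabb", "bcccd", "dd", "is", "short"]
```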

View File

@@ -327,8 +327,10 @@ def test_phrase_matcher_sent_start(en_vocab, attr):
 def test_span_in_phrasematcher(en_vocab):
     """Ensure that PhraseMatcher accepts Span and Doc as input"""
-    doc = Doc(en_vocab,
-              words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
+    # fmt: off
+    words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     span = doc[:8]
     pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
     matcher = PhraseMatcher(en_vocab)
@@ -341,10 +343,14 @@ def test_span_in_phrasematcher(en_vocab):
 def test_span_v_doc_in_phrasematcher(en_vocab):
     """Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc"""
-    doc = Doc(en_vocab,
-              words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",",
-                     "Spans", "and", "Docs", "in", "my", "matchers", ","
-                     "and", "Spans", "and", "Docs", "everywhere" "."])
+    # fmt: off
+    words = [
+        "I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "Spans",
+        "and", "Docs", "in", "my", "matchers", ",", "and", "Spans", "and", "Docs",
+        "everywhere", "."
+    ]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     span = doc[9:15]  # second clause
     pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
     matcher = PhraseMatcher(en_vocab)
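Editor's note: restating what the second test checks, as a hedged standalone sketch (assuming a spaCy version where `PhraseMatcher` accepts a `Span`, which is exactly the behaviour these tests cover):

```python
import spacy
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",",
         "Spans", "and", "Docs", "in", "my", "matchers", ",",
         "and", "Spans", "and", "Docs", "everywhere", "."]
doc = Doc(nlp.vocab, words=words)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("SPACY", [Doc(nlp.vocab, words=["Spans", "and", "Docs"])])
assert len(matcher(doc)) == 3        # all three occurrences in the Doc
assert len(matcher(doc[9:15])) == 1  # only the hit inside the Span
```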

View File

@@ -0,0 +1,229 @@
import pytest
from spacy.lang.en import English
import numpy as np
import spacy
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher
from spacy.tokens import DocBin
from spacy.util import load_config_from_str
from spacy.training import Example
from spacy.training.initialize import init_nlp
import pickle
from ..util import make_tempdir


def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False
    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)
    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}


def test_issue6755(en_tokenizer):
    doc = en_tokenizer("This is a magnificent sentence.")
    span = doc[:0]
    assert span.text_with_ws == ""
    assert span.text == ""


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,label",
    [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
def test_issue6815_1(sentence, start_idx, end_idx, label):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, label=label)
    assert span.label_ == label


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
    assert span.kb_id == kb_id


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,vector",
    [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
)
def test_issue6815_3(sentence, start_idx, end_idx, vector):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, vector=vector)
    assert (span.vector == vector).all()
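Editor's note: for orientation, these three tests call `char_span` on `doc[:]`, i.e. on a `Span` covering the whole `Doc`, with character offsets into that span; a hedged sketch of the basic call:

```python
from spacy.lang.en import English

nlp = English()
doc = nlp("Welcome to Mumbai, my friend")
# Characters 11-17 cover "Mumbai"; label, kb_id and vector are optional extras.
span = doc[:].char_span(11, 17, label="GPE")
print(span.text, span.label_)  # Mumbai GPE
```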


def test_issue6839(en_vocab):
    """Ensure that PhraseMatcher accepts Span as input"""
    # fmt: off
    words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    span = doc[:8]
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches = matcher(span)
    assert matches


CONFIG_ISSUE_6908 = """
[paths]
train = "TRAIN_PLACEHOLDER"
raw = null
init_tok2vec = null
vectors = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat"]
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000

[components]

[components.textcat]
factory = "TEXTCAT_PLACEHOLDER"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
frozen_components = []
before_to_disk = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.components.textcat]
labels = ['label1', 'label2']

[initialize.tokenizer]
"""


@pytest.mark.parametrize(
    "component_name", ["textcat", "textcat_multilabel"],
)
def test_issue6908(component_name):
    """Test initializing textcat with labels in a list"""

    def create_data(out_file):
        nlp = spacy.blank("en")
        doc = nlp.make_doc("Some text")
        doc.cats = {"label1": 0, "label2": 1}
        out_data = DocBin(docs=[doc]).to_bytes()
        with out_file.open("wb") as file_:
            file_.write(out_data)

    with make_tempdir() as tmp_path:
        train_path = tmp_path / "train.spacy"
        create_data(train_path)
        config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name)
        config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
        config = load_config_from_str(config_str)
        init_nlp(config)


CONFIG_ISSUE_6950 = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""


def test_issue6950():
    """Test that the nlp object with initialized tok2vec with listeners pickles
    correctly (and doesn't have lambdas).
    """
    nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
    nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
    pickle.dumps(nlp)
    nlp("hello")
    pickle.dumps(nlp)
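Editor's note: background for the docstring's aside about lambdas, as a hedged illustration. `pickle` refuses lambdas, so any listener wiring that held on to one would make `pickle.dumps(nlp)` fail:

```python
import pickle

try:
    pickle.dumps(lambda x: x)
except Exception as err:  # PicklingError (or AttributeError, depending on version)
    print("lambdas don't pickle:", err)
```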

View File

@@ -1,23 +0,0 @@
import pytest

from ..util import make_tempdir


def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False
    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)
    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}

View File

@@ -1,5 +0,0 @@
def test_issue6755(en_tokenizer):
    doc = en_tokenizer("This is a magnificent sentence.")
    span = doc[:0]
    assert span.text_with_ws == ""
    assert span.text == ""

View File

@@ -1,35 +0,0 @@
import pytest
from spacy.lang.en import English
import numpy as np


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,label",
    [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
def test_char_span_label(sentence, start_idx, end_idx, label):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, label=label)
    assert span.label_ == label


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
    assert span.kb_id == kb_id


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,vector",
    [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
)
def test_char_span_vector(sentence, start_idx, end_idx, vector):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, vector=vector)
    assert (span.vector == vector).all()

View File

@@ -1,15 +0,0 @@
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher


def test_span_in_phrasematcher(en_vocab):
    """Ensure that PhraseMatcher accepts Span as input"""
    doc = Doc(en_vocab,
              words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
    span = doc[:8]
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches = matcher(span)
    assert matches

View File

@@ -1,102 +0,0 @@
import pytest
import spacy
from spacy.language import Language
from spacy.tokens import DocBin
from spacy import util
from spacy.schemas import ConfigSchemaInit
from spacy.training.initialize import init_nlp

from ..util import make_tempdir


TEXTCAT_WITH_LABELS_ARRAY_CONFIG = """
[paths]
train = "TRAIN_PLACEHOLDER"
raw = null
init_tok2vec = null
vectors = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat"]
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000

[components]

[components.textcat]
factory = "TEXTCAT_PLACEHOLDER"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
frozen_components = []
before_to_disk = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.components.textcat]
labels = ['label1', 'label2']

[initialize.tokenizer]
"""


@pytest.mark.parametrize(
    "component_name",
    ["textcat", "textcat_multilabel"],
)
def test_textcat_initialize_labels_validation(component_name):
    """Test initializing textcat with labels in a list"""

    def create_data(out_file):
        nlp = spacy.blank("en")
        doc = nlp.make_doc("Some text")
        doc.cats = {"label1": 0, "label2": 1}
        out_data = DocBin(docs=[doc]).to_bytes()
        with out_file.open("wb") as file_:
            file_.write(out_data)

    with make_tempdir() as tmp_path:
        train_path = tmp_path / "train.spacy"
        create_data(train_path)
        config_str = TEXTCAT_WITH_LABELS_ARRAY_CONFIG.replace(
            "TEXTCAT_PLACEHOLDER", component_name
        )
        config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
        config = util.load_config_from_str(config_str)
        init_nlp(config)

View File

@@ -1,59 +0,0 @@
from spacy.lang.en import English
from spacy.training import Example
from spacy.util import load_config_from_str

import pickle


CONFIG = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""


def test_issue6950():
    """Test that the nlp object with initialized tok2vec with listeners pickles
    correctly (and doesn't have lambdas).
    """
    nlp = English.from_config(load_config_from_str(CONFIG))
    nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
    pickle.dumps(nlp)
    nlp("hello")
    pickle.dumps(nlp)

View File

@@ -51,8 +51,7 @@ TRAIN_DATA = [
 def test_issue7029():
-    """Test that an empty document doesn't mess up an entire batch.
-    """
+    """Test that an empty document doesn't mess up an entire batch."""
     nlp = English.from_config(load_config_from_str(CONFIG))
     train_examples = []
     for t in TRAIN_DATA:
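Editor's note: the regression in plain terms, as a hedged sketch independent of the CONFIG used in this file: an empty string inside a batch should come out as an empty `Doc` without disturbing its neighbours:

```python
import spacy

nlp = spacy.blank("en")
docs = list(nlp.pipe(["Eat blue ham", "", "I like song"]))
assert len(docs) == 3 and docs[1].text == ""
```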

View File

@@ -57,6 +57,7 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab):
     assert en_vocab["dogs"].check_flag(is_len4) is True
     en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT)
 
+
 def test_vocab_lexeme_oov_rank(en_vocab):
     """Test that default rank is OOV_RANK."""
     lex = en_vocab["word"]
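Editor's note: for context, a hedged sketch of the default-rank behaviour this truncated hunk asserts; the import path for `OOV_RANK` is an assumption:

```python
from spacy.util import OOV_RANK  # assumed location of the constant
from spacy.vocab import Vocab

lex = Vocab()["word"]        # first access creates the lexeme
print(lex.rank == OOV_RANK)  # True: with no vectors loaded, rank defaults to OOV
```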