diff --git a/spacy/lang/am/stop_words.py b/spacy/lang/am/stop_words.py
index eaf318693..5487ada5a 100644
--- a/spacy/lang/am/stop_words.py
+++ b/spacy/lang/am/stop_words.py
@@ -4,30 +4,30 @@ STOP_WORDS = set(
     """
 ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን
-ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ 
-አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ 
-አሳስበዋል ላይ በተመሳሳይ አስፈላጊ ሌላ የተለያየ አስገነዘቡ ሌሎች የተለያዩ 
-አስገንዝበዋል ልዩ ተባለ አብራርተዋል መሆኑ ተገለጸ አስረድተዋል ተገልጿል 
-ማለቱ ተጨማሪ እባክህ የሚገኝ ተከናወነ እባክሽ ማድረግ ችግር አንጻር ማን 
-ትናንት እስኪደርስ ነበረች እንኳ ሰሞኑን ነበሩ እንኳን ሲሆን ነበር እዚሁ ሲል 
-ነው እንደገለጹት አለ ና እንደተናገሩት ቢሆን ነገር እንዳስረዱት ብለዋል ነገሮች 
-እንደገና ብዙ ናት ወቅት ቦታ ናቸው እንዲሁም በርካታ አሁን እንጂ እስከ 
-ማለት የሚሆኑት ስለማናቸውም ውስጥ ይሆናሉ ሲባል ከሆነው ስለዚሁ ከአንድ 
-ያልሆነ ሳለ የነበረውን ከአንዳንድ በማናቸውም በሙሉ የሆነው ያሉ በእነዚሁ 
-ወር መሆናቸው ከሌሎች በዋና አንዲት ወይም 
-በላይ እንደ በማቀድ ለሌሎች በሆኑ ቢሆንም ጊዜና ይሆኑበታል በሆነ አንዱ 
-ለዚህ ለሆነው ለነዚህ ከዚህ የሌላውን ሶስተኛ አንዳንድ ለማንኛውም የሆነ ከሁለት 
-የነገሩ ሰኣት አንደኛ እንዲሆን እንደነዚህ ማንኛውም ካልሆነ የሆኑት ጋር ቢያንስ 
+ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ
+አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ
+አሳስበዋል ላይ በተመሳሳይ አስፈላጊ ሌላ የተለያየ አስገነዘቡ ሌሎች የተለያዩ
+አስገንዝበዋል ልዩ ተባለ አብራርተዋል መሆኑ ተገለጸ አስረድተዋል ተገልጿል
+ማለቱ ተጨማሪ እባክህ የሚገኝ ተከናወነ እባክሽ ማድረግ ችግር አንጻር ማን
+ትናንት እስኪደርስ ነበረች እንኳ ሰሞኑን ነበሩ እንኳን ሲሆን ነበር እዚሁ ሲል
+ነው እንደገለጹት አለ ና እንደተናገሩት ቢሆን ነገር እንዳስረዱት ብለዋል ነገሮች
+እንደገና ብዙ ናት ወቅት ቦታ ናቸው እንዲሁም በርካታ አሁን እንጂ እስከ
+ማለት የሚሆኑት ስለማናቸውም ውስጥ ይሆናሉ ሲባል ከሆነው ስለዚሁ ከአንድ
+ያልሆነ ሳለ የነበረውን ከአንዳንድ በማናቸውም በሙሉ የሆነው ያሉ በእነዚሁ
+ወር መሆናቸው ከሌሎች በዋና አንዲት ወይም
+በላይ እንደ በማቀድ ለሌሎች በሆኑ ቢሆንም ጊዜና ይሆኑበታል በሆነ አንዱ
+ለዚህ ለሆነው ለነዚህ ከዚህ የሌላውን ሶስተኛ አንዳንድ ለማንኛውም የሆነ ከሁለት
+የነገሩ ሰኣት አንደኛ እንዲሆን እንደነዚህ ማንኛውም ካልሆነ የሆኑት ጋር ቢያንስ
 ይህንንም እነደሆነ እነዚህን ይኸው የማናቸውም
-በሙሉም ይህችው በተለይም አንዱን የሚችለውን በነዚህ ከእነዚህ በሌላ 
-የዚሁ ከእነዚሁ ለዚሁ በሚገባ ለእያንዳንዱ የአንቀጹ ወደ ይህም ስለሆነ ወይ 
-ማናቸውንም ተብሎ እነዚህ መሆናቸውን የሆነችን ከአስር ሳይሆን ከዚያ የለውም 
-የማይበልጥ እንደሆነና እንዲሆኑ በሚችሉ ብቻ ብሎ ከሌላ የሌላቸውን 
-ለሆነ በሌሎች ሁለቱንም በቀር ይህ በታች አንደሆነ በነሱ 
-ይህን የሌላ እንዲህ ከሆነ ያላቸው በነዚሁ በሚል የዚህ ይህንኑ 
-በእንደዚህ ቁጥር ማናቸውም ሆነው ባሉ በዚህ በስተቀር ሲሆንና 
-በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም 
-ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ 
+በሙሉም ይህችው በተለይም አንዱን የሚችለውን በነዚህ ከእነዚህ በሌላ
+የዚሁ ከእነዚሁ ለዚሁ በሚገባ ለእያንዳንዱ የአንቀጹ ወደ ይህም ስለሆነ ወይ
+ማናቸውንም ተብሎ እነዚህ መሆናቸውን የሆነችን ከአስር ሳይሆን ከዚያ የለውም
+የማይበልጥ እንደሆነና እንዲሆኑ በሚችሉ ብቻ ብሎ ከሌላ የሌላቸውን
+ለሆነ በሌሎች ሁለቱንም በቀር ይህ በታች አንደሆነ በነሱ
+ይህን የሌላ እንዲህ ከሆነ ያላቸው በነዚሁ በሚል የዚህ ይህንኑ
+በእንደዚህ ቁጥር ማናቸውም ሆነው ባሉ በዚህ በስተቀር ሲሆንና
+በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም
+ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ
 ያላቸውን እጅግ ሲሆኑ ለሆኑ ሊሆን ለማናቸውም
 """.split()
 )
diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py
index 33a16a09a..c136d0ab2 100644
--- a/spacy/lang/tn/lex_attrs.py
+++ b/spacy/lang/tn/lex_attrs.py
@@ -78,6 +78,7 @@ _ordinal_words = [
     "bazillione",
 ]
 
+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
@@ -98,7 +99,7 @@ def like_num(text):
         return True
     if text_lower.endswith("th"):
         if text_lower[:-2].isdigit():
-            return True 
+            return True
     return False
diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py
index 241ad39af..a52755564 100644
--- a/spacy/lang/tn/punctuation.py
+++ b/spacy/lang/tn/punctuation.py
@@ -16,4 +16,4 @@ _infixes = (
 )
 
 
-TOKENIZER_INFIXES = _infixes
\ No newline at end of file
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py
index a627ef362..f614771dd 100644
--- a/spacy/lang/tn/stop_words.py
+++ b/spacy/lang/tn/stop_words.py
@@ -1,18 +1,20 @@
 # Stop words
-STOP_WORDS = set("""
-ke gareng ga selekanyo tlhwatlhwa yo mongwe se 
+STOP_WORDS = set(
+    """
+ke gareng ga selekanyo tlhwatlhwa yo mongwe se
 sengwe fa go le jalo gongwe ba na mo tikologong
-jaaka kwa morago nna gonne ka sa pele nako teng 
+jaaka kwa morago nna gonne ka sa pele nako teng
 tlase fela ntle magareng tsona feta bobedi
 kgabaganya moo gape kgatlhanong botlhe tsotlhe
 bokana e esi
-setseng mororo dinako golo kgolo nnye wena gago 
-o ntse ntle tla goreng gangwe mang yotlhe gore 
-eo yona tseraganyo eng ne sentle re rona thata 
-godimo fitlha pedi masomamabedi lesomepedi mmogo 
-tharo tseo boraro tseno yone jaanong bobona bona 
-lesome tsaya tsamaiso nngwe masomethataro thataro 
+setseng mororo dinako golo kgolo nnye wena gago
+o ntse ntle tla goreng gangwe mang yotlhe gore
+eo yona tseraganyo eng ne sentle re rona thata
+godimo fitlha pedi masomamabedi lesomepedi mmogo
+tharo tseo boraro tseno yone jaanong bobona bona
+lesome tsaya tsamaiso nngwe masomethataro thataro
 tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
-bonala e tshwanang bogolo tsenya tsweetswee karolo 
-sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa 
-tlhano lesometlhano botlalo lekgolo 
-""".split())
+bonala e tshwanang bogolo tsenya tsweetswee karolo
+sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
+tlhano lesometlhano botlalo lekgolo
+""".split()
+)
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index cf8baf9da..03c7db422 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -76,7 +76,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     retokenizes=True,
 )
 def make_token_splitter(
-    nlp: Language, name: str, *, min_length=0, split_length=0,
+    nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
 ):
     return TokenSplitter(min_length=min_length, split_length=split_length)
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 230ca3b19..478949601 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -327,8 +327,10 @@ def test_phrase_matcher_sent_start(en_vocab, attr):
 
 def test_span_in_phrasematcher(en_vocab):
     """Ensure that PhraseMatcher accepts Span and Doc as input"""
-    doc = Doc(en_vocab,
-              words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
+    # fmt: off
+    words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     span = doc[:8]
     pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
     matcher = PhraseMatcher(en_vocab)
@@ -341,10 +343,14 @@ def test_span_v_doc_in_phrasematcher(en_vocab):
     """Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc"""
-    doc = Doc(en_vocab,
-              words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",",
-                     "Spans", "and", "Docs", "in", "my", "matchers", ","
-                     "and", "Spans", "and", "Docs", "everywhere" "."])
+    # fmt: off
+    words = [
+        "I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "Spans",
+        "and", "Docs", "in", "my", "matchers", ",", "and", "Spans", "and", "Docs",
+        "everywhere", "."
+    ]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     span = doc[9:15]  # second clause
     pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
     matcher = PhraseMatcher(en_vocab)
diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py
new file mode 100644
index 000000000..3007f1dc6
--- /dev/null
+++ b/spacy/tests/regression/test_issue6501-7000.py
@@ -0,0 +1,229 @@
+import pytest
+from spacy.lang.en import English
+import numpy as np
+import spacy
+from spacy.tokens import Doc
+from spacy.matcher import PhraseMatcher
+from spacy.tokens import DocBin
+from spacy.util import load_config_from_str
+from spacy.training import Example
+from spacy.training.initialize import init_nlp
+import pickle
+
+from ..util import make_tempdir
+
+
+def test_issue6730(en_vocab):
+    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
+    from spacy.kb import KnowledgeBase
+
+    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
+    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
+
+    with pytest.raises(ValueError):
+        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
+    assert kb.contains_alias("") is False
+
+    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
+    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
+
+    with make_tempdir() as tmp_dir:
+        kb.to_disk(tmp_dir)
+        kb.from_disk(tmp_dir)
+    assert kb.get_size_aliases() == 2
+    assert set(kb.get_alias_strings()) == {"x", "y"}
+
+
+def test_issue6755(en_tokenizer):
+    doc = en_tokenizer("This is a magnificent sentence.")
+    span = doc[:0]
+    assert span.text_with_ws == ""
+    assert span.text == ""
+
+
+@pytest.mark.parametrize(
+    "sentence, start_idx,end_idx,label",
+    [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
+)
+def test_issue6815_1(sentence, start_idx, end_idx, label):
+    nlp = English()
+    doc = nlp(sentence)
+    span = doc[:].char_span(start_idx, end_idx, label=label)
+    assert span.label_ == label
+
+
+@pytest.mark.parametrize(
+    "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
+)
+def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
+    nlp = English()
+    doc = nlp(sentence)
+    span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
+    assert span.kb_id == kb_id
+
+
+@pytest.mark.parametrize(
+    "sentence, start_idx,end_idx,vector",
+    [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
+)
+def test_issue6815_3(sentence, start_idx, end_idx, vector):
+    nlp = English()
+    doc = nlp(sentence)
+    span = doc[:].char_span(start_idx, end_idx, vector=vector)
+    assert (span.vector == vector).all()
+
+
+def test_issue6839(en_vocab):
+    """Ensure that PhraseMatcher accepts Span as input"""
+    # fmt: off
+    words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
+    span = doc[:8]
+    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("SPACY", [pattern])
+    matches = matcher(span)
+    assert matches
+
+
+CONFIG_ISSUE_6908 = """
+[paths]
+train = "TRAIN_PLACEHOLDER"
+raw = null
+init_tok2vec = null
+vectors = null
+
+[system]
+seed = 0
+gpu_allocator = null
+
+[nlp]
+lang = "en"
+pipeline = ["textcat"]
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+batch_size = 1000
+
+[components]
+
+[components.textcat]
+factory = "TEXTCAT_PLACEHOLDER"
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+
+
+[training]
+train_corpus = "corpora.train"
+dev_corpus = "corpora.dev"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+frozen_components = []
+before_to_disk = null
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.components.textcat]
+labels = ['label1', 'label2']
+
+[initialize.tokenizer]
+"""
+
+
+@pytest.mark.parametrize(
+    "component_name", ["textcat", "textcat_multilabel"],
+)
+def test_issue6908(component_name):
+    """Test initializing textcat with labels in a list"""
+
+    def create_data(out_file):
+        nlp = spacy.blank("en")
+        doc = nlp.make_doc("Some text")
+        doc.cats = {"label1": 0, "label2": 1}
+        out_data = DocBin(docs=[doc]).to_bytes()
+        with out_file.open("wb") as file_:
+            file_.write(out_data)
+
+    with make_tempdir() as tmp_path:
+        train_path = tmp_path / "train.spacy"
+        create_data(train_path)
+        config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name)
+        config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
+        config = load_config_from_str(config_str)
+        init_nlp(config)
+
+
+CONFIG_ISSUE_6950 = """
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tok2vec.model.encode:width}
+attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,2500,2500,2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[components.ner]
+factory = "ner"
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+upstream = "*"
+"""
+
+
+def test_issue6950():
+    """Test that the nlp object with initialized tok2vec with listeners pickles
+    correctly (and doesn't have lambdas).
+    """
+    nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
+    nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
+    pickle.dumps(nlp)
+    nlp("hello")
+    pickle.dumps(nlp)
diff --git a/spacy/tests/regression/test_issue6730.py b/spacy/tests/regression/test_issue6730.py
deleted file mode 100644
index 4c2979899..000000000
--- a/spacy/tests/regression/test_issue6730.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import pytest
-from ..util import make_tempdir
-
-
-def test_issue6730(en_vocab):
-    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
-    from spacy.kb import KnowledgeBase
-
-    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
-    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
-
-    with pytest.raises(ValueError):
-        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
-    assert kb.contains_alias("") is False
-
-    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
-    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
-
-    with make_tempdir() as tmp_dir:
-        kb.to_disk(tmp_dir)
-        kb.from_disk(tmp_dir)
-    assert kb.get_size_aliases() == 2
-    assert set(kb.get_alias_strings()) == {"x", "y"}
diff --git a/spacy/tests/regression/test_issue6755.py b/spacy/tests/regression/test_issue6755.py
deleted file mode 100644
index 15ddd6fbc..000000000
--- a/spacy/tests/regression/test_issue6755.py
+++ /dev/null
@@ -1,5 +0,0 @@
-def test_issue6755(en_tokenizer):
-    doc = en_tokenizer("This is a magnificent sentence.")
-    span = doc[:0]
-    assert span.text_with_ws == ""
-    assert span.text == ""
diff --git a/spacy/tests/regression/test_issue6815.py b/spacy/tests/regression/test_issue6815.py
deleted file mode 100644
index 7d523e00b..000000000
--- a/spacy/tests/regression/test_issue6815.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import pytest
-from spacy.lang.en import English
-import numpy as np
-
-
-@pytest.mark.parametrize(
-    "sentence, start_idx,end_idx,label",
-    [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
-)
-def test_char_span_label(sentence, start_idx, end_idx, label):
-    nlp = English()
-    doc = nlp(sentence)
-    span = doc[:].char_span(start_idx, end_idx, label=label)
-    assert span.label_ == label
-
-
-@pytest.mark.parametrize(
-    "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
-)
-def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id):
-    nlp = English()
-    doc = nlp(sentence)
-    span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
-    assert span.kb_id == kb_id
-
-
-@pytest.mark.parametrize(
-    "sentence, start_idx,end_idx,vector",
-    [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
-)
-def test_char_span_vector(sentence, start_idx, end_idx, vector):
-    nlp = English()
-    doc = nlp(sentence)
-    span = doc[:].char_span(start_idx, end_idx, vector=vector)
-    assert (span.vector == vector).all()
diff --git a/spacy/tests/regression/test_issue6839.py b/spacy/tests/regression/test_issue6839.py
deleted file mode 100644
index 2148cf867..000000000
--- a/spacy/tests/regression/test_issue6839.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from spacy.tokens import Doc
-from spacy.matcher import PhraseMatcher
-
-
-def test_span_in_phrasematcher(en_vocab):
-    """Ensure that PhraseMatcher accepts Span as input"""
-    doc = Doc(en_vocab,
-              words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
-    span = doc[:8]
-    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("SPACY", [pattern])
-    matches = matcher(span)
-    assert matches
-
diff --git a/spacy/tests/regression/test_issue6908.py b/spacy/tests/regression/test_issue6908.py
deleted file mode 100644
index a12ae9e13..000000000
--- a/spacy/tests/regression/test_issue6908.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import pytest
-import spacy
-from spacy.language import Language
-from spacy.tokens import DocBin
-from spacy import util
-from spacy.schemas import ConfigSchemaInit
-
-from spacy.training.initialize import init_nlp
-
-from ..util import make_tempdir
-
-TEXTCAT_WITH_LABELS_ARRAY_CONFIG = """
-[paths]
-train = "TRAIN_PLACEHOLDER"
-raw = null
-init_tok2vec = null
-vectors = null
-
-[system]
-seed = 0
-gpu_allocator = null
-
-[nlp]
-lang = "en"
-pipeline = ["textcat"]
-tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
-disabled = []
-before_creation = null
-after_creation = null
-after_pipeline_creation = null
-batch_size = 1000
-
-[components]
-
-[components.textcat]
-factory = "TEXTCAT_PLACEHOLDER"
-
-[corpora]
-
-[corpora.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths:train}
-
-[corpora.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths:train}
-
-
-[training]
-train_corpus = "corpora.train"
-dev_corpus = "corpora.dev"
-seed = ${system.seed}
-gpu_allocator = ${system.gpu_allocator}
-frozen_components = []
-before_to_disk = null
-
-[pretraining]
-
-[initialize]
-vectors = ${paths.vectors}
-init_tok2vec = ${paths.init_tok2vec}
-vocab_data = null
-lookups = null
-before_init = null
-after_init = null
-
-[initialize.components]
-
-[initialize.components.textcat]
-labels = ['label1', 'label2']
-
-[initialize.tokenizer]
-"""
-
-
-@pytest.mark.parametrize(
-    "component_name",
-    ["textcat", "textcat_multilabel"],
-)
-def test_textcat_initialize_labels_validation(component_name):
-    """Test intializing textcat with labels in a list"""
-
-    def create_data(out_file):
-        nlp = spacy.blank("en")
-        doc = nlp.make_doc("Some text")
-        doc.cats = {"label1": 0, "label2": 1}
-
-        out_data = DocBin(docs=[doc]).to_bytes()
-        with out_file.open("wb") as file_:
-            file_.write(out_data)
-
-    with make_tempdir() as tmp_path:
-        train_path = tmp_path / "train.spacy"
-        create_data(train_path)
-
-        config_str = TEXTCAT_WITH_LABELS_ARRAY_CONFIG.replace(
-            "TEXTCAT_PLACEHOLDER", component_name
-        )
-        config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
-
-        config = util.load_config_from_str(config_str)
-        init_nlp(config)
diff --git a/spacy/tests/regression/test_issue6950.py b/spacy/tests/regression/test_issue6950.py
deleted file mode 100644
index f9d75a4ff..000000000
--- a/spacy/tests/regression/test_issue6950.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from spacy.lang.en import English
-from spacy.training import Example
-from spacy.util import load_config_from_str
-import pickle
-
-
-CONFIG = """
-[nlp]
-lang = "en"
-pipeline = ["tok2vec", "tagger"]
-
-[components]
-
-[components.tok2vec]
-factory = "tok2vec"
-
-[components.tok2vec.model]
-@architectures = "spacy.Tok2Vec.v1"
-
-[components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
-width = ${components.tok2vec.model.encode:width}
-attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
-rows = [5000,2500,2500,2500]
-include_static_vectors = false
-
-[components.tok2vec.model.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
-width = 96
-depth = 4
-window_size = 1
-maxout_pieces = 3
-
-[components.ner]
-factory = "ner"
-
-[components.tagger]
-factory = "tagger"
-
-[components.tagger.model]
-@architectures = "spacy.Tagger.v1"
-nO = null
-
-[components.tagger.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -def test_issue6950(): - """Test that the nlp object with initialized tok2vec with listeners pickles - correctly (and doesn't have lambdas). - """ - nlp = English.from_config(load_config_from_str(CONFIG)) - nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) - pickle.dumps(nlp) - nlp("hello") - pickle.dumps(nlp) diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py index dcfb8d9e7..cee48522d 100644 --- a/spacy/tests/regression/test_issue7029.py +++ b/spacy/tests/regression/test_issue7029.py @@ -51,8 +51,7 @@ TRAIN_DATA = [ def test_issue7029(): - """Test that an empty document doesn't mess up an entire batch. - """ + """Test that an empty document doesn't mess up an entire batch.""" nlp = English.from_config(load_config_from_str(CONFIG)) train_examples = [] for t in TRAIN_DATA: diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index 4eeff5175..b6fee6628 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -57,6 +57,7 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab): assert en_vocab["dogs"].check_flag(is_len4) is True en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT) + def test_vocab_lexeme_oov_rank(en_vocab): """Test that default rank is OOV_RANK.""" lex = en_vocab["word"]