mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 01:34:30 +03:00
Tidy up and auto-format
This commit is contained in:
parent
06e66d4ced
commit
9ba715ed16
|
@ -78,6 +78,7 @@ _ordinal_words = [
|
||||||
"bazillione",
|
"bazillione",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# Stop words
|
# Stop words
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
|
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
|
||||||
sengwe fa go le jalo gongwe ba na mo tikologong
|
sengwe fa go le jalo gongwe ba na mo tikologong
|
||||||
jaaka kwa morago nna gonne ka sa pele nako teng
|
jaaka kwa morago nna gonne ka sa pele nako teng
|
||||||
|
@ -15,4 +16,5 @@ tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
|
||||||
bonala e tshwanang bogolo tsenya tsweetswee karolo
|
bonala e tshwanang bogolo tsenya tsweetswee karolo
|
||||||
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
|
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
|
||||||
tlhano lesometlhano botlalo lekgolo
|
tlhano lesometlhano botlalo lekgolo
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -76,7 +76,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
|
||||||
retokenizes=True,
|
retokenizes=True,
|
||||||
)
|
)
|
||||||
def make_token_splitter(
|
def make_token_splitter(
|
||||||
nlp: Language, name: str, *, min_length=0, split_length=0,
|
nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
|
||||||
):
|
):
|
||||||
return TokenSplitter(min_length=min_length, split_length=split_length)
|
return TokenSplitter(min_length=min_length, split_length=split_length)
|
||||||
|
|
||||||
|
|
|
@ -327,8 +327,10 @@ def test_phrase_matcher_sent_start(en_vocab, attr):
|
||||||
|
|
||||||
def test_span_in_phrasematcher(en_vocab):
|
def test_span_in_phrasematcher(en_vocab):
|
||||||
"""Ensure that PhraseMatcher accepts Span and Doc as input"""
|
"""Ensure that PhraseMatcher accepts Span and Doc as input"""
|
||||||
doc = Doc(en_vocab,
|
# fmt: off
|
||||||
words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
|
words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
|
||||||
|
# fmt: on
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
span = doc[:8]
|
span = doc[:8]
|
||||||
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
|
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
@ -341,10 +343,14 @@ def test_span_in_phrasematcher(en_vocab):
|
||||||
|
|
||||||
def test_span_v_doc_in_phrasematcher(en_vocab):
|
def test_span_v_doc_in_phrasematcher(en_vocab):
|
||||||
"""Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc"""
|
"""Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc"""
|
||||||
doc = Doc(en_vocab,
|
# fmt: off
|
||||||
words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",",
|
words = [
|
||||||
"Spans", "and", "Docs", "in", "my", "matchers", ","
|
"I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "Spans",
|
||||||
"and", "Spans", "and", "Docs", "everywhere" "."])
|
"and", "Docs", "in", "my", "matchers", ",", "and", "Spans", "and", "Docs",
|
||||||
|
"everywhere", "."
|
||||||
|
]
|
||||||
|
# fmt: on
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
span = doc[9:15] # second clause
|
span = doc[9:15] # second clause
|
||||||
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
|
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
|
229
spacy/tests/regression/test_issue6501-7000.py
Normal file
229
spacy/tests/regression/test_issue6501-7000.py
Normal file
|
@ -0,0 +1,229 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.lang.en import English
|
||||||
|
import numpy as np
|
||||||
|
import spacy
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
from spacy.matcher import PhraseMatcher
|
||||||
|
from spacy.tokens import DocBin
|
||||||
|
from spacy.util import load_config_from_str
|
||||||
|
from spacy.training import Example
|
||||||
|
from spacy.training.initialize import init_nlp
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue6730(en_vocab):
|
||||||
|
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
|
||||||
|
from spacy.kb import KnowledgeBase
|
||||||
|
|
||||||
|
kb = KnowledgeBase(en_vocab, entity_vector_length=3)
|
||||||
|
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
|
||||||
|
assert kb.contains_alias("") is False
|
||||||
|
|
||||||
|
kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
|
||||||
|
kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
|
||||||
|
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
kb.to_disk(tmp_dir)
|
||||||
|
kb.from_disk(tmp_dir)
|
||||||
|
assert kb.get_size_aliases() == 2
|
||||||
|
assert set(kb.get_alias_strings()) == {"x", "y"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue6755(en_tokenizer):
|
||||||
|
doc = en_tokenizer("This is a magnificent sentence.")
|
||||||
|
span = doc[:0]
|
||||||
|
assert span.text_with_ws == ""
|
||||||
|
assert span.text == ""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"sentence, start_idx,end_idx,label",
|
||||||
|
[("Welcome to Mumbai, my friend", 11, 17, "GPE")],
|
||||||
|
)
|
||||||
|
def test_issue6815_1(sentence, start_idx, end_idx, label):
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(sentence)
|
||||||
|
span = doc[:].char_span(start_idx, end_idx, label=label)
|
||||||
|
assert span.label_ == label
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
|
||||||
|
)
|
||||||
|
def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(sentence)
|
||||||
|
span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
|
||||||
|
assert span.kb_id == kb_id
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"sentence, start_idx,end_idx,vector",
|
||||||
|
[("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
|
||||||
|
)
|
||||||
|
def test_issue6815_3(sentence, start_idx, end_idx, vector):
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(sentence)
|
||||||
|
span = doc[:].char_span(start_idx, end_idx, vector=vector)
|
||||||
|
assert (span.vector == vector).all()
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue6839(en_vocab):
|
||||||
|
"""Ensure that PhraseMatcher accepts Span as input"""
|
||||||
|
# fmt: off
|
||||||
|
words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
|
||||||
|
# fmt: on
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
span = doc[:8]
|
||||||
|
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
matcher.add("SPACY", [pattern])
|
||||||
|
matches = matcher(span)
|
||||||
|
assert matches
|
||||||
|
|
||||||
|
|
||||||
|
CONFIG_ISSUE_6908 = """
|
||||||
|
[paths]
|
||||||
|
train = "TRAIN_PLACEHOLDER"
|
||||||
|
raw = null
|
||||||
|
init_tok2vec = null
|
||||||
|
vectors = null
|
||||||
|
|
||||||
|
[system]
|
||||||
|
seed = 0
|
||||||
|
gpu_allocator = null
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["textcat"]
|
||||||
|
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||||
|
disabled = []
|
||||||
|
before_creation = null
|
||||||
|
after_creation = null
|
||||||
|
after_pipeline_creation = null
|
||||||
|
batch_size = 1000
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.textcat]
|
||||||
|
factory = "TEXTCAT_PLACEHOLDER"
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths:train}
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths:train}
|
||||||
|
|
||||||
|
|
||||||
|
[training]
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
seed = ${system.seed}
|
||||||
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
|
frozen_components = []
|
||||||
|
before_to_disk = null
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
vocab_data = null
|
||||||
|
lookups = null
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
||||||
|
[initialize.components]
|
||||||
|
|
||||||
|
[initialize.components.textcat]
|
||||||
|
labels = ['label1', 'label2']
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"component_name", ["textcat", "textcat_multilabel"],
|
||||||
|
)
|
||||||
|
def test_issue6908(component_name):
|
||||||
|
"""Test initializing textcat with labels in a list"""
|
||||||
|
|
||||||
|
def create_data(out_file):
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
doc = nlp.make_doc("Some text")
|
||||||
|
doc.cats = {"label1": 0, "label2": 1}
|
||||||
|
out_data = DocBin(docs=[doc]).to_bytes()
|
||||||
|
with out_file.open("wb") as file_:
|
||||||
|
file_.write(out_data)
|
||||||
|
|
||||||
|
with make_tempdir() as tmp_path:
|
||||||
|
train_path = tmp_path / "train.spacy"
|
||||||
|
create_data(train_path)
|
||||||
|
config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name)
|
||||||
|
config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
|
||||||
|
config = load_config_from_str(config_str)
|
||||||
|
init_nlp(config)
|
||||||
|
|
||||||
|
|
||||||
|
CONFIG_ISSUE_6950 = """
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["tok2vec", "tagger"]
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v1"
|
||||||
|
|
||||||
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
|
||||||
|
rows = [5000,2500,2500,2500]
|
||||||
|
include_static_vectors = false
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
width = 96
|
||||||
|
depth = 4
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
upstream = "*"
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue6950():
|
||||||
|
"""Test that the nlp object with initialized tok2vec with listeners pickles
|
||||||
|
correctly (and doesn't have lambdas).
|
||||||
|
"""
|
||||||
|
nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
|
||||||
|
nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
|
||||||
|
pickle.dumps(nlp)
|
||||||
|
nlp("hello")
|
||||||
|
pickle.dumps(nlp)
|
|
@ -1,23 +0,0 @@
|
||||||
import pytest
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue6730(en_vocab):
|
|
||||||
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
|
|
||||||
from spacy.kb import KnowledgeBase
|
|
||||||
|
|
||||||
kb = KnowledgeBase(en_vocab, entity_vector_length=3)
|
|
||||||
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
|
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
|
|
||||||
assert kb.contains_alias("") is False
|
|
||||||
|
|
||||||
kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
|
|
||||||
kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
|
|
||||||
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
kb.to_disk(tmp_dir)
|
|
||||||
kb.from_disk(tmp_dir)
|
|
||||||
assert kb.get_size_aliases() == 2
|
|
||||||
assert set(kb.get_alias_strings()) == {"x", "y"}
|
|
|
@ -1,5 +0,0 @@
|
||||||
def test_issue6755(en_tokenizer):
|
|
||||||
doc = en_tokenizer("This is a magnificent sentence.")
|
|
||||||
span = doc[:0]
|
|
||||||
assert span.text_with_ws == ""
|
|
||||||
assert span.text == ""
|
|
|
@ -1,35 +0,0 @@
|
||||||
import pytest
|
|
||||||
from spacy.lang.en import English
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"sentence, start_idx,end_idx,label",
|
|
||||||
[("Welcome to Mumbai, my friend", 11, 17, "GPE")],
|
|
||||||
)
|
|
||||||
def test_char_span_label(sentence, start_idx, end_idx, label):
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp(sentence)
|
|
||||||
span = doc[:].char_span(start_idx, end_idx, label=label)
|
|
||||||
assert span.label_ == label
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
|
|
||||||
)
|
|
||||||
def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id):
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp(sentence)
|
|
||||||
span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
|
|
||||||
assert span.kb_id == kb_id
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"sentence, start_idx,end_idx,vector",
|
|
||||||
[("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
|
|
||||||
)
|
|
||||||
def test_char_span_vector(sentence, start_idx, end_idx, vector):
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp(sentence)
|
|
||||||
span = doc[:].char_span(start_idx, end_idx, vector=vector)
|
|
||||||
assert (span.vector == vector).all()
|
|
|
@ -1,15 +0,0 @@
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.matcher import PhraseMatcher
|
|
||||||
|
|
||||||
|
|
||||||
def test_span_in_phrasematcher(en_vocab):
|
|
||||||
"""Ensure that PhraseMatcher accepts Span as input"""
|
|
||||||
doc = Doc(en_vocab,
|
|
||||||
words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
|
|
||||||
span = doc[:8]
|
|
||||||
pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
|
|
||||||
matcher = PhraseMatcher(en_vocab)
|
|
||||||
matcher.add("SPACY", [pattern])
|
|
||||||
matches = matcher(span)
|
|
||||||
assert matches
|
|
||||||
|
|
|
@ -1,102 +0,0 @@
|
||||||
import pytest
|
|
||||||
import spacy
|
|
||||||
from spacy.language import Language
|
|
||||||
from spacy.tokens import DocBin
|
|
||||||
from spacy import util
|
|
||||||
from spacy.schemas import ConfigSchemaInit
|
|
||||||
|
|
||||||
from spacy.training.initialize import init_nlp
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
TEXTCAT_WITH_LABELS_ARRAY_CONFIG = """
|
|
||||||
[paths]
|
|
||||||
train = "TRAIN_PLACEHOLDER"
|
|
||||||
raw = null
|
|
||||||
init_tok2vec = null
|
|
||||||
vectors = null
|
|
||||||
|
|
||||||
[system]
|
|
||||||
seed = 0
|
|
||||||
gpu_allocator = null
|
|
||||||
|
|
||||||
[nlp]
|
|
||||||
lang = "en"
|
|
||||||
pipeline = ["textcat"]
|
|
||||||
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
|
||||||
disabled = []
|
|
||||||
before_creation = null
|
|
||||||
after_creation = null
|
|
||||||
after_pipeline_creation = null
|
|
||||||
batch_size = 1000
|
|
||||||
|
|
||||||
[components]
|
|
||||||
|
|
||||||
[components.textcat]
|
|
||||||
factory = "TEXTCAT_PLACEHOLDER"
|
|
||||||
|
|
||||||
[corpora]
|
|
||||||
|
|
||||||
[corpora.train]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:train}
|
|
||||||
|
|
||||||
[corpora.dev]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:train}
|
|
||||||
|
|
||||||
|
|
||||||
[training]
|
|
||||||
train_corpus = "corpora.train"
|
|
||||||
dev_corpus = "corpora.dev"
|
|
||||||
seed = ${system.seed}
|
|
||||||
gpu_allocator = ${system.gpu_allocator}
|
|
||||||
frozen_components = []
|
|
||||||
before_to_disk = null
|
|
||||||
|
|
||||||
[pretraining]
|
|
||||||
|
|
||||||
[initialize]
|
|
||||||
vectors = ${paths.vectors}
|
|
||||||
init_tok2vec = ${paths.init_tok2vec}
|
|
||||||
vocab_data = null
|
|
||||||
lookups = null
|
|
||||||
before_init = null
|
|
||||||
after_init = null
|
|
||||||
|
|
||||||
[initialize.components]
|
|
||||||
|
|
||||||
[initialize.components.textcat]
|
|
||||||
labels = ['label1', 'label2']
|
|
||||||
|
|
||||||
[initialize.tokenizer]
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"component_name",
|
|
||||||
["textcat", "textcat_multilabel"],
|
|
||||||
)
|
|
||||||
def test_textcat_initialize_labels_validation(component_name):
|
|
||||||
"""Test intializing textcat with labels in a list"""
|
|
||||||
|
|
||||||
def create_data(out_file):
|
|
||||||
nlp = spacy.blank("en")
|
|
||||||
doc = nlp.make_doc("Some text")
|
|
||||||
doc.cats = {"label1": 0, "label2": 1}
|
|
||||||
|
|
||||||
out_data = DocBin(docs=[doc]).to_bytes()
|
|
||||||
with out_file.open("wb") as file_:
|
|
||||||
file_.write(out_data)
|
|
||||||
|
|
||||||
with make_tempdir() as tmp_path:
|
|
||||||
train_path = tmp_path / "train.spacy"
|
|
||||||
create_data(train_path)
|
|
||||||
|
|
||||||
config_str = TEXTCAT_WITH_LABELS_ARRAY_CONFIG.replace(
|
|
||||||
"TEXTCAT_PLACEHOLDER", component_name
|
|
||||||
)
|
|
||||||
config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
|
|
||||||
|
|
||||||
config = util.load_config_from_str(config_str)
|
|
||||||
init_nlp(config)
|
|
|
@ -1,59 +0,0 @@
|
||||||
from spacy.lang.en import English
|
|
||||||
from spacy.training import Example
|
|
||||||
from spacy.util import load_config_from_str
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
|
|
||||||
CONFIG = """
|
|
||||||
[nlp]
|
|
||||||
lang = "en"
|
|
||||||
pipeline = ["tok2vec", "tagger"]
|
|
||||||
|
|
||||||
[components]
|
|
||||||
|
|
||||||
[components.tok2vec]
|
|
||||||
factory = "tok2vec"
|
|
||||||
|
|
||||||
[components.tok2vec.model]
|
|
||||||
@architectures = "spacy.Tok2Vec.v1"
|
|
||||||
|
|
||||||
[components.tok2vec.model.embed]
|
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
|
|
||||||
rows = [5000,2500,2500,2500]
|
|
||||||
include_static_vectors = false
|
|
||||||
|
|
||||||
[components.tok2vec.model.encode]
|
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
|
||||||
width = 96
|
|
||||||
depth = 4
|
|
||||||
window_size = 1
|
|
||||||
maxout_pieces = 3
|
|
||||||
|
|
||||||
[components.ner]
|
|
||||||
factory = "ner"
|
|
||||||
|
|
||||||
[components.tagger]
|
|
||||||
factory = "tagger"
|
|
||||||
|
|
||||||
[components.tagger.model]
|
|
||||||
@architectures = "spacy.Tagger.v1"
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.tagger.model.tok2vec]
|
|
||||||
@architectures = "spacy.Tok2VecListener.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
upstream = "*"
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue6950():
|
|
||||||
"""Test that the nlp object with initialized tok2vec with listeners pickles
|
|
||||||
correctly (and doesn't have lambdas).
|
|
||||||
"""
|
|
||||||
nlp = English.from_config(load_config_from_str(CONFIG))
|
|
||||||
nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
|
|
||||||
pickle.dumps(nlp)
|
|
||||||
nlp("hello")
|
|
||||||
pickle.dumps(nlp)
|
|
|
@ -51,8 +51,7 @@ TRAIN_DATA = [
|
||||||
|
|
||||||
|
|
||||||
def test_issue7029():
|
def test_issue7029():
|
||||||
"""Test that an empty document doesn't mess up an entire batch.
|
"""Test that an empty document doesn't mess up an entire batch."""
|
||||||
"""
|
|
||||||
nlp = English.from_config(load_config_from_str(CONFIG))
|
nlp = English.from_config(load_config_from_str(CONFIG))
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for t in TRAIN_DATA:
|
for t in TRAIN_DATA:
|
||||||
|
|
|
@ -57,6 +57,7 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab):
|
||||||
assert en_vocab["dogs"].check_flag(is_len4) is True
|
assert en_vocab["dogs"].check_flag(is_len4) is True
|
||||||
en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT)
|
en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT)
|
||||||
|
|
||||||
|
|
||||||
def test_vocab_lexeme_oov_rank(en_vocab):
|
def test_vocab_lexeme_oov_rank(en_vocab):
|
||||||
"""Test that default rank is OOV_RANK."""
|
"""Test that default rank is OOV_RANK."""
|
||||||
lex = en_vocab["word"]
|
lex = en_vocab["word"]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user