Tidy up and auto-format

Ines Montani 2021-02-13 12:55:56 +11:00
parent 06e66d4ced
commit 9ba715ed16
15 changed files with 285 additions and 286 deletions

View File

@@ -78,6 +78,7 @@ _ordinal_words = [
     "bazillione",
 ]
 
+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
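Editor's note: for context, a hedged sketch of how a `like_num` lexical attribute of this shape typically behaves once the sign-stripping above runs; the `_num_words` list is a made-up stand-in, not taken from this file:

```python
# Minimal sketch of a spaCy-style like_num; _num_words is a hypothetical stand-in.
_num_words = ["zero", "un", "doî", "trei"]

def like_num(text):
    # Strip one leading sign so "+3" and "-3" are treated like "3".
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    return text.lower() in _num_words

assert like_num("+3") and like_num("10,000") and like_num("3/4")
```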

View File

@@ -1,5 +1,6 @@
 # Stop words
-STOP_WORDS = set("""
+STOP_WORDS = set(
+    """
 ke gareng ga selekanyo tlhwatlhwa yo mongwe se
 sengwe fa go le jalo gongwe ba na mo tikologong
 jaaka kwa morago nna gonne ka sa pele nako teng
@@ -15,4 +16,5 @@ tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
 bonala e tshwanang bogolo tsenya tsweetswee karolo
 sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
 tlhano lesometlhano botlalo lekgolo
-""".split())
+""".split()
+)
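Editor's note: as a usage sketch, here is how such a stop-word set is typically consumed; the Setswana import path `spacy.lang.tn.stop_words` is an assumption based on the content of this hunk, not confirmed by the diff:

```python
# Sketch, assuming this is the Setswana stop-word module (path assumed).
from spacy.lang.tn.stop_words import STOP_WORDS

print("ke" in STOP_WORDS)  # membership test; "ke" appears in the list above
print(len(STOP_WORDS))     # set() de-duplicates the whitespace-split tokens
```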

View File

@@ -76,7 +76,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     retokenizes=True,
 )
 def make_token_splitter(
-    nlp: Language, name: str, *, min_length=0, split_length=0,
+    nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
 ):
     return TokenSplitter(min_length=min_length, split_length=split_length)
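Editor's note: the signature above belongs to a pipeline-component factory; a hedged sketch of wiring it up via `add_pipe`, assuming the factory is registered as "token_splitter" (the decorator sits outside this hunk) and that tokens of at least `min_length` characters are split into `split_length`-character pieces:

```python
import spacy

nlp = spacy.blank("en")
# Split any token of >= 10 characters into pieces of at most 5 characters.
nlp.add_pipe("token_splitter", config={"min_length": 10, "split_length": 5})
doc = nlp("aaabbbcccddd is short")
print([t.text for t in doc])  # expected: ["aaabb", "bcccd", "dd", "is", "short"]
```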

View File

@@ -327,8 +327,10 @@ def test_phrase_matcher_sent_start(en_vocab, attr):
 def test_span_in_phrasematcher(en_vocab):
     """Ensure that PhraseMatcher accepts Span and Doc as input"""
-    doc = Doc(en_vocab,
-              words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
+    # fmt: off
+    words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     span = doc[:8]
     pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
     matcher = PhraseMatcher(en_vocab)
@@ -341,10 +343,14 @@ def test_span_in_phrasematcher(en_vocab):
 def test_span_v_doc_in_phrasematcher(en_vocab):
     """Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc"""
-    doc = Doc(en_vocab,
-              words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",",
-                     "Spans", "and", "Docs", "in", "my", "matchers", ","
-                     "and", "Spans", "and", "Docs", "everywhere" "."])
+    # fmt: off
+    words = [
+        "I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "Spans",
+        "and", "Docs", "in", "my", "matchers", ",", "and", "Spans", "and", "Docs",
+        "everywhere", "."
+    ]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     span = doc[9:15]  # second clause
     pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
     matcher = PhraseMatcher(en_vocab)
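Editor's note: restating what the second test checks, as a hedged standalone sketch (assuming a spaCy version where `PhraseMatcher` accepts a `Span`, which is exactly the behaviour these tests cover):

```python
import spacy
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",",
         "Spans", "and", "Docs", "in", "my", "matchers", ",",
         "and", "Spans", "and", "Docs", "everywhere", "."]
doc = Doc(nlp.vocab, words=words)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("SPACY", [Doc(nlp.vocab, words=["Spans", "and", "Docs"])])
assert len(matcher(doc)) == 3        # all three occurrences in the Doc
assert len(matcher(doc[9:15])) == 1  # only the hit inside the Span
```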

View File

@@ -0,0 +1,229 @@
import pytest
from spacy.lang.en import English
import numpy as np
import spacy
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher
from spacy.tokens import DocBin
from spacy.util import load_config_from_str
from spacy.training import Example
from spacy.training.initialize import init_nlp
import pickle
from ..util import make_tempdir


def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False
    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)
    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}


def test_issue6755(en_tokenizer):
    doc = en_tokenizer("This is a magnificent sentence.")
    span = doc[:0]
    assert span.text_with_ws == ""
    assert span.text == ""


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,label",
    [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
def test_issue6815_1(sentence, start_idx, end_idx, label):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, label=label)
    assert span.label_ == label


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
    assert span.kb_id == kb_id


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,vector",
    [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
)
def test_issue6815_3(sentence, start_idx, end_idx, vector):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, vector=vector)
    assert (span.vector == vector).all()
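Editor's note: for orientation, these three tests call `char_span` on `doc[:]`, i.e. on a `Span` covering the whole `Doc`, with character offsets into that span; a hedged sketch of the basic call:

```python
from spacy.lang.en import English

nlp = English()
doc = nlp("Welcome to Mumbai, my friend")
# Characters 11-17 cover "Mumbai"; label, kb_id and vector are optional extras.
span = doc[:].char_span(11, 17, label="GPE")
print(span.text, span.label_)  # Mumbai GPE
```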


def test_issue6839(en_vocab):
    """Ensure that PhraseMatcher accepts Span as input"""
    # fmt: off
    words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    span = doc[:8]
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches = matcher(span)
    assert matches


CONFIG_ISSUE_6908 = """
[paths]
train = "TRAIN_PLACEHOLDER"
raw = null
init_tok2vec = null
vectors = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat"]
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000

[components]

[components.textcat]
factory = "TEXTCAT_PLACEHOLDER"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
frozen_components = []
before_to_disk = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.components.textcat]
labels = ['label1', 'label2']

[initialize.tokenizer]
"""


@pytest.mark.parametrize(
    "component_name", ["textcat", "textcat_multilabel"],
)
def test_issue6908(component_name):
    """Test initializing textcat with labels in a list"""

    def create_data(out_file):
        nlp = spacy.blank("en")
        doc = nlp.make_doc("Some text")
        doc.cats = {"label1": 0, "label2": 1}
        out_data = DocBin(docs=[doc]).to_bytes()
        with out_file.open("wb") as file_:
            file_.write(out_data)

    with make_tempdir() as tmp_path:
        train_path = tmp_path / "train.spacy"
        create_data(train_path)
        config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name)
        config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
        config = load_config_from_str(config_str)
        init_nlp(config)


CONFIG_ISSUE_6950 = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""


def test_issue6950():
    """Test that the nlp object with initialized tok2vec with listeners pickles
    correctly (and doesn't have lambdas).
    """
    nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950))
    nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
    pickle.dumps(nlp)
    nlp("hello")
    pickle.dumps(nlp)
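Editor's note: background for the docstring's aside about lambdas, as a hedged illustration. `pickle` refuses lambdas, so any listener wiring that held on to one would make `pickle.dumps(nlp)` fail:

```python
import pickle

try:
    pickle.dumps(lambda x: x)
except Exception as err:  # PicklingError (or AttributeError, depending on version)
    print("lambdas don't pickle:", err)
```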

View File

@@ -1,23 +0,0 @@
import pytest

from ..util import make_tempdir


def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False
    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)
    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}

View File

@@ -1,5 +0,0 @@
def test_issue6755(en_tokenizer):
    doc = en_tokenizer("This is a magnificent sentence.")
    span = doc[:0]
    assert span.text_with_ws == ""
    assert span.text == ""

View File

@@ -1,35 +0,0 @@
import pytest
from spacy.lang.en import English
import numpy as np


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,label",
    [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
def test_char_span_label(sentence, start_idx, end_idx, label):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, label=label)
    assert span.label_ == label


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
    assert span.kb_id == kb_id


@pytest.mark.parametrize(
    "sentence, start_idx,end_idx,vector",
    [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
)
def test_char_span_vector(sentence, start_idx, end_idx, vector):
    nlp = English()
    doc = nlp(sentence)
    span = doc[:].char_span(start_idx, end_idx, vector=vector)
    assert (span.vector == vector).all()

View File

@@ -1,15 +0,0 @@
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher


def test_span_in_phrasematcher(en_vocab):
    """Ensure that PhraseMatcher accepts Span as input"""
    doc = Doc(en_vocab,
              words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."])
    span = doc[:8]
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches = matcher(span)
    assert matches

View File

@@ -1,102 +0,0 @@
import pytest
import spacy
from spacy.language import Language
from spacy.tokens import DocBin
from spacy import util
from spacy.schemas import ConfigSchemaInit
from spacy.training.initialize import init_nlp

from ..util import make_tempdir


TEXTCAT_WITH_LABELS_ARRAY_CONFIG = """
[paths]
train = "TRAIN_PLACEHOLDER"
raw = null
init_tok2vec = null
vectors = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat"]
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000

[components]

[components.textcat]
factory = "TEXTCAT_PLACEHOLDER"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths:train}

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
frozen_components = []
before_to_disk = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.components.textcat]
labels = ['label1', 'label2']

[initialize.tokenizer]
"""


@pytest.mark.parametrize(
    "component_name",
    ["textcat", "textcat_multilabel"],
)
def test_textcat_initialize_labels_validation(component_name):
    """Test initializing textcat with labels in a list"""

    def create_data(out_file):
        nlp = spacy.blank("en")
        doc = nlp.make_doc("Some text")
        doc.cats = {"label1": 0, "label2": 1}
        out_data = DocBin(docs=[doc]).to_bytes()
        with out_file.open("wb") as file_:
            file_.write(out_data)

    with make_tempdir() as tmp_path:
        train_path = tmp_path / "train.spacy"
        create_data(train_path)
        config_str = TEXTCAT_WITH_LABELS_ARRAY_CONFIG.replace(
            "TEXTCAT_PLACEHOLDER", component_name
        )
        config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
        config = util.load_config_from_str(config_str)
        init_nlp(config)

View File

@@ -1,59 +0,0 @@
from spacy.lang.en import English
from spacy.training import Example
from spacy.util import load_config_from_str

import pickle


CONFIG = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
"""


def test_issue6950():
    """Test that the nlp object with initialized tok2vec with listeners pickles
    correctly (and doesn't have lambdas).
    """
    nlp = English.from_config(load_config_from_str(CONFIG))
    nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})])
    pickle.dumps(nlp)
    nlp("hello")
    pickle.dumps(nlp)

View File

@@ -51,8 +51,7 @@ TRAIN_DATA = [
 def test_issue7029():
-    """Test that an empty document doesn't mess up an entire batch.
-    """
+    """Test that an empty document doesn't mess up an entire batch."""
     nlp = English.from_config(load_config_from_str(CONFIG))
     train_examples = []
     for t in TRAIN_DATA:
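Editor's note: the regression in plain terms, as a hedged sketch independent of the CONFIG used in this file: an empty string inside a batch should come out as an empty `Doc` without disturbing its neighbours:

```python
import spacy

nlp = spacy.blank("en")
docs = list(nlp.pipe(["Eat blue ham", "", "I like song"]))
assert len(docs) == 3 and docs[1].text == ""
```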

View File

@@ -57,6 +57,7 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab):
     assert en_vocab["dogs"].check_flag(is_len4) is True
     en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT)
 
+
 def test_vocab_lexeme_oov_rank(en_vocab):
     """Test that default rank is OOV_RANK."""
     lex = en_vocab["word"]
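Editor's note: for context, a hedged sketch of the default-rank behaviour this truncated hunk asserts; the import path for `OOV_RANK` is an assumption:

```python
from spacy.util import OOV_RANK  # assumed location of the constant
from spacy.vocab import Vocab

lex = Vocab()["word"]        # first access creates the lexeme
print(lex.rank == OOV_RANK)  # True: with no vectors loaded, rank defaults to OOV
```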