mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Tidy up and auto-format
This commit is contained in:
parent
604be54a5c
commit
fa47f87924
|
@ -9,7 +9,8 @@ import sys
|
|||
from ._util import app, Arg, Opt
|
||||
from ..training import docs_to_json
|
||||
from ..tokens import DocBin
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||
from ..training.converters import conllu_to_docs
|
||||
|
||||
|
||||
# Converters are matched by file extension except for ner/iob, which are
|
||||
|
|
|
@ -27,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000
|
|||
|
||||
|
||||
@debug_cli.command(
|
||||
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
||||
)
|
||||
@app.command(
|
||||
"debug-data",
|
||||
|
|
|
@ -134,7 +134,7 @@ def update_dvc_config(
|
|||
|
||||
|
||||
def run_dvc_commands(
|
||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
|
||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
|
||||
) -> None:
|
||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||
|
||||
|
|
|
@ -3,8 +3,7 @@ from ...tokens import Token
|
|||
|
||||
|
||||
class EnglishLemmatizer(Lemmatizer):
|
||||
"""English lemmatizer. Only overrides is_base_form.
|
||||
"""
|
||||
"""English lemmatizer. Only overrides is_base_form."""
|
||||
|
||||
def is_base_form(self, token: Token) -> bool:
|
||||
"""
|
||||
|
|
|
@ -58,7 +58,7 @@ def noun_bounds(
|
|||
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||
)
|
||||
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
|
||||
if list(filter(filter_func, doc[left_bound.i : right.i],)):
|
||||
if list(filter(filter_func, doc[left_bound.i : right.i])):
|
||||
break
|
||||
else:
|
||||
right_bound = right
|
||||
|
|
|
@ -108,8 +108,8 @@ _num_words = [
|
|||
|
||||
def like_num(text):
|
||||
"""
|
||||
Check if text resembles a number
|
||||
"""
|
||||
Check if text resembles a number
|
||||
"""
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
|
|
|
@ -17,7 +17,7 @@ use_pyvi = true
|
|||
|
||||
|
||||
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
||||
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
|
||||
def create_vietnamese_tokenizer(use_pyvi: bool = True):
|
||||
def vietnamese_tokenizer_factory(nlp):
|
||||
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
|
||||
|
||||
|
|
|
@ -1189,7 +1189,7 @@ class Language:
|
|||
# These are the settings provided in the [initialize] block in the config
|
||||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||
init_vocab(
|
||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
|
||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||||
)
|
||||
pretrain_cfg = config.get("pretraining")
|
||||
if pretrain_cfg:
|
||||
|
|
|
@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
|
|||
|
||||
|
||||
def analyze_pipes(
|
||||
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
|
||||
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
|
||||
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
|
||||
"""Print a formatted summary for the current nlp object's pipeline. Shows
|
||||
a table with the pipeline components and what they assign and require, as
|
||||
|
|
|
@ -82,8 +82,7 @@ class AttributeRuler(Pipe):
|
|||
matches = self.matcher(doc, allow_missing=True)
|
||||
# Sort by the attribute ID, so that later rules have precedence
|
||||
matches = [
|
||||
(int(self.vocab.strings[m_id]), m_id, s, e)
|
||||
for m_id, s, e in matches
|
||||
(int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
|
||||
]
|
||||
matches.sort()
|
||||
for attr_id, match_id, start, end in matches:
|
||||
|
@ -93,7 +92,7 @@ class AttributeRuler(Pipe):
|
|||
try:
|
||||
# The index can be negative, which makes it annoying to do
|
||||
# the boundscheck. Let Span do it instead.
|
||||
token = span[index]
|
||||
token = span[index] # noqa: F841
|
||||
except IndexError:
|
||||
# The original exception is just our conditional logic, so we
|
||||
# raise from.
|
||||
|
@ -103,7 +102,7 @@ class AttributeRuler(Pipe):
|
|||
span=[t.text for t in span],
|
||||
index=index,
|
||||
)
|
||||
) from None
|
||||
) from None
|
||||
set_token_attrs(span[index], attrs)
|
||||
return doc
|
||||
|
||||
|
|
|
@ -67,7 +67,7 @@ class Lemmatizer(Pipe):
|
|||
return {}
|
||||
|
||||
@classmethod
|
||||
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups:
|
||||
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
|
||||
"""Load and validate lookups tables. If the provided lookups is None,
|
||||
load the default lookups tables according to the language and mode
|
||||
settings. Confirm that all required tables for the language and mode
|
||||
|
|
|
@ -347,7 +347,7 @@ class TextCategorizer(Pipe):
|
|||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
labels: Optional[Dict] = None
|
||||
labels: Optional[Dict] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
|
|
@ -132,7 +132,7 @@ def validate_init_settings(
|
|||
block = "initialize" if not section else f"initialize.{section}"
|
||||
title = f"Error validating initialization settings in [{block}]"
|
||||
raise ConfigValidationError(
|
||||
title=title, errors=e.errors(), config=settings, parent=name,
|
||||
title=title, errors=e.errors(), config=settings, parent=name
|
||||
) from None
|
||||
|
||||
|
||||
|
|
|
@ -32,9 +32,7 @@ class PRFScore:
|
|||
|
||||
def __add__(self, other):
|
||||
return PRFScore(
|
||||
tp=self.tp+other.tp,
|
||||
fp=self.fp+other.fp,
|
||||
fn=self.fn+other.fn
|
||||
tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
|
||||
)
|
||||
|
||||
def score_set(self, cand: set, gold: set) -> None:
|
||||
|
@ -485,7 +483,7 @@ class Scorer:
|
|||
(pred_ent.start_char, pred_ent.end_char), None
|
||||
)
|
||||
label = gold_span.label_
|
||||
if not label in f_per_type:
|
||||
if label not in f_per_type:
|
||||
f_per_type[label] = PRFScore()
|
||||
gold = gold_span.kb_id_
|
||||
# only evaluating entities that overlap between gold and pred,
|
||||
|
@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
|
|||
continue
|
||||
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
|
||||
align_x2y = eg.alignment.x2y
|
||||
preds = set()
|
||||
for pred_ent in eg.x.ents:
|
||||
if pred_ent.label_ not in scores:
|
||||
scores[pred_ent.label_] = PRFScore()
|
||||
|
|
|
@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab):
|
|||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||
# heads override sent_starts
|
||||
doc = Doc(
|
||||
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
|
||||
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4
|
||||
)
|
||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_de(de_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed."""
|
||||
doc = de_tokenizer("Er lag auf seinem")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_el(el_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed."""
|
||||
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -7,8 +7,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed(en_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
|
||||
doc = en_tokenizer("This is a sentence")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_es(es_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
|
||||
doc = es_tokenizer("en Oxford este verano")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed."""
|
||||
|
||||
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
|
||||
with pytest.raises(ValueError):
|
||||
|
|
|
@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
|||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text", ["janv.", "juill.", "Dr.", "av.", "sept."],
|
||||
)
|
||||
@pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
|
||||
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
|
||||
tokens = fr_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_id(id_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed."""
|
||||
doc = id_tokenizer("sebelas")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS,
|
||||
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
|
||||
)
|
||||
def test_ja_tokenizer_sub_tokens(
|
||||
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed."""
|
||||
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer):
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)],
|
||||
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)]
|
||||
)
|
||||
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
|
||||
tokens = ne_tokenizer(text)
|
||||
|
|
|
@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer):
|
|||
@pytest.mark.parametrize(
|
||||
"text,length",
|
||||
[
|
||||
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,),
|
||||
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9),
|
||||
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
|
||||
],
|
||||
)
|
||||
|
|
|
@ -3,8 +3,7 @@ from spacy.tokens import Doc
|
|||
|
||||
|
||||
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
|
||||
"""
|
||||
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed."""
|
||||
doc = sv_tokenizer("Studenten läste den bästa boken")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -254,14 +254,12 @@ def test_vocab_serialization(nlp):
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
||||
# adding entities
|
||||
q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
||||
q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||
mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||
|
||||
# adding aliases
|
||||
douglas_hash = mykb.add_alias(
|
||||
alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
|
||||
)
|
||||
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
||||
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||
|
||||
candidates = mykb.get_alias_candidates("adam")
|
||||
|
|
|
@ -226,6 +226,7 @@ def test_positive_class_not_binary():
|
|||
with pytest.raises(ValueError):
|
||||
verify_textcat_config(nlp, pipe_config)
|
||||
|
||||
|
||||
def test_textcat_evaluation():
|
||||
train_examples = []
|
||||
nlp = English()
|
||||
|
@ -241,15 +242,17 @@ def test_textcat_evaluation():
|
|||
pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
|
||||
train_examples.append(Example(pred2, ref2))
|
||||
|
||||
scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
|
||||
assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
|
||||
assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
|
||||
scores = Scorer().score_cats(
|
||||
train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]
|
||||
)
|
||||
assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2
|
||||
assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1
|
||||
assert scores["cats_f_per_type"]["summer"]["p"] == 0
|
||||
assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
|
||||
assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
|
||||
assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
|
||||
assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
|
||||
assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
|
||||
assert scores["cats_f_per_type"]["summer"]["r"] == 0 / 1
|
||||
assert scores["cats_f_per_type"]["spring"]["p"] == 1 / 1
|
||||
assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 2
|
||||
assert scores["cats_f_per_type"]["autumn"]["p"] == 2 / 2
|
||||
assert scores["cats_f_per_type"]["autumn"]["r"] == 2 / 2
|
||||
|
||||
assert scores["cats_micro_p"] == 4/5
|
||||
assert scores["cats_micro_r"] == 4/6
|
||||
assert scores["cats_micro_p"] == 4 / 5
|
||||
assert scores["cats_micro_r"] == 4 / 6
|
||||
|
|
|
@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
|
|||
encode_config["width"] = width
|
||||
docs = get_batch(3)
|
||||
tok2vec = build_Tok2Vec_model(
|
||||
embed_arch(**embed_config),
|
||||
encode_arch(**encode_config)
|
||||
embed_arch(**embed_config), encode_arch(**encode_config)
|
||||
)
|
||||
tok2vec.initialize(docs)
|
||||
vectors, backprop = tok2vec.begin_update(docs)
|
||||
|
|
|
@ -229,9 +229,7 @@ def test_issue3611():
|
|||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||
|
||||
for batch in batches:
|
||||
nlp.update(
|
||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||
)
|
||||
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
|
||||
|
||||
|
||||
def test_issue3625():
|
||||
|
@ -390,7 +388,7 @@ def test_issue3959():
|
|||
|
||||
|
||||
def test_issue3962(en_vocab):
|
||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
"""Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||
# fmt: off
|
||||
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
||||
|
@ -428,7 +426,7 @@ def test_issue3962(en_vocab):
|
|||
|
||||
|
||||
def test_issue3962_long(en_vocab):
|
||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
"""Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||
# fmt: off
|
||||
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
||||
|
@ -463,8 +461,7 @@ def test_issue3962_long(en_vocab):
|
|||
|
||||
|
||||
def test_issue3972(en_vocab):
|
||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
||||
"""
|
||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
|
||||
matcher = PhraseMatcher(en_vocab)
|
||||
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||
|
|
|
@ -19,8 +19,7 @@ from ..util import make_tempdir
|
|||
|
||||
|
||||
def test_issue4002(en_vocab):
|
||||
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
||||
"""
|
||||
"""Test that the PhraseMatcher can match on overwritten NORM attributes."""
|
||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||
|
@ -72,9 +71,7 @@ def test_issue4030():
|
|||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||
|
||||
for batch in batches:
|
||||
nlp.update(
|
||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||
)
|
||||
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
|
||||
# processing of an empty doc should result in 0.0 for all categories
|
||||
doc = nlp("")
|
||||
assert doc.cats["offensive"] == 0.0
|
||||
|
|
|
@ -7,7 +7,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
|||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||
from spacy.cli._util import load_project_config, substitute_project_variables
|
||||
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
|
||||
from thinc.api import ConfigValidationError, Config
|
||||
from thinc.api import ConfigValidationError
|
||||
import srsly
|
||||
import os
|
||||
|
||||
|
|
|
@ -290,9 +290,7 @@ def test_spacy_blank():
|
|||
assert nlp.meta["name"] == "my_custom_model"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"value", [False, None, ["x", "y"], Language, Vocab],
|
||||
)
|
||||
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
|
||||
def test_language_init_invalid_vocab(value):
|
||||
err_fragment = "invalid value"
|
||||
with pytest.raises(ValueError) as e:
|
||||
|
|
|
@ -64,7 +64,7 @@ def get_tok2vec_kwargs():
|
|||
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
|
||||
),
|
||||
"encode": MaxoutWindowEncoder(
|
||||
width=32, depth=2, maxout_pieces=2, window_size=1,
|
||||
width=32, depth=2, maxout_pieces=2, window_size=1
|
||||
),
|
||||
}
|
||||
|
||||
|
|
|
@ -137,7 +137,7 @@ def test_las_per_type(en_vocab):
|
|||
examples = []
|
||||
for input_, annot in test_las_apple:
|
||||
doc = Doc(
|
||||
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
|
||||
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
|
||||
)
|
||||
gold = {"heads": annot["heads"], "deps": annot["deps"]}
|
||||
example = Example.from_dict(doc, gold)
|
||||
|
|
|
@ -496,8 +496,10 @@ def test_make_orth_variants(doc):
|
|||
output_file = tmpdir / "roundtrip.spacy"
|
||||
DocBin(docs=[doc]).to_disk(output_file)
|
||||
# due to randomness, test only that this runs with no errors for now
|
||||
reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5))
|
||||
train_examples = list(reader(nlp))
|
||||
reader = Corpus(
|
||||
output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
|
||||
)
|
||||
list(reader(nlp))
|
||||
|
||||
|
||||
@pytest.mark.skip("Outdated")
|
||||
|
|
|
@ -23,7 +23,7 @@ def dont_augment(nlp, example):
|
|||
yield example
|
||||
|
||||
|
||||
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0):
|
||||
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
|
||||
if random.random() >= level:
|
||||
yield example
|
||||
else:
|
||||
|
@ -36,14 +36,14 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.
|
|||
nlp,
|
||||
raw_text,
|
||||
orig_dict["token_annotation"],
|
||||
lower=raw_text is not None and random.random() < lower
|
||||
lower=raw_text is not None and random.random() < lower,
|
||||
)
|
||||
doc = nlp.make_doc(variant_text)
|
||||
orig_dict["token_annotation"] = variant_token_annot
|
||||
yield example.from_dict(doc, orig_dict)
|
||||
|
||||
|
||||
def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False):
|
||||
def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
|
||||
orig_token_dict = copy.deepcopy(token_dict)
|
||||
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
|
||||
ndsv = orth_variants.get("single", [])
|
||||
|
|
|
@ -188,8 +188,8 @@ def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
|
|||
|
||||
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
|
||||
"""RETURNS (List[str]): All sourced components in the original config,
|
||||
e.g. {"source": "en_core_web_sm"}. If the config contains a key
|
||||
"factory", we assume it refers to a component factory.
|
||||
e.g. {"source": "en_core_web_sm"}. If the config contains a key
|
||||
"factory", we assume it refers to a component factory.
|
||||
"""
|
||||
return [
|
||||
name
|
||||
|
|
|
@ -94,7 +94,7 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
|
|||
|
||||
|
||||
def _resume_model(
|
||||
model: Model, resume_path: Path, epoch_resume: int, silent: bool = True,
|
||||
model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
|
||||
) -> None:
|
||||
msg = Printer(no_print=silent)
|
||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
||||
|
|
|
@ -488,7 +488,7 @@ def load_config_from_str(
|
|||
RETURNS (Config): The loaded config.
|
||||
"""
|
||||
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
|
||||
text, overrides=overrides, interpolate=interpolate,
|
||||
text, overrides=overrides, interpolate=interpolate
|
||||
)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user