mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Tidy up and auto-format
This commit is contained in:
parent
604be54a5c
commit
fa47f87924
|
@ -9,7 +9,8 @@ import sys
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt
|
||||||
from ..training import docs_to_json
|
from ..training import docs_to_json
|
||||||
from ..tokens import DocBin
|
from ..tokens import DocBin
|
||||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
|
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||||
|
from ..training.converters import conllu_to_docs
|
||||||
|
|
||||||
|
|
||||||
# Converters are matched by file extension except for ner/iob, which are
|
# Converters are matched by file extension except for ner/iob, which are
|
||||||
|
|
|
@ -27,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command(
|
@debug_cli.command(
|
||||||
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
||||||
)
|
)
|
||||||
@app.command(
|
@app.command(
|
||||||
"debug-data",
|
"debug-data",
|
||||||
|
|
|
@ -134,7 +134,7 @@ def update_dvc_config(
|
||||||
|
|
||||||
|
|
||||||
def run_dvc_commands(
|
def run_dvc_commands(
|
||||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
|
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||||
|
|
||||||
|
|
|
@ -3,8 +3,7 @@ from ...tokens import Token
|
||||||
|
|
||||||
|
|
||||||
class EnglishLemmatizer(Lemmatizer):
|
class EnglishLemmatizer(Lemmatizer):
|
||||||
"""English lemmatizer. Only overrides is_base_form.
|
"""English lemmatizer. Only overrides is_base_form."""
|
||||||
"""
|
|
||||||
|
|
||||||
def is_base_form(self, token: Token) -> bool:
|
def is_base_form(self, token: Token) -> bool:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -58,7 +58,7 @@ def noun_bounds(
|
||||||
doc, token, np_left_deps, np_right_deps, stop_deps
|
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||||
)
|
)
|
||||||
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
|
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
|
||||||
if list(filter(filter_func, doc[left_bound.i : right.i],)):
|
if list(filter(filter_func, doc[left_bound.i : right.i])):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
right_bound = right
|
right_bound = right
|
||||||
|
|
|
@ -108,8 +108,8 @@ _num_words = [
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
"""
|
"""
|
||||||
Check if text resembles a number
|
Check if text resembles a number
|
||||||
"""
|
"""
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(",", "").replace(".", "")
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
|
|
@ -17,7 +17,7 @@ use_pyvi = true
|
||||||
|
|
||||||
|
|
||||||
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
||||||
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
|
def create_vietnamese_tokenizer(use_pyvi: bool = True):
|
||||||
def vietnamese_tokenizer_factory(nlp):
|
def vietnamese_tokenizer_factory(nlp):
|
||||||
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
|
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
|
||||||
|
|
||||||
|
|
|
@ -1189,7 +1189,7 @@ class Language:
|
||||||
# These are the settings provided in the [initialize] block in the config
|
# These are the settings provided in the [initialize] block in the config
|
||||||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||||
init_vocab(
|
init_vocab(
|
||||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
|
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||||||
)
|
)
|
||||||
pretrain_cfg = config.get("pretraining")
|
pretrain_cfg = config.get("pretraining")
|
||||||
if pretrain_cfg:
|
if pretrain_cfg:
|
||||||
|
|
|
@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
|
||||||
|
|
||||||
|
|
||||||
def analyze_pipes(
|
def analyze_pipes(
|
||||||
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
|
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
|
||||||
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
|
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
|
||||||
"""Print a formatted summary for the current nlp object's pipeline. Shows
|
"""Print a formatted summary for the current nlp object's pipeline. Shows
|
||||||
a table with the pipeline components and why they assign and require, as
|
a table with the pipeline components and why they assign and require, as
|
||||||
|
|
|
@ -82,8 +82,7 @@ class AttributeRuler(Pipe):
|
||||||
matches = self.matcher(doc, allow_missing=True)
|
matches = self.matcher(doc, allow_missing=True)
|
||||||
# Sort by the attribute ID, so that later rules have precendence
|
# Sort by the attribute ID, so that later rules have precendence
|
||||||
matches = [
|
matches = [
|
||||||
(int(self.vocab.strings[m_id]), m_id, s, e)
|
(int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
|
||||||
for m_id, s, e in matches
|
|
||||||
]
|
]
|
||||||
matches.sort()
|
matches.sort()
|
||||||
for attr_id, match_id, start, end in matches:
|
for attr_id, match_id, start, end in matches:
|
||||||
|
@ -93,7 +92,7 @@ class AttributeRuler(Pipe):
|
||||||
try:
|
try:
|
||||||
# The index can be negative, which makes it annoying to do
|
# The index can be negative, which makes it annoying to do
|
||||||
# the boundscheck. Let Span do it instead.
|
# the boundscheck. Let Span do it instead.
|
||||||
token = span[index]
|
token = span[index] # noqa: F841
|
||||||
except IndexError:
|
except IndexError:
|
||||||
# The original exception is just our conditional logic, so we
|
# The original exception is just our conditional logic, so we
|
||||||
# raise from.
|
# raise from.
|
||||||
|
@ -103,7 +102,7 @@ class AttributeRuler(Pipe):
|
||||||
span=[t.text for t in span],
|
span=[t.text for t in span],
|
||||||
index=index,
|
index=index,
|
||||||
)
|
)
|
||||||
) from None
|
) from None
|
||||||
set_token_attrs(span[index], attrs)
|
set_token_attrs(span[index], attrs)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
|
@ -67,7 +67,7 @@ class Lemmatizer(Pipe):
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups:
|
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
|
||||||
"""Load and validate lookups tables. If the provided lookups is None,
|
"""Load and validate lookups tables. If the provided lookups is None,
|
||||||
load the default lookups tables according to the language and mode
|
load the default lookups tables according to the language and mode
|
||||||
settings. Confirm that all required tables for the language and mode
|
settings. Confirm that all required tables for the language and mode
|
||||||
|
|
|
@ -347,7 +347,7 @@ class TextCategorizer(Pipe):
|
||||||
get_examples: Callable[[], Iterable[Example]],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
nlp: Optional[Language] = None,
|
nlp: Optional[Language] = None,
|
||||||
labels: Optional[Dict] = None
|
labels: Optional[Dict] = None,
|
||||||
):
|
):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
|
@ -132,7 +132,7 @@ def validate_init_settings(
|
||||||
block = "initialize" if not section else f"initialize.{section}"
|
block = "initialize" if not section else f"initialize.{section}"
|
||||||
title = f"Error validating initialization settings in [{block}]"
|
title = f"Error validating initialization settings in [{block}]"
|
||||||
raise ConfigValidationError(
|
raise ConfigValidationError(
|
||||||
title=title, errors=e.errors(), config=settings, parent=name,
|
title=title, errors=e.errors(), config=settings, parent=name
|
||||||
) from None
|
) from None
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -32,9 +32,7 @@ class PRFScore:
|
||||||
|
|
||||||
def __add__(self, other):
|
def __add__(self, other):
|
||||||
return PRFScore(
|
return PRFScore(
|
||||||
tp=self.tp+other.tp,
|
tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
|
||||||
fp=self.fp+other.fp,
|
|
||||||
fn=self.fn+other.fn
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def score_set(self, cand: set, gold: set) -> None:
|
def score_set(self, cand: set, gold: set) -> None:
|
||||||
|
@ -485,7 +483,7 @@ class Scorer:
|
||||||
(pred_ent.start_char, pred_ent.end_char), None
|
(pred_ent.start_char, pred_ent.end_char), None
|
||||||
)
|
)
|
||||||
label = gold_span.label_
|
label = gold_span.label_
|
||||||
if not label in f_per_type:
|
if label not in f_per_type:
|
||||||
f_per_type[label] = PRFScore()
|
f_per_type[label] = PRFScore()
|
||||||
gold = gold_span.kb_id_
|
gold = gold_span.kb_id_
|
||||||
# only evaluating entities that overlap between gold and pred,
|
# only evaluating entities that overlap between gold and pred,
|
||||||
|
@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
|
||||||
continue
|
continue
|
||||||
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
|
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
|
||||||
align_x2y = eg.alignment.x2y
|
align_x2y = eg.alignment.x2y
|
||||||
preds = set()
|
|
||||||
for pred_ent in eg.x.ents:
|
for pred_ent in eg.x.ents:
|
||||||
if pred_ent.label_ not in scores:
|
if pred_ent.label_ not in scores:
|
||||||
scores[pred_ent.label_] = PRFScore()
|
scores[pred_ent.label_] = PRFScore()
|
||||||
|
|
|
@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab):
|
||||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||||
# heads override sent_starts
|
# heads override sent_starts
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
|
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4
|
||||||
)
|
)
|
||||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_de(de_tokenizer):
|
def test_noun_chunks_is_parsed_de(de_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = de_tokenizer("Er lag auf seinem")
|
doc = de_tokenizer("Er lag auf seinem")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_el(el_tokenizer):
|
def test_noun_chunks_is_parsed_el(el_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
|
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -7,8 +7,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed(en_tokenizer):
|
def test_noun_chunks_is_parsed(en_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = en_tokenizer("This is a sentence")
|
doc = en_tokenizer("This is a sentence")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_es(es_tokenizer):
|
def test_noun_chunks_is_parsed_es(es_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = es_tokenizer("en Oxford este verano")
|
doc = es_tokenizer("en Oxford este verano")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
|
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
|
|
||||||
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
|
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
|
|
|
@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
|
||||||
"text", ["janv.", "juill.", "Dr.", "av.", "sept."],
|
|
||||||
)
|
|
||||||
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
|
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
|
||||||
tokens = fr_tokenizer(text)
|
tokens = fr_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
doc = fr_tokenizer("trouver des travaux antérieurs")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_id(id_tokenizer):
|
def test_noun_chunks_is_parsed_id(id_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = id_tokenizer("sebelas")
|
doc = id_tokenizer("sebelas")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS,
|
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
|
||||||
)
|
)
|
||||||
def test_ja_tokenizer_sub_tokens(
|
def test_ja_tokenizer_sub_tokens(
|
||||||
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
|
||||||
|
|
|
@ -2,8 +2,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
|
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
|
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)],
|
"text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)]
|
||||||
)
|
)
|
||||||
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
|
def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
|
||||||
tokens = ne_tokenizer(text)
|
tokens = ne_tokenizer(text)
|
||||||
|
|
|
@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer):
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,length",
|
"text,length",
|
||||||
[
|
[
|
||||||
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,),
|
("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9),
|
||||||
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
|
("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
@ -3,8 +3,7 @@ from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
|
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
|
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed."""
|
||||||
"""
|
|
||||||
doc = sv_tokenizer("Studenten läste den bästa boken")
|
doc = sv_tokenizer("Studenten läste den bästa boken")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
|
@ -254,14 +254,12 @@ def test_vocab_serialization(nlp):
|
||||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||||
|
|
||||||
# adding entities
|
# adding entities
|
||||||
q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||||
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
||||||
q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||||
|
|
||||||
# adding aliases
|
# adding aliases
|
||||||
douglas_hash = mykb.add_alias(
|
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
||||||
alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
|
|
||||||
)
|
|
||||||
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||||
|
|
||||||
candidates = mykb.get_alias_candidates("adam")
|
candidates = mykb.get_alias_candidates("adam")
|
||||||
|
|
|
@ -226,6 +226,7 @@ def test_positive_class_not_binary():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
verify_textcat_config(nlp, pipe_config)
|
verify_textcat_config(nlp, pipe_config)
|
||||||
|
|
||||||
|
|
||||||
def test_textcat_evaluation():
|
def test_textcat_evaluation():
|
||||||
train_examples = []
|
train_examples = []
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
@ -241,15 +242,17 @@ def test_textcat_evaluation():
|
||||||
pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
|
pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
|
||||||
train_examples.append(Example(pred2, ref2))
|
train_examples.append(Example(pred2, ref2))
|
||||||
|
|
||||||
scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
|
scores = Scorer().score_cats(
|
||||||
assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
|
train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]
|
||||||
assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
|
)
|
||||||
|
assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2
|
||||||
|
assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1
|
||||||
assert scores["cats_f_per_type"]["summer"]["p"] == 0
|
assert scores["cats_f_per_type"]["summer"]["p"] == 0
|
||||||
assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
|
assert scores["cats_f_per_type"]["summer"]["r"] == 0 / 1
|
||||||
assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
|
assert scores["cats_f_per_type"]["spring"]["p"] == 1 / 1
|
||||||
assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
|
assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 2
|
||||||
assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
|
assert scores["cats_f_per_type"]["autumn"]["p"] == 2 / 2
|
||||||
assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
|
assert scores["cats_f_per_type"]["autumn"]["r"] == 2 / 2
|
||||||
|
|
||||||
assert scores["cats_micro_p"] == 4/5
|
assert scores["cats_micro_p"] == 4 / 5
|
||||||
assert scores["cats_micro_r"] == 4/6
|
assert scores["cats_micro_r"] == 4 / 6
|
||||||
|
|
|
@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
|
||||||
encode_config["width"] = width
|
encode_config["width"] = width
|
||||||
docs = get_batch(3)
|
docs = get_batch(3)
|
||||||
tok2vec = build_Tok2Vec_model(
|
tok2vec = build_Tok2Vec_model(
|
||||||
embed_arch(**embed_config),
|
embed_arch(**embed_config), encode_arch(**encode_config)
|
||||||
encode_arch(**encode_config)
|
|
||||||
)
|
)
|
||||||
tok2vec.initialize(docs)
|
tok2vec.initialize(docs)
|
||||||
vectors, backprop = tok2vec.begin_update(docs)
|
vectors, backprop = tok2vec.begin_update(docs)
|
||||||
|
|
|
@ -229,9 +229,7 @@ def test_issue3611():
|
||||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
nlp.update(
|
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
|
||||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue3625():
|
def test_issue3625():
|
||||||
|
@ -390,7 +388,7 @@ def test_issue3959():
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962(en_vocab):
|
def test_issue3962(en_vocab):
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
"""Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
# fmt: off
|
# fmt: off
|
||||||
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
||||||
|
@ -428,7 +426,7 @@ def test_issue3962(en_vocab):
|
||||||
|
|
||||||
|
|
||||||
def test_issue3962_long(en_vocab):
|
def test_issue3962_long(en_vocab):
|
||||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
"""Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||||
# fmt: off
|
# fmt: off
|
||||||
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
||||||
|
@ -463,8 +461,7 @@ def test_issue3962_long(en_vocab):
|
||||||
|
|
||||||
|
|
||||||
def test_issue3972(en_vocab):
|
def test_issue3972(en_vocab):
|
||||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
|
||||||
"""
|
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||||
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||||
|
|
|
@ -19,8 +19,7 @@ from ..util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
def test_issue4002(en_vocab):
|
def test_issue4002(en_vocab):
|
||||||
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
"""Test that the PhraseMatcher can match on overwritten NORM attributes."""
|
||||||
"""
|
|
||||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||||
pattern1 = Doc(en_vocab, words=["c", "d"])
|
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||||
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||||
|
@ -72,9 +71,7 @@ def test_issue4030():
|
||||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
nlp.update(
|
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
|
||||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
|
||||||
)
|
|
||||||
# processing of an empty doc should result in 0.0 for all categories
|
# processing of an empty doc should result in 0.0 for all categories
|
||||||
doc = nlp("")
|
doc = nlp("")
|
||||||
assert doc.cats["offensive"] == 0.0
|
assert doc.cats["offensive"] == 0.0
|
||||||
|
|
|
@ -7,7 +7,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
||||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||||
from spacy.cli._util import load_project_config, substitute_project_variables
|
from spacy.cli._util import load_project_config, substitute_project_variables
|
||||||
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
|
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
|
||||||
from thinc.api import ConfigValidationError, Config
|
from thinc.api import ConfigValidationError
|
||||||
import srsly
|
import srsly
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
|
@ -290,9 +290,7 @@ def test_spacy_blank():
|
||||||
assert nlp.meta["name"] == "my_custom_model"
|
assert nlp.meta["name"] == "my_custom_model"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
|
||||||
"value", [False, None, ["x", "y"], Language, Vocab],
|
|
||||||
)
|
|
||||||
def test_language_init_invalid_vocab(value):
|
def test_language_init_invalid_vocab(value):
|
||||||
err_fragment = "invalid value"
|
err_fragment = "invalid value"
|
||||||
with pytest.raises(ValueError) as e:
|
with pytest.raises(ValueError) as e:
|
||||||
|
|
|
@ -64,7 +64,7 @@ def get_tok2vec_kwargs():
|
||||||
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
|
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
|
||||||
),
|
),
|
||||||
"encode": MaxoutWindowEncoder(
|
"encode": MaxoutWindowEncoder(
|
||||||
width=32, depth=2, maxout_pieces=2, window_size=1,
|
width=32, depth=2, maxout_pieces=2, window_size=1
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -137,7 +137,7 @@ def test_las_per_type(en_vocab):
|
||||||
examples = []
|
examples = []
|
||||||
for input_, annot in test_las_apple:
|
for input_, annot in test_las_apple:
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
|
en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
|
||||||
)
|
)
|
||||||
gold = {"heads": annot["heads"], "deps": annot["deps"]}
|
gold = {"heads": annot["heads"], "deps": annot["deps"]}
|
||||||
example = Example.from_dict(doc, gold)
|
example = Example.from_dict(doc, gold)
|
||||||
|
|
|
@ -496,8 +496,10 @@ def test_make_orth_variants(doc):
|
||||||
output_file = tmpdir / "roundtrip.spacy"
|
output_file = tmpdir / "roundtrip.spacy"
|
||||||
DocBin(docs=[doc]).to_disk(output_file)
|
DocBin(docs=[doc]).to_disk(output_file)
|
||||||
# due to randomness, test only that this runs with no errors for now
|
# due to randomness, test only that this runs with no errors for now
|
||||||
reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5))
|
reader = Corpus(
|
||||||
train_examples = list(reader(nlp))
|
output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
|
||||||
|
)
|
||||||
|
list(reader(nlp))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip("Outdated")
|
@pytest.mark.skip("Outdated")
|
||||||
|
|
|
@ -23,7 +23,7 @@ def dont_augment(nlp, example):
|
||||||
yield example
|
yield example
|
||||||
|
|
||||||
|
|
||||||
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0):
|
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
|
||||||
if random.random() >= level:
|
if random.random() >= level:
|
||||||
yield example
|
yield example
|
||||||
else:
|
else:
|
||||||
|
@ -36,14 +36,14 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.
|
||||||
nlp,
|
nlp,
|
||||||
raw_text,
|
raw_text,
|
||||||
orig_dict["token_annotation"],
|
orig_dict["token_annotation"],
|
||||||
lower=raw_text is not None and random.random() < lower
|
lower=raw_text is not None and random.random() < lower,
|
||||||
)
|
)
|
||||||
doc = nlp.make_doc(variant_text)
|
doc = nlp.make_doc(variant_text)
|
||||||
orig_dict["token_annotation"] = variant_token_annot
|
orig_dict["token_annotation"] = variant_token_annot
|
||||||
yield example.from_dict(doc, orig_dict)
|
yield example.from_dict(doc, orig_dict)
|
||||||
|
|
||||||
|
|
||||||
def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False):
|
def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
|
||||||
orig_token_dict = copy.deepcopy(token_dict)
|
orig_token_dict = copy.deepcopy(token_dict)
|
||||||
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
|
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
|
||||||
ndsv = orth_variants.get("single", [])
|
ndsv = orth_variants.get("single", [])
|
||||||
|
|
|
@ -188,8 +188,8 @@ def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
|
||||||
|
|
||||||
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
|
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
|
||||||
"""RETURNS (List[str]): All sourced components in the original config,
|
"""RETURNS (List[str]): All sourced components in the original config,
|
||||||
e.g. {"source": "en_core_web_sm"}. If the config contains a key
|
e.g. {"source": "en_core_web_sm"}. If the config contains a key
|
||||||
"factory", we assume it refers to a component factory.
|
"factory", we assume it refers to a component factory.
|
||||||
"""
|
"""
|
||||||
return [
|
return [
|
||||||
name
|
name
|
||||||
|
|
|
@ -94,7 +94,7 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
|
||||||
|
|
||||||
|
|
||||||
def _resume_model(
|
def _resume_model(
|
||||||
model: Model, resume_path: Path, epoch_resume: int, silent: bool = True,
|
model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
|
||||||
) -> None:
|
) -> None:
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
msg.info(f"Resume training tok2vec from: {resume_path}")
|
||||||
|
|
|
@ -488,7 +488,7 @@ def load_config_from_str(
|
||||||
RETURNS (Config): The loaded config.
|
RETURNS (Config): The loaded config.
|
||||||
"""
|
"""
|
||||||
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
|
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
|
||||||
text, overrides=overrides, interpolate=interpolate,
|
text, overrides=overrides, interpolate=interpolate
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user