Mirror of https://github.com/explosion/spaCy.git
Commit 1114219ae3 (parent 9d32cac736): Tidy up and auto-format
@@ -6,7 +6,6 @@ from wasabi import msg
 import srsly
 import hashlib
 import typer
-import subprocess
 from click import NoSuchOption
 from typer.main import get_command
 from contextlib import contextmanager
@@ -327,7 +326,7 @@ def git_checkout(
     )
     with make_tempdir() as tmp_dir:
         cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
-        ret = run_command(cmd, capture=True)
+        run_command(cmd, capture=True)
         # We need Path(name) to make sure we also support subdirectories
         shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
 
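Note: the dropped `ret =` assignment was simply unused. For context, `make_tempdir` is the `spacy.util` context manager this hunk relies on; a minimal sketch of the pattern it implements (assumed, not spaCy's exact code):

from contextlib import contextmanager
from pathlib import Path
import shutil
import tempfile

@contextmanager
def make_tempdir():
    # Create a unique temporary directory and hand it to the caller as a Path.
    tmp_dir = Path(tempfile.mkdtemp())
    try:
        yield tmp_dir
    finally:
        # Clean up even if the block using the directory raises.
        shutil.rmtree(str(tmp_dir))

Because the cleanup runs in `finally`, the cloned working tree never outlives the `with` block.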
@@ -156,11 +156,7 @@ class Language:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(
-                self.lang,
-                self.Defaults,
-                vectors_name=vectors_name,
-            )
+            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -1462,7 +1458,7 @@ class Language:
         # here :(
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i+1:]:
+                for name2, proc2 in self.pipeline[i + 1 :]:
                     if isinstance(getattr(proc2, "model", None), Model):
                         proc1.find_listeners(proc2.model)
 
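Only the slice spacing changed here (`[i+1:]` to `[i + 1 :]`, black's style for slices with complex expressions). The loop pairs every component with the components after it; a toy illustration of that traversal in plain Python (not spaCy's listener API):

pipeline = [("tok2vec", object()), ("tagger", object()), ("parser", object())]
for i, (name1, proc1) in enumerate(pipeline):
    for name2, proc2 in pipeline[i + 1 :]:
        # Each earlier component inspects every later one; this is how a
        # tok2vec-style component discovers listener layers downstream.
        print(f"{name1} checks {name2}")  # tok2vec->tagger, tok2vec->parser, tagger->parser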
@@ -164,7 +164,9 @@ def MultiHashEmbed(
 
 
 @registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
+def CharacterEmbed(
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is
@@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                 ),
                 StaticVectors(width, dropout=0.0),
             ),
-            with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
-            )
+        )
     else:
         model = chain(
             concatenate(
@@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
             ),
-            with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
-            )
+        )
     return model
 
 
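A hedged usage sketch for the architecture reformatted above, assuming this commit's `spacy.CharacterEmbed.v1` signature; the hyperparameter values are invented for illustration, not defaults:

from thinc.api import Model
from spacy.ml.models.tok2vec import CharacterEmbed

model: Model = CharacterEmbed(
    width=128,  # output width of the layer
    rows=7000,  # rows in the NORM hash embedding table
    nM=64,  # width of each character embedding
    nC=8,  # characters taken from the start and end of each word
    also_use_static_vectors=False,
)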
@@ -1,4 +1,4 @@
-from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
@@ -255,7 +255,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-
+
     # TODO: use a more detailed schema for this?
     objective: Dict[str, Any] = Field(..., title="Pretraining objective")
     # fmt: on
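The changed pair above appears to be a whitespace-only fix on the blank line. For readers new to the schema style: `Field(..., title=...)` declares a required value with a human-readable description. A minimal self-contained sketch of the same pydantic pattern (class name invented):

from typing import Any, Dict
from pydantic import BaseModel, Field

class PretrainSectionSketch(BaseModel):
    component: str = Field(..., title="Component to find the layer to pretrain")
    layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
    objective: Dict[str, Any] = Field(..., title="Pretraining objective")

# Omitting a required field fails loudly:
# PretrainSectionSketch(component="tok2vec", layer="")  # -> ValidationError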
@@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
 def test_spans_lca_matrix(en_tokenizer):
     """Test span's lca matrix generation"""
     tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=[2, 1, 1, 0],
+        deps=["dep"] * 4,
+    )
     lca = doc[:2].get_lca_matrix()
     assert lca.shape == (2, 2)
     assert lca[0, 0] == 0  # the & the -> the
@@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
 
 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=["dep"] * len(heads),
+    )
 
     lefts = {}
     rights = {}
@@ -345,10 +345,7 @@ def test_language_factories_invalid():
             [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
             {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        (
-            [{"a": 0.5, "b": 0.5}, {"b": 1.0}],
-            {"a": 0.25, "b": 0.75},
-        ),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
     ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
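The expected dicts encode the rule the test asserts: each component's score weights are normalized to sum to 1, then averaged across components. A hedged re-derivation of both parametrized cases (the real implementation is `spacy.util.combine_score_weights`, which may treat edge cases differently):

def combined(weight_dicts):
    # Normalize each component's weights to sum to 1, then average per key.
    out = {}
    for weights in weight_dicts:
        total = sum(weights.values())
        for key, value in weights.items():
            out[key] = out.get(key, 0.0) + (value / total) / len(weight_dicts)
    return out

assert combined([{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}]) == {
    "a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25,
}
assert combined([{"a": 0.5, "b": 0.5}, {"b": 1.0}]) == {"a": 0.25, "b": 0.75}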
@@ -363,16 +360,10 @@ def test_language_factories_scores():
     weights1 = {"a1": 0.5, "a2": 0.5}
     weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
     Language.factory(
-        f"{name}1",
-        scores=list(weights1),
-        default_score_weights=weights1,
-        func=func,
+        f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
     )
     Language.factory(
-        f"{name}2",
-        scores=list(weights2),
-        default_score_weights=weights2,
-        func=func,
+        f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
     )
     meta1 = Language.get_factory_meta(f"{name}1")
     assert meta1.default_score_weights == weights1
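For context, here is the reformatted registration call as a standalone, hedged sketch; the component name, weights, and no-op `func` are invented, and the `scores` argument reflects this commit's API:

from spacy.language import Language

def func(nlp, name):
    # A do-nothing component factory, standing in for the test's fixture.
    return lambda doc: doc

weights = {"my_metric": 1.0}
Language.factory(
    "my_component", scores=list(weights), default_score_weights=weights, func=func,
)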
@@ -212,9 +212,17 @@ def test_issue1834():
         heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
         deps=["dep"] * len(words),
     )
-    print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
+    print(
+        doc.has_annotation("DEP"),
+        [t.head.i for t in doc],
+        [t.is_sent_start for t in doc],
+    )
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
+    print(
+        new_doc.has_annotation("DEP"),
+        [t.head.i for t in new_doc],
+        [t.is_sent_start for t in new_doc],
+    )
     assert new_doc[6].sent_start
     assert new_doc.has_annotation("DEP")
     assert new_doc.has_annotation("TAG")
@@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
-    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None)
+    textcat = TextCategorizer(
+        en_vocab,
+        model,
+        labels=["ENTITY", "ACTION", "MODIFIER"],
+        threshold=0.5,
+        positive_label=None,
+    )
     textcat.to_bytes(exclude=["vocab"])
 
 
@@ -3,7 +3,6 @@ from click import NoSuchOption
 from spacy.training import docs_to_json, biluo_tags_from_offsets
 from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
-
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
@@ -291,8 +291,7 @@ def test_spacy_blank():
 
 
 @pytest.mark.parametrize(
-    "value",
-    [False, None, ["x", "y"], Language, Vocab],
+    "value", [False, None, ["x", "y"], Language, Vocab],
 )
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"
@@ -95,7 +95,7 @@ def test_util_dot_section():
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
     assert en_config["nlp"]["pipeline"] == ["textcat"]
-    assert nl_config["nlp"]["pipeline"] == [] # default value []
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
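The changed assert above differs only in the spacing before the inline comment. For context, `dot_to_object` resolves a dotted path against a nested config dict and raises `KeyError` for paths that do not exist, which is exactly what the test checks:

from spacy.util import dot_to_object

config = {"nlp": {"lang": "en", "pipeline": ["textcat"]}}
assert dot_to_object(config, "nlp.pipeline") == ["textcat"]
# "nlp.pipeline.tagger" would raise KeyError: list items are not addressable by name.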
@@ -1,7 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
-
 from spacy import Language
 from spacy.util import load_model_from_config, registry, dot_to_object
 from spacy.training import Example
@@ -10,19 +9,19 @@ from spacy.training import Example
 def test_readers():
     config_string = """
     [training]
-
+
     [corpora]
     @readers = "myreader.v1"
 
     [nlp]
     lang = "en"
     pipeline = ["tok2vec", "textcat"]
-
+
     [components]
-
+
     [components.tok2vec]
     factory = "tok2vec"
-
+
     [components.textcat]
     factory = "textcat"
     """
@@ -69,19 +68,19 @@ def test_readers():
 def test_cat_readers(reader, additional_config):
     nlp_config_string = """
     [training]
-
+
     [corpora]
     @readers = "PLACEHOLDER"
 
     [nlp]
     lang = "en"
     pipeline = ["tok2vec", "textcat"]
-
+
     [components]
-
+
     [components.tok2vec]
     factory = "tok2vec"
-
+
     [components.textcat]
     factory = "textcat"
     """
@@ -34,7 +34,17 @@ def doc():
     # fmt: on
     nlp = English()
     words = [t.text for t in nlp.make_doc(text)]
-    doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
+    doc = get_doc(
+        nlp.vocab,
+        words=words,
+        tags=tags,
+        pos=pos,
+        morphs=morphs,
+        heads=heads,
+        deps=deps,
+        lemmas=lemmas,
+        ents=ents,
+    )
     doc.cats = cats
     return doc
 