Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 01:04:34 +03:00)

Commit 1114219ae3 (parent 9d32cac736): Tidy up and auto-format
@@ -6,7 +6,6 @@ from wasabi import msg
 import srsly
 import hashlib
 import typer
-import subprocess
 from click import NoSuchOption
 from typer.main import get_command
 from contextlib import contextmanager
@@ -327,7 +326,7 @@ def git_checkout(
         )
     with make_tempdir() as tmp_dir:
         cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
-        ret = run_command(cmd, capture=True)
+        run_command(cmd, capture=True)
         # We need Path(name) to make sure we also support subdirectories
         shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))

@@ -156,11 +156,7 @@ class Language:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(
-                self.lang,
-                self.Defaults,
-                vectors_name=vectors_name,
-            )
+            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -1462,7 +1458,7 @@ class Language:
         # here :(
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i+1:]:
+                for name2, proc2 in self.pipeline[i + 1 :]:
                     if isinstance(getattr(proc2, "model", None), Model):
                         proc1.find_listeners(proc2.model)

@@ -164,7 +164,9 @@ def MultiHashEmbed(


 @registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
+def CharacterEmbed(
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is
@@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                 ),
                 StaticVectors(width, dropout=0.0),
             ),
-            with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
         )
     else:
         model = chain(
             concatenate(
@@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
             ),
-            with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
         )
     return model


@@ -1,4 +1,4 @@
-from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
@@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
 def test_spans_lca_matrix(en_tokenizer):
     """Test span's lca matrix generation"""
     tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=[2, 1, 1, 0],
+        deps=["dep"] * 4,
+    )
     lca = doc[:2].get_lca_matrix()
     assert lca.shape == (2, 2)
     assert lca[0, 0] == 0  # the & the -> the
@@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):

 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=["dep"] * len(heads),
+    )

     lefts = {}
     rights = {}
@@ -345,10 +345,7 @@ def test_language_factories_invalid():
             [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
             {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        (
-            [{"a": 0.5, "b": 0.5}, {"b": 1.0}],
-            {"a": 0.25, "b": 0.75},
-        ),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
     ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
@@ -363,16 +360,10 @@ def test_language_factories_scores():
     weights1 = {"a1": 0.5, "a2": 0.5}
     weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
     Language.factory(
-        f"{name}1",
-        scores=list(weights1),
-        default_score_weights=weights1,
-        func=func,
+        f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
     )
     Language.factory(
-        f"{name}2",
-        scores=list(weights2),
-        default_score_weights=weights2,
-        func=func,
+        f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
     )
     meta1 = Language.get_factory_meta(f"{name}1")
     assert meta1.default_score_weights == weights1
@@ -212,9 +212,17 @@ def test_issue1834():
         heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
         deps=["dep"] * len(words),
     )
-    print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
+    print(
+        doc.has_annotation("DEP"),
+        [t.head.i for t in doc],
+        [t.is_sent_start for t in doc],
+    )
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
+    print(
+        new_doc.has_annotation("DEP"),
+        [t.head.i for t in new_doc],
+        [t.is_sent_start for t in new_doc],
+    )
     assert new_doc[6].sent_start
     assert new_doc.has_annotation("DEP")
     assert new_doc.has_annotation("TAG")
@@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
-    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None)
+    textcat = TextCategorizer(
+        en_vocab,
+        model,
+        labels=["ENTITY", "ACTION", "MODIFIER"],
+        threshold=0.5,
+        positive_label=None,
+    )
     textcat.to_bytes(exclude=["vocab"])


@@ -3,7 +3,6 @@ from click import NoSuchOption

 from spacy.training import docs_to_json, biluo_tags_from_offsets
 from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
-from spacy.lang.en import English
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
@@ -291,8 +291,7 @@ def test_spacy_blank():


 @pytest.mark.parametrize(
-    "value",
-    [False, None, ["x", "y"], Language, Vocab],
+    "value", [False, None, ["x", "y"], Language, Vocab],
 )
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"
@@ -95,7 +95,7 @@ def test_util_dot_section():
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
     assert en_config["nlp"]["pipeline"] == ["textcat"]
     assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
@@ -1,7 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config

 from spacy import Language
 from spacy.util import load_model_from_config, registry, dot_to_object
 from spacy.training import Example
@@ -34,7 +34,17 @@ def doc():
     # fmt: on
     nlp = English()
     words = [t.text for t in nlp.make_doc(text)]
-    doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
+    doc = get_doc(
+        nlp.vocab,
+        words=words,
+        tags=tags,
+        pos=pos,
+        morphs=morphs,
+        heads=heads,
+        deps=deps,
+        lemmas=lemmas,
+        ents=ents,
+    )
     doc.cats = cats
     return doc
