Mirror of https://github.com/explosion/spaCy.git
Commit 441345a087: Merge branch 'develop' into feature/component-requirements
Training example script: the textcat example now reads its config with thinc's Config and loads the pipeline through the auto-filling model loader.

@@ -20,6 +20,7 @@ import spacy
 from spacy import util
 from spacy.util import minibatch, compounding
 from spacy.gold import Example
+from thinc.api import Config


 @plac.annotations(

@@ -42,8 +43,8 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
         output_dir.mkdir()

     print(f"Loading nlp model from {config_path}")
-    nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
-    nlp = util.load_model_from_config(nlp_config)
+    nlp_config = Config().from_disk(config_path)
+    nlp, _ = util.load_model_from_config(nlp_config, auto_fill=True)

     # ensure the nlp object was defined with a textcat component
     if "textcat" not in nlp.pipe_names:
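The change above replaces the old two-step util.load_config(...)["nlp"] pattern with a thinc Config object plus an auto-filled load. A minimal sketch of the new pattern, assuming the develop-branch API at this commit; the "textcat.cfg" path is a hypothetical placeholder:

from spacy import util
from thinc.api import Config

# Read the raw, unresolved config from disk; Config acts like a nested dict
nlp_config = Config().from_disk("textcat.cfg")  # hypothetical path
# auto_fill=True completes missing settings from spaCy's defaults; the call
# now returns both the nlp object and the fully resolved config
nlp, resolved = util.load_model_from_config(nlp_config, auto_fill=True)
print(nlp.pipe_names)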
Example textcat config: component blocks move from nested [nlp.pipeline.*] sections to top-level [components.*] sections, and the example model switches from TextCatCNN with a tok2vec sublayer to the simpler TextCatBOW.

@@ -1,19 +1,14 @@
 [nlp]
 lang = "en"
 pipeline = ["textcat"]

-[nlp.pipeline.textcat]
+[components]
+
+[components.textcat]
 factory = "textcat"

-[nlp.pipeline.textcat.model]
-@architectures = "spacy.TextCatCNN.v1"
-exclusive_classes = false
-
-[nlp.pipeline.textcat.model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 3
-subword_features = true
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
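Under the new layout, a component's settings resolve as a dot path from the top level of the config. A minimal sketch using the dot_to_object helper added later in this diff; the filename is again a hypothetical placeholder:

from spacy.util import dot_to_object
from thinc.api import Config

config = Config().from_disk("textcat.cfg")  # hypothetical path
# On an unresolved config this returns the raw section as a dict-like block
model_cfg = dot_to_object(config, "components.textcat.model")
print(model_cfg["@architectures"])  # "spacy.TextCatBOW.v1"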
debug model CLI: the hand-rolled section lookup is replaced by the new dot_to_object utility.

@@ -8,6 +8,7 @@ import typer
 from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 from .. import util
 from ..lang.en import English
+from ..util import dot_to_object


 @debug_cli.command("model")

@@ -60,16 +61,7 @@ def debug_model_cli(
     msg.info(f"Fixing random seed: {seed}")
     fix_random_seed(seed)

-    component = config
-    parts = section.split(".")
-    for item in parts:
-        try:
-            component = component[item]
-        except KeyError:
-            msg.fail(
-                f"The section '{section}' is not a valid section in the provided config.",
-                exits=1,
-            )
+    component = dot_to_object(config, section)
     if hasattr(component, "model"):
         model = component.model
     else:
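Besides centralizing the lookup, dot_to_object also converts a TypeError (indexing into a scalar leaf value) into the same E952 KeyError, a case the old inline loop only handled for missing keys. A small sketch of that edge case, assuming this commit's spacy.util:

from thinc.api import Config
from spacy.util import dot_to_object

config = Config().from_str("""
[nlp]
lang = "en"
""")
try:
    # "nlp.lang" is the string "en", so the final step indexes into a
    # scalar and raises TypeError, which surfaces as the E952 KeyError
    dot_to_object(config, "nlp.lang.nonexistent")
except KeyError as err:
    print(err)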
Errors (class Errors): the merge renumbers the duplicate E952 — the component-requirements error becomes E951, freeing E952 for the new invalid-section message used by dot_to_object — and E993 is extended to cover an explicit None lang.

@@ -483,7 +483,8 @@ class Errors:
    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

    # TODO: fix numbering after merging develop into master
-   E952 = ("Invalid requirement specified by component '{name}': {req}")
+   E951 = ("Invalid requirement specified by component '{name}': {req}")
+   E952 = ("The section '{name}' is not a valid section in the provided config.")
    E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
    E954 = ("The Tok2Vec listener did not receive a valid input.")
    E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.")

@@ -593,7 +594,7 @@ class Errors:
            "for the `nlp` pipeline with components {names}.")
    E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
            "the code of the language to initialize it with (for example "
-           "'en' for English).\n\n{config}")
+           "'en' for English) - this can't be 'None'.\n\n{config}")
    E996 = ("Could not parse {file}: {msg}")
    E997 = ("Tokenizer special cases are not allowed to modify the text. "
            "This would map '{chunk}' to '{orth}' given token attributes "
class Language: the meta getter now aggregates each component's package requirements into meta["requirements"] — this is the feature branch's side of the merge.

@@ -213,6 +213,9 @@ class Language:
         self._meta["labels"] = self.pipe_labels
+        reqs = {p: self.get_pipe_meta(p).package_requirements for p in self.pipe_names}
+        self._meta["requirements"].extend(util.merge_pipe_requirements(reqs))
         # TODO: Adding this back to prevent breaking people's code etc., but
         # we should consider removing it
         self._meta["pipeline"] = self.pipe_names
         return self._meta

     @meta.setter
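merge_pipe_requirements itself is not shown in this diff; it belongs to the feature/component-requirements branch being merged into. A hypothetical sketch of the kind of order-preserving, deduplicating merge the name suggests — the body below is an assumption, not the actual implementation:

from typing import Dict, List

def merge_pipe_requirements(reqs: Dict[str, List[str]]) -> List[str]:
    # Hypothetical: flatten each component's requirement list into one
    # list, dropping exact duplicates while keeping first-seen order
    merged: List[str] = []
    for name, component_reqs in reqs.items():
        for req in component_reqs:
            if req not in merged:
                merged.append(req)
    return merged

print(merge_pipe_requirements({"textcat": ["thinc>=8.0.0"], "ner": ["thinc>=8.0.0"]}))
# ['thinc>=8.0.0']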
Tests: a new test_util_dot_section covers auto-filled config loading and the dot_to_object helper.

@@ -2,7 +2,13 @@ import pytest

 from .util import get_random_doc

-from spacy.util import minibatch_by_words
+from spacy import util
+from spacy.util import minibatch_by_words, dot_to_object
+from thinc.api import Config, Optimizer
+
+from ..lang.en import English
+from ..lang.nl import Dutch
+from ..language import DEFAULT_CONFIG_PATH


 @pytest.mark.parametrize(

@@ -56,3 +62,49 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches):
         minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
     )
     assert [len(batch) for batch in batches] == expected_batches
+
+
+def test_util_dot_section():
+    cfg_string = """
+    [nlp]
+    lang = "en"
+    pipeline = ["textcat"]
+    load_vocab_data = false
+
+    [components]
+
+    [components.textcat]
+    factory = "textcat"
+
+    [components.textcat.model]
+    @architectures = "spacy.TextCatBOW.v1"
+    exclusive_classes = true
+    ngram_size = 1
+    no_output_layer = false
+    """
+    nlp_config = Config().from_str(cfg_string)
+    en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True)
+
+    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
+    default_config["nlp"]["lang"] = "nl"
+    nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True)
+
+    # Test that creation went OK
+    assert isinstance(en_nlp, English)
+    assert isinstance(nl_nlp, Dutch)
+    assert nl_nlp.pipe_names == []
+    assert en_nlp.pipe_names == ["textcat"]
+    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False  # not exclusive_classes
+
+    # Test that default values got overwritten
+    assert not en_config["nlp"]["load_vocab_data"]
+    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+
+    # Test proper functioning of 'dot_to_object'
+    with pytest.raises(KeyError):
+        obj = dot_to_object(en_config, "nlp.pipeline.tagger")
+    with pytest.raises(KeyError):
+        obj = dot_to_object(en_config, "nlp.unknownattribute")
+    assert not dot_to_object(en_config, "nlp.load_vocab_data")
+    assert dot_to_object(nl_config, "nlp.load_vocab_data")
+    assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
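Note that "nlp.pipeline.tagger" is expected to raise KeyError: under the schema migrated in the config diff above, components live in top-level [components] blocks, so an [nlp.pipeline] path no longer exists in the config at all.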
spacy/util.py: load_model_from_config now also rejects an explicit None lang.

@@ -259,7 +259,7 @@ def load_model_from_config(
     if "nlp" not in config:
        raise ValueError(Errors.E985.format(config=config))
     nlp_config = config["nlp"]
-    if "lang" not in nlp_config:
+    if "lang" not in nlp_config or nlp_config["lang"] is None:
        raise ValueError(Errors.E993.format(config=nlp_config))
     # This will automatically handle all codes registered via the languages
     # registry, including custom subclasses provided via entry points
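A minimal sketch of the newly covered case, assuming thinc's Config parses null as None:

from spacy import util
from thinc.api import Config

config = Config().from_str("""
[nlp]
lang = null
pipeline = []
""")
try:
    # With the extended check this fails fast with E993 instead of
    # failing later during language-code lookup
    util.load_model_from_config(config)
except ValueError as err:
    print(err)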
spacy/util.py: the dot_to_object helper itself, added next to dict_to_dot.

@@ -1136,6 +1136,25 @@ def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
     return {".".join(key): value for key, value in walk_dict(obj)}


+def dot_to_object(config: Config, section: str):
+    """Convert dot notation of a "section" to a specific part of the Config.
+    e.g. "training.optimizer" would return the Optimizer object.
+    Throws an error if the section is not defined in this config.
+
+    config (Config): The config.
+    section (str): The dot notation of the section in the config.
+    RETURNS: The object denoted by the section
+    """
+    component = config
+    parts = section.split(".")
+    for item in parts:
+        try:
+            component = component[item]
+        except (KeyError, TypeError):
+            raise KeyError(Errors.E952.format(name=section))
+    return component
+
+
 def walk_dict(
     node: Dict[str, Any], parent: List[str] = []
 ) -> Iterator[Tuple[List[str], Any]]:
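A usage sketch of the new helper on an unresolved config, where each step of the dot path is a plain dict lookup (assuming this commit's spacy.util):

from thinc.api import Config
from spacy.util import dot_to_object

config = Config().from_str("""
[training]

[training.optimizer]
learn_rate = 0.001
""")
# Resolves the dot path through the nested sections
print(dot_to_object(config, "training.optimizer.learn_rate"))  # 0.001
try:
    dot_to_object(config, "training.missing")
except KeyError as err:
    print(err)  # E952: not a valid section in the provided config

On a resolved config — as in the test above — the same path returns the constructed object instead, e.g. a thinc Optimizer for "training.optimizer".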