Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare

This commit is contained in:
Ines Montani 2020-09-29 16:59:35 +02:00
commit 1c60f0b5e9
8 changed files with 39 additions and 54 deletions

View File

@@ -34,7 +34,7 @@ def init_labels_cli(
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
nlp = init_nlp(config, use_gpu=use_gpu)
for name, component in nlp.pipeline:
if getattr(component, "label_data", None) is not None:
srsly.write_json(output_path / f"{name}.json", component.label_data)

View File

@@ -56,7 +56,7 @@ def train_cli(
def init_pipeline(
config: Config, output_path: Optional[Path], *, use_gpu: int = -1
) -> Language:
init_kwargs = {"use_gpu": use_gpu, "silent": False}
init_kwargs = {"use_gpu": use_gpu}
if output_path is not None:
init_path = output_path / "model-initial"
if not init_path.exists():
@@ -74,12 +74,6 @@ def init_pipeline(
else:
msg.good(f"Loaded initialized pipeline from {init_path}")
return nlp
msg.warn(
"Not saving initialized model: no output directory specified. "
"To speed up training, spaCy can save the initialized nlp object with "
"the vocabulary, vectors and label scheme. To take advantage of this, "
"provide an output directory."
)
return init_nlp(config, **init_kwargs)

View File

@@ -1181,24 +1181,9 @@ class Language:
)
doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})]
# Populate vocab
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="Language", obj=type(get_examples))
raise ValueError(err)
valid_examples = False
for example in get_examples():
if not isinstance(example, Example):
err = Errors.E978.format(
name="Language.initialize", types=type(example)
)
raise ValueError(err)
else:
valid_examples = True
for word in [t.text for t in example.reference]:
_ = self.vocab[word] # noqa: F841
if not valid_examples:
err = Errors.E930.format(name="Language", obj="empty list")
raise ValueError(err)
# Make sure the config is interpolated so we can resolve subsections
config = self.config.interpolate()
# These are the settings provided in the [initialize] block in the config

View File

@@ -35,10 +35,7 @@ cdef class Pipe:
@property
def labels(self) -> Optional[Tuple[str]]:
if "labels" in self.cfg:
return tuple(self.cfg["labels"])
else:
return None
return []
@property
def label_data(self):

View File

@@ -266,7 +266,7 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def initialize(self, get_examples, *, nlp=None):
def initialize(self, get_examples, *, nlp=None, labels=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@@ -277,8 +277,10 @@ class Tagger(Pipe):
DOCS: https://nightly.spacy.io/api/tagger#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []
label_sample = []
if labels is not None:
for tag in labels:
self.add_label(tag)
else:
tags = set()
for example in get_examples():
for token in example.y:
@@ -286,6 +288,8 @@ class Tagger(Pipe):
tags.add(token.tag_)
for tag in sorted(tags):
self.add_label(tag)
doc_sample = []
label_sample = []
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
gold_tags = example.get_aligned("TAG", as_string=True)

View File

@@ -160,16 +160,12 @@ class TextCategorizer(Pipe):
self.cfg["labels"] = tuple(value)
@property
def label_data(self) -> Dict:
"""RETURNS (Dict): Information about the component's labels.
def label_data(self) -> List[str]:
"""RETURNS (List[str]): Information about the component's labels.
DOCS: https://nightly.spacy.io/api/textcategorizer#labels
"""
return {
"labels": self.labels,
"positive": self.cfg["positive_label"],
"threshold": self.cfg["threshold"]
}
return self.labels
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
@@ -354,6 +350,7 @@ class TextCategorizer(Pipe):
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
labels: Optional[Dict] = None
):
"""Initialize the pipe for training, using a representative set
of data examples.
@@ -365,12 +362,14 @@ class TextCategorizer(Pipe):
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
"""
self._ensure_examples(get_examples)
subbatch = [] # Select a subbatch of examples to initialize the model
for example in islice(get_examples(), 10):
if len(subbatch) < 2:
subbatch.append(example)
if labels is None:
for example in get_examples():
for cat in example.y.cats:
self.add_label(cat)
else:
for label in labels:
self.add_label(label)
subbatch = list(islice(get_examples(), 10))
doc_sample = [eg.reference for eg in subbatch]
label_sample, _ = self._examples_to_truth(subbatch)
self._require_labels()

View File

@@ -409,12 +409,15 @@ cdef class Parser(Pipe):
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
def initialize(self, get_examples, nlp=None):
def initialize(self, get_examples, *, nlp=None, labels=None):
self._ensure_examples(get_examples)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
langs = ", ".join(util.LEXEME_NORM_LANGS)
util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
if labels is not None:
actions = dict(labels)
else:
actions = self.moves.get_actions(
examples=get_examples(),
min_freq=self.cfg['min_action_freq'],

View File

@@ -97,6 +97,9 @@ class registry(thinc.registry):
models = catalogue.create("spacy", "models", entry_points=True)
cli = catalogue.create("spacy", "cli", entry_points=True)
# We want json loading in the registry, so manually register srsly.read_json.
registry.readers("srsly.read_json.v0", srsly.read_json)
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default