diff --git a/spacy/cli/init_labels.py b/spacy/cli/init_labels.py index 29cb23072..e675901a3 100644 --- a/spacy/cli/init_labels.py +++ b/spacy/cli/init_labels.py @@ -34,7 +34,7 @@ def init_labels_cli( with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) with show_validation_error(hint_fill=False): - nlp = init_nlp(config, use_gpu=use_gpu, silent=False) + nlp = init_nlp(config, use_gpu=use_gpu) for name, component in nlp.pipeline: if getattr(component, "label_data", None) is not None: srsly.write_json(output_path / f"{name}.json", component.label_data) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b0bd48ddb..7bbfe9315 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -56,7 +56,7 @@ def train_cli( def init_pipeline( config: Config, output_path: Optional[Path], *, use_gpu: int = -1 ) -> Language: - init_kwargs = {"use_gpu": use_gpu, "silent": False} + init_kwargs = {"use_gpu": use_gpu} if output_path is not None: init_path = output_path / "model-initial" if not init_path.exists(): @@ -74,12 +74,6 @@ def init_pipeline( else: msg.good(f"Loaded initialized pipeline from {init_path}") return nlp - msg.warn( - "Not saving initialized model: no output directory specified. " - "To speed up training, spaCy can save the initialized nlp object with " - "the vocabulary, vectors and label scheme. To take advantage of this, " - "provide an output directory." - ) return init_nlp(config, **init_kwargs) diff --git a/spacy/language.py b/spacy/language.py index 7450db720..ee73faed3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1181,24 +1181,9 @@ class Language: ) doc = Doc(self.vocab, words=["x", "y", "z"]) get_examples = lambda: [Example.from_dict(doc, {})] - # Populate vocab if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="Language", obj=type(get_examples)) raise ValueError(err) - valid_examples = False - for example in get_examples(): - if not isinstance(example, Example): - err = Errors.E978.format( - name="Language.initialize", types=type(example) - ) - raise ValueError(err) - else: - valid_examples = True - for word in [t.text for t in example.reference]: - _ = self.vocab[word] # noqa: F841 - if not valid_examples: - err = Errors.E930.format(name="Language", obj="empty list") - raise ValueError(err) # Make sure the config is interpolated so we can resolve subsections config = self.config.interpolate() # These are the settings provided in the [initialize] block in the config diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 481430a2c..49d0bea35 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -35,10 +35,7 @@ cdef class Pipe: @property def labels(self) -> Optional[Tuple[str]]: - if "labels" in self.cfg: - return tuple(self.cfg["labels"]) - else: - return None + return [] @property def label_data(self): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 253b6f08c..f4e8ecebd 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -266,7 +266,7 @@ class Tagger(Pipe): raise ValueError("nan value when computing loss") return float(loss), d_scores - def initialize(self, get_examples, *, nlp=None): + def initialize(self, get_examples, *, nlp=None, labels=None): """Initialize the pipe for training, using a representative set of data examples. @@ -277,15 +277,19 @@ class Tagger(Pipe): DOCS: https://nightly.spacy.io/api/tagger#initialize """ self._ensure_examples(get_examples) + if labels is not None: + for tag in labels: + self.add_label(tag) + else: + tags = set() + for example in get_examples(): + for token in example.y: + if token.tag_: + tags.add(token.tag_) + for tag in sorted(tags): + self.add_label(tag) doc_sample = [] label_sample = [] - tags = set() - for example in get_examples(): - for token in example.y: - if token.tag_: - tags.add(token.tag_) - for tag in sorted(tags): - self.add_label(tag) for example in islice(get_examples(), 10): doc_sample.append(example.x) gold_tags = example.get_aligned("TAG", as_string=True) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 63b040333..d6dafa3f5 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -160,16 +160,12 @@ class TextCategorizer(Pipe): self.cfg["labels"] = tuple(value) @property - def label_data(self) -> Dict: - """RETURNS (Dict): Information about the component's labels. + def label_data(self) -> List[str]: + """RETURNS (List[str]): Information about the component's labels. DOCS: https://nightly.spacy.io/api/textcategorizer#labels """ - return { - "labels": self.labels, - "positive": self.cfg["positive_label"], - "threshold": self.cfg["threshold"] - } + return self.labels def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under @@ -354,6 +350,7 @@ class TextCategorizer(Pipe): get_examples: Callable[[], Iterable[Example]], *, nlp: Optional[Language] = None, + labels: Optional[Dict] = None ): """Initialize the pipe for training, using a representative set of data examples. @@ -365,12 +362,14 @@ class TextCategorizer(Pipe): DOCS: https://nightly.spacy.io/api/textcategorizer#initialize """ self._ensure_examples(get_examples) - subbatch = [] # Select a subbatch of examples to initialize the model - for example in islice(get_examples(), 10): - if len(subbatch) < 2: - subbatch.append(example) - for cat in example.y.cats: - self.add_label(cat) + if labels is None: + for example in get_examples(): + for cat in example.y.cats: + self.add_label(cat) + else: + for label in labels: + self.add_label(label) + subbatch = list(islice(get_examples(), 10)) doc_sample = [eg.reference for eg in subbatch] label_sample, _ = self._examples_to_truth(subbatch) self._require_labels() diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 9f165cb15..11e0e5af8 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -409,17 +409,20 @@ cdef class Parser(Pipe): def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) - def initialize(self, get_examples, nlp=None): + def initialize(self, get_examples, *, nlp=None, labels=None): self._ensure_examples(get_examples) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: langs = ", ".join(util.LEXEME_NORM_LANGS) util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs)) - actions = self.moves.get_actions( - examples=get_examples(), - min_freq=self.cfg['min_action_freq'], - learn_tokens=self.cfg["learn_tokens"] - ) + if labels is not None: + actions = dict(labels) + else: + actions = self.moves.get_actions( + examples=get_examples(), + min_freq=self.cfg['min_action_freq'], + learn_tokens=self.cfg["learn_tokens"] + ) for action, labels in self.moves.labels.items(): actions.setdefault(action, {}) for label, freq in labels.items(): diff --git a/spacy/util.py b/spacy/util.py index 67c577927..948c4ab11 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -97,6 +97,9 @@ class registry(thinc.registry): models = catalogue.create("spacy", "models", entry_points=True) cli = catalogue.create("spacy", "cli", entry_points=True) +# We want json loading in the registry, so manually register srsly.read_json. +registry.readers("srsly.read_json.v0", srsly.read_json) + class SimpleFrozenDict(dict): """Simplified implementation of a frozen dict, mainly used as default