Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare

2025-10-18 09:44:16 +03:00 · 2020-09-29 16:59:35 +02:00 · 2020-09-29 16:59:35 +02:00 · 1c60f0b5e9
commit 1c60f0b5e9
parent d7469283c5 8ce9f44433
8 changed files with 39 additions and 54 deletions
--- a/spacy/cli/init_labels.py
+++ b/spacy/cli/init_labels.py
@ -34,7 +34,7 @@ def init_labels_cli(
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
+        nlp = init_nlp(config, use_gpu=use_gpu)
    for name, component in nlp.pipeline:
        if getattr(component, "label_data", None) is not None:
            srsly.write_json(output_path / f"{name}.json", component.label_data)
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -56,7 +56,7 @@ def train_cli(
 def init_pipeline(
    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
 ) -> Language:
-    init_kwargs = {"use_gpu": use_gpu, "silent": False}
+    init_kwargs = {"use_gpu": use_gpu}
    if output_path is not None:
        init_path = output_path / "model-initial"
        if not init_path.exists():
@ -74,12 +74,6 @@ def init_pipeline(
            else:
                msg.good(f"Loaded initialized pipeline from {init_path}")
        return nlp
-    msg.warn(
-        "Not saving initialized model: no output directory specified. "
-        "To speed up training, spaCy can save the initialized nlp object with "
-        "the vocabulary, vectors and label scheme. To take advantage of this, "
-        "provide an output directory."
-    )
    return init_nlp(config, **init_kwargs)


--- a/spacy/language.py
+++ b/spacy/language.py
@ -1181,24 +1181,9 @@ class Language:
            )
            doc = Doc(self.vocab, words=["x", "y", "z"])
            get_examples = lambda: [Example.from_dict(doc, {})]
-        # Populate vocab
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="Language", obj=type(get_examples))
            raise ValueError(err)
-        valid_examples = False
-        for example in get_examples():
-            if not isinstance(example, Example):
-                err = Errors.E978.format(
-                    name="Language.initialize", types=type(example)
-                )
-                raise ValueError(err)
-            else:
-                valid_examples = True
-            for word in [t.text for t in example.reference]:
-                _ = self.vocab[word]  # noqa: F841
-        if not valid_examples:
-            err = Errors.E930.format(name="Language", obj="empty list")
-            raise ValueError(err)
        # Make sure the config is interpolated so we can resolve subsections
        config = self.config.interpolate()
        # These are the settings provided in the [initialize] block in the config
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@ -35,10 +35,7 @@ cdef class Pipe:

    @property
    def labels(self) -> Optional[Tuple[str]]:
-        if "labels" in self.cfg:
-            return tuple(self.cfg["labels"])
-        else:
-            return None
+        return []
    
    @property
    def label_data(self):
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -266,7 +266,7 @@ class Tagger(Pipe):
            raise ValueError("nan value when computing loss")
        return float(loss), d_scores

-    def initialize(self, get_examples, *, nlp=None):
+    def initialize(self, get_examples, *, nlp=None, labels=None):
        """Initialize the pipe for training, using a representative set
        of data examples.

@ -277,15 +277,19 @@ class Tagger(Pipe):
        DOCS: https://nightly.spacy.io/api/tagger#initialize
        """
        self._ensure_examples(get_examples)
+        if labels is not None:
+            for tag in labels:
+                self.add_label(tag)
+        else:
+            tags = set()
+            for example in get_examples():
+                for token in example.y:
+                    if token.tag_:
+                        tags.add(token.tag_)
+            for tag in sorted(tags):
+                self.add_label(tag)
        doc_sample = []
        label_sample = []
-        tags = set()
-        for example in get_examples():
-            for token in example.y:
-                if token.tag_:
-                    tags.add(token.tag_)
-        for tag in sorted(tags):
-            self.add_label(tag)
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            gold_tags = example.get_aligned("TAG", as_string=True)
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -160,16 +160,12 @@ class TextCategorizer(Pipe):
        self.cfg["labels"] = tuple(value)

    @property
-    def label_data(self) -> Dict:
-        """RETURNS (Dict): Information about the component's labels.
+    def label_data(self) -> List[str]:
+        """RETURNS (List[str]): Information about the component's labels.

        DOCS: https://nightly.spacy.io/api/textcategorizer#labels
        """
-        return {
-            "labels": self.labels,
-            "positive": self.cfg["positive_label"],
-            "threshold": self.cfg["threshold"]
-        }
+        return self.labels

    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
        """Apply the pipe to a stream of documents. This usually happens under
@ -354,6 +350,7 @@ class TextCategorizer(Pipe):
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
+        labels: Optional[Dict] = None
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.
@ -365,12 +362,14 @@ class TextCategorizer(Pipe):
        DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
        """
        self._ensure_examples(get_examples)
-        subbatch = []  # Select a subbatch of examples to initialize the model
-        for example in islice(get_examples(), 10):
-            if len(subbatch) < 2:
-                subbatch.append(example)
-            for cat in example.y.cats:
-                self.add_label(cat)
+        if labels is None:
+            for example in get_examples():
+                for cat in example.y.cats:
+                    self.add_label(cat)
+        else:
+            for label in labels:
+                self.add_label(label)
+        subbatch = list(islice(get_examples(), 10))
        doc_sample = [eg.reference for eg in subbatch]
        label_sample, _ = self._examples_to_truth(subbatch)
        self._require_labels()
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@ -409,17 +409,20 @@ cdef class Parser(Pipe):
    def set_output(self, nO):
        self.model.attrs["resize_output"](self.model, nO)

-    def initialize(self, get_examples, nlp=None):
+    def initialize(self, get_examples, *, nlp=None, labels=None):
        self._ensure_examples(get_examples)
        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
            langs = ", ".join(util.LEXEME_NORM_LANGS)
            util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
-        actions = self.moves.get_actions(
-            examples=get_examples(),
-            min_freq=self.cfg['min_action_freq'],
-            learn_tokens=self.cfg["learn_tokens"]
-        )
+        if labels is not None:
+            actions = dict(labels)
+        else:
+            actions = self.moves.get_actions(
+                examples=get_examples(),
+                min_freq=self.cfg['min_action_freq'],
+                learn_tokens=self.cfg["learn_tokens"]
+            )
        for action, labels in self.moves.labels.items():
            actions.setdefault(action, {})
            for label, freq in labels.items():
--- a/spacy/util.py
+++ b/spacy/util.py
@ -97,6 +97,9 @@ class registry(thinc.registry):
    models = catalogue.create("spacy", "models", entry_points=True)
    cli = catalogue.create("spacy", "cli", entry_points=True)

+# We want json loading in the registry, so manually register srsly.read_json.
+registry.readers("srsly.read_json.v0", srsly.read_json)
+

 class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default