Update quickstart, template and docs

This commit is contained in:
Ines Montani 2020-08-15 14:50:29 +02:00
parent daba316930
commit a570c304df
14 changed files with 236 additions and 86 deletions

View File

@ -3,6 +3,7 @@ from enum import Enum
from pathlib import Path from pathlib import Path
from wasabi import Printer, diff_strings from wasabi import Printer, diff_strings
from thinc.api import Config from thinc.api import Config
from pydantic import BaseModel
import srsly import srsly
import re import re
@ -10,7 +11,9 @@ from .. import util
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
TEMPLATE_PATH = Path(__file__).parent / "templates" / "quickstart_training.jinja" TEMPLATE_ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja"
RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json"
class Optimizations(str, Enum): class Optimizations(str, Enum):
@ -18,6 +21,21 @@ class Optimizations(str, Enum):
accuracy = "accuracy" accuracy = "accuracy"
# Schema for a single transformer recommendation (one optimization target).
class RecommendationsTrfItem(BaseModel):
# Transformer model name; the quickstart template renders it as the "name"
# setting of [components.transformer.model].
name: str
# Integer the quickstart template renders as "accumulate_gradient" in the
# [training] section when a transformer is used.
size_factor: int
# Schema for the per-language transformer recommendations. The field names
# match the values of the "optimize" setting, which the quickstart template
# uses as the lookup key (transformer_data[optimize]).
class RecommendationsTrf(BaseModel):
# Recommendation used when optimizing for efficiency.
efficiency: RecommendationsTrfItem
# Recommendation used when optimizing for accuracy.
accuracy: RecommendationsTrfItem
# Schema for one language entry of the recommendations JSON file. Both fields
# default to None, so an empty dict (language without an entry) still validates.
class RecommendationSchema(BaseModel):
# Name of the recommended word vectors, or None if there is no recommendation.
word_vectors: Optional[str] = None
# Transformer recommendations, or None if there is no recommendation.
transformer: Optional[RecommendationsTrf] = None
@init_cli.command("config") @init_cli.command("config")
def init_config_cli( def init_config_cli(
# fmt: off # fmt: off
@ -89,41 +107,49 @@ def init_config(
from jinja2 import Template from jinja2 import Template
except ImportError: except ImportError:
msg.fail("This command requires jinja2", "pip install jinja2", exits=1) msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
lang_defaults = util.get_lang_class(lang).Defaults lang_defaults = util.get_lang_class(lang).Defaults
has_letters = lang_defaults.writing_system.get("has_letters", True) has_letters = lang_defaults.writing_system.get("has_letters", True)
has_transformer = False # TODO: check this somehow # Filter out duplicates since tok2vec and transformer are added by template
if has_transformer: pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
require_spacy_transformers(msg) reco = RecommendationSchema(**recommendations.get(lang, {})).dict()
with TEMPLATE_PATH.open("r") as f: with TEMPLATE_PATH.open("r") as f:
template = Template(f.read()) template = Template(f.read())
variables = { variables = {
"lang": lang, "lang": lang,
"pipeline": srsly.json_dumps(pipeline).replace(",", ", "),
"components": pipeline, "components": pipeline,
"optimize": optimize, "optimize": optimize,
"hardware": "cpu" if cpu else "gpu", "hardware": "cpu" if cpu else "gpu",
"has_transformer": has_transformer, "transformer_data": reco["transformer"],
"word_vectors": reco["word_vectors"],
"has_letters": has_letters, "has_letters": has_letters,
} }
base_template = template.render(**variables).strip() base_template = template.render(variables).strip()
# Giving up on getting the newlines right in jinja for now # Giving up on getting the newlines right in jinja for now
base_template = re.sub(r"\n\n\n+", "\n\n", base_template) base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
# Access variables declared in templates
template_vars = template.make_module(variables)
use_case = { use_case = {
"Language": lang, "Language": lang,
"Pipeline": ", ".join(pipeline), "Pipeline": ", ".join(pipeline),
"Optimize for": optimize, "Optimize for": optimize,
"Hardware": variables["hardware"].upper(), "Hardware": variables["hardware"].upper(),
"Transformer": template_vars.transformer.get("name", False),
} }
msg.good("Generated template specific for your use case:") msg.info("Generated template specific for your use case")
for label, value in use_case.items(): for label, value in use_case.items():
msg.text(f"- {label}: {value}") msg.text(f"- {label}: {value}")
use_transformer = bool(template_vars.use_transformer)
if use_transformer:
require_spacy_transformers(msg)
with show_validation_error(hint_fill=False): with show_validation_error(hint_fill=False):
with msg.loading("Auto-filling config..."): config = util.load_config_from_str(base_template)
config = util.load_config_from_str(base_template) try:
try: nlp, _ = util.load_model_from_config(config, auto_fill=True)
nlp, _ = util.load_model_from_config(config, auto_fill=True) except ValueError as e:
except ValueError as e: msg.fail(str(e), exits=1)
msg.fail(str(e), exits=1) if use_transformer:
nlp.config.pop("pretraining", {}) # TODO: solve this better
msg.good("Auto-filled config with all values") msg.good("Auto-filled config with all values")
save_config(nlp.config, output_file, is_stdout=is_stdout) save_config(nlp.config, output_file, is_stdout=is_stdout)

View File

@ -1,27 +1,31 @@
{# This is a template for training configs used for the quickstart widget in {# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #} can help generate the best possible configuration, given a user's requirements. #}
# This is an auto-generated config for training a model with 'spacy train' {%- set use_transformer = (transformer_data and hardware != "cpu") -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
[paths] [paths]
train = "" train = ""
dev = "" dev = ""
[system]
use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
[nlp] [nlp]
lang = "{{ lang }}" lang = "{{ lang }}"
pipeline = {{ pipeline|safe }} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"} tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
[components] [components]
{# TRANSFORMER PIPELINE #} {# TRANSFORMER PIPELINE #}
{%- if has_transformer -%} {%- if use_transformer -%}
[components.transformer] [components.transformer]
factory = "transformer" factory = "transformer"
[components.transformer.model] [components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1" @architectures = "spacy-transformers.TransformerModel.v1"
{#- name = {{ transformer_info["name"] }} #} name = "{{ transformer["name"] }}"
name = "roberta-base"
tokenizer_config = {"use_fast": true} tokenizer_config = {"use_fast": true}
[components.transformer.model.get_spans] [components.transformer.model.get_spans]
@ -38,7 +42,7 @@ factory = "tagger"
nO = null nO = null
[components.tagger.model.tok2vec] [components.tagger.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1" @architectures = "spacy-transformers.Tok2VecListener.v1"
grad_factor = 1.0 grad_factor = 1.0
[components.tagger.model.tok2vec.pooling] [components.tagger.model.tok2vec.pooling]
@ -58,7 +62,7 @@ use_upper = false
nO = null nO = null
[components.parser.model.tok2vec] [components.parser.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1" @architectures = "spacy-transformers.Tok2VecListener.v1"
grad_factor = 1.0 grad_factor = 1.0
[components.parser.model.tok2vec.pooling] [components.parser.model.tok2vec.pooling]
@ -78,7 +82,7 @@ use_upper = false
nO = null nO = null
[components.ner.model.tok2vec] [components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1" @architectures = "spacy-transformers.Tok2VecListener.v1"
grad_factor = 1.0 grad_factor = 1.0
[components.ner.model.tok2vec.pooling] [components.ner.model.tok2vec.pooling]
@ -170,12 +174,14 @@ factory = "{{ pipe }}"
{% endfor %} {% endfor %}
[training] [training]
vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" and not has_transformer else false)|safe }} {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% if has_transformer -%} {% else -%}
{#- accumulate_gradient = {{ transformer_info["size_factor"] }} #} vectors = "{{ word_vectors }}"
accumulate_gradient = 3
{% endif -%} {% endif -%}
{% if use_transformer -%}
accumulate_gradient = {{ transformer["size_factor"] }}
{% endif %}
[training.optimizer] [training.optimizer]
@optimizers = "Adam.v1" @optimizers = "Adam.v1"
@ -196,7 +202,7 @@ max_length = {{ 500 if hardware == "gpu" else 0 }}
path = ${paths:dev} path = ${paths:dev}
max_length = 0 max_length = 0
{% if has_transformer %} {% if use_transformer %}
[training.batcher] [training.batcher]
@batchers = "batch_by_padded.v1" @batchers = "batch_by_padded.v1"
discard_oversize = true discard_oversize = true

View File

@ -0,0 +1,13 @@
{
"en": {
"word_vectors": "en_vectors_web_lg",
"transformer": {
"efficiency": { "name": "roberta-base", "size_factor": 3 },
"accuracy": { "name": "roberta-base", "size_factor": 3 }
}
},
"de": {
"word_vectors": null,
"transformer": null
}
}

View File

@ -1,12 +1,14 @@
import pytest import pytest
from spacy.gold import docs_to_json, biluo_tags_from_offsets from spacy.gold import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, validate from spacy.schemas import ProjectConfigSchema, validate
from spacy.cli.pretrain import make_docs from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config from spacy.cli.init_config import init_config, RECOMMENDATIONS_PATH
from spacy.cli.init_config import RecommendationSchema
from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.util import get_lang_class
import srsly
def test_cli_converters_conllu2json(): def test_cli_converters_conllu2json():
@ -330,3 +332,10 @@ def test_parse_config_overrides_invalid(args):
def test_init_config(lang, pipeline, optimize): def test_init_config(lang, pipeline, optimize):
# TODO: add more tests and also check for GPU with transformers # TODO: add more tests and also check for GPU with transformers
init_config("-", lang=lang, pipeline=pipeline, optimize=optimize, cpu=True) init_config("-", lang=lang, pipeline=pipeline, optimize=optimize, cpu=True)
def test_model_recommendations():
    """Check every entry of the recommendations JSON file.

    Each key must be a language code that resolves to a language class, and
    each value must validate against RecommendationSchema.
    """
    entries = srsly.read_json(RECOMMENDATIONS_PATH)
    for lang_code in entries:
        assert get_lang_class(lang_code)
        assert RecommendationSchema(**entries[lang_code])

View File

@ -101,39 +101,62 @@ files and model directories.
### init config {#init-config new="3"} ### init config {#init-config new="3"}
Initialize and export a [`config.cfg` file](/usage/training#config) for training Initialize and save a [`config.cfg` file](/usage/training#config) using the
and update it with all default values, if possible. Config files used for **recommended settings** for your use case. It works just like the
training should always be complete and not contain any hidden defaults or [quickstart widget](/usage/training#quickstart), only that it also auto-fills
missing values, so this command helps you create your final config. It takes all default values and exports a [training](/usage/training#config)-ready
**one** of the following options: config. The settings you specify will impact the suggested model architectures
and pipeline setup, as well as the hyperparameters. You can also adjust and
- `--base`: Base **config** to auto-fill, e.g. created using the customize those settings in your config file later.
[training quickstart](/usage/training#quickstart) widget.
- `--lang`: Base **language** code to use for blank config.
- `--model`: Base **model** to copy config from.
> ```bash > ```bash
> ### with base config {wrap="true"} > ### Example {wrap="true"}
> $ python -m spacy init config config.cfg --base base.cfg > $ python -m spacy init config config.cfg --lang en --pipeline ner,textcat --optimize accuracy
> ```
>
> ```bash
> ### blank language {wrap="true"}
> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
> ``` > ```
```bash ```bash
$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline] $ python -m spacy init config [output_file] [--lang] [--pipeline]
[--optimize] [--cpu]
``` ```
| Argument | Type | Description | | Argument | Type | Description |
| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `output` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. | | `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. | | `--lang`, `-l` | option | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. |
| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. | | `--pipeline`, `-p` | option | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. |
| `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. | | `--optimize`, `-o` | option | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. |
| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. | | `--cpu`, `-C` | flag | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. |
| **CREATES** | config | Complete and auto-filled config file for training. | | `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | file | The config file for training. |
### init fill-config {#init-fill-config new="3"}
Auto-fill a partial [`config.cfg` file](/usage/training#config) with **all
default values**, e.g. a config generated with the
[quickstart widget](/usage/training#quickstart). Config files used for training
should always be complete and not contain any hidden defaults or missing values,
so this command helps you create your final training config. In order to find
the available settings and defaults, all functions referenced in the config will
be created, and their signatures are used to find the defaults. If your config
contains a problem that can't be resolved automatically, spaCy will show you a
validation error with more details.
> ```bash
> ### Example {wrap="true"}
> $ python -m spacy init fill-config base.cfg config.cfg
> ```
```bash
$ python -m spacy init fill-config [base_path] [output_file] [--diff]
```
| Argument | Type | Description |
| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------- |
| `base_path` | positional | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). |
| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
| `--diff`, `-D` | flag | Print a visual diff highlighting the changes. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | file | Complete and auto-filled config file for training. |
### init model {#init-model new="2"} ### init model {#init-model new="2"}

View File

@ -20,8 +20,9 @@ Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use [`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the [Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the hood. For details on how to use training configs, see the
[usage documentation](/usage/training#config). To get started with a blank [usage documentation](/usage/training#config). To get started with the
config or fill a partial config with all defaults, you can use the recommended settings for your use case, check out the
[quickstart widget](/usage/training#quickstart) or run the
[`init config`](/api/cli#init-config) command. [`init config`](/api/cli#init-config) command.
> #### What does the @ mean? > #### What does the @ mean?

View File

@ -37,27 +37,37 @@ The recommended way to train your spaCy models is via the
single [`config.cfg`](#config) **configuration file** that includes all settings single [`config.cfg`](#config) **configuration file** that includes all settings
and hyperparameters. You can optionally [overwrite](#config-overrides) and hyperparameters. You can optionally [overwrite](#config-overrides)
settings on the command line, and load in a Python file to register settings on the command line, and load in a Python file to register
[custom functions](#custom-code) and architectures. [custom functions](#custom-code) and architectures. This quickstart widget helps
you generate a starter config with the **recommended settings** for your
specific use case. It's also available in spaCy as the
[`init config`](/api/cli#init-config) command.
> #### Instructions > #### Instructions: widget
> >
> 1. Select your requirements and settings. > 1. Select your requirements and settings.
> 2. Use the buttons at the bottom to save the result to your clipboard or a > 2. Use the buttons at the bottom to save the result to your clipboard or a
> file `base_config.cfg`. > file `base_config.cfg`.
> 3. Run [`init config`](/api/cli#init-config) to create a full training config. > 3. Run [`init fill-config`](/api/cli#init-fill-config) to create a full
> config.
> 4. Run [`train`](/api/cli#train) with your config and data. > 4. Run [`train`](/api/cli#train) with your config and data.
>
> #### Instructions: CLI
>
> 1. Run the [`init config`](/api/cli#init-config) command and specify your
> requirements and settings as CLI arguments.
> 2. Run [`train`](/api/cli#train) with the exported config and data.
import QuickstartTraining from 'widgets/quickstart-training.js' import QuickstartTraining from 'widgets/quickstart-training.js'
<QuickstartTraining download="base_config.cfg" /> <QuickstartTraining download="base_config.cfg" />
After you've saved the starter config to a file `base_config.cfg`, you can use After you've saved the starter config to a file `base_config.cfg`, you can use
the [`init config`](/api/cli#init-config) command to fill in the remaining the [`init fill-config`](/api/cli#init-fill-config) command to fill in the
defaults. Training configs should always be **complete and without hidden remaining defaults. Training configs should always be **complete and without
defaults**, to keep your experiments reproducible. hidden defaults**, to keep your experiments reproducible.
```bash ```bash
$ python -m spacy init config config.cfg --base base_config.cfg $ python -m spacy init fill-config base_config.cfg config.cfg
``` ```
> #### Tip: Debug your data > #### Tip: Debug your data
@ -70,10 +80,13 @@ $ python -m spacy init config config.cfg --base base_config.cfg
> $ python -m spacy debug data config.cfg --verbose > $ python -m spacy debug data config.cfg --verbose
> ``` > ```
You can now add your data and run [`train`](/api/cli#train) with your config. Instead of exporting your starter config from the quickstart widget and
See the [`convert`](/api/cli#convert) command for details on how to convert your auto-filling it, you can also use the [`init config`](/api/cli#init-config)
data to spaCy's binary `.spacy` format. You can either include the data paths in command and specify your requirements and settings as CLI arguments. You can now
the `[paths]` section of your config, or pass them in via the command line. add your data and run [`train`](/api/cli#train) with your config. See the
[`convert`](/api/cli#convert) command for details on how to convert your data to
spaCy's binary `.spacy` format. You can either include the data paths in the
`[paths]` section of your config, or pass them in via the command line.
```bash ```bash
$ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
@ -601,7 +614,7 @@ settings in the block will be passed to the function as keyword arguments. Keep
in mind that the config shouldn't have any hidden defaults and all arguments on in mind that the config shouldn't have any hidden defaults and all arguments on
the functions need to be represented in the config. If your function defines the functions need to be represented in the config. If your function defines
**default argument values**, spaCy is able to auto-fill your config when you run **default argument values**, spaCy is able to auto-fill your config when you run
[`init config`](/api/cli#init-config). [`init fill-config`](/api/cli#init-fill-config).
```ini ```ini
### config.cfg (excerpt) ### config.cfg (excerpt)

View File

@ -163,8 +163,9 @@ resolved, the function is created and passed into the model as an argument.
Remember that the `config.cfg` used for training should contain **no missing Remember that the `config.cfg` used for training should contain **no missing
values** and requires all settings to be defined. You don't want any hidden values** and requires all settings to be defined. You don't want any hidden
defaults creeping in and changing your results! spaCy will tell you if settings defaults creeping in and changing your results! spaCy will tell you if settings
are missing, and you can run [`spacy init config`](/api/cli#init-config) with to are missing, and you can run
automatically fill in all defaults. [`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in
all defaults.
</Infobox> </Infobox>

View File

@ -152,7 +152,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. | | [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. | | [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). | | [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
| [`init config`](/api/cli#init-config) | CLI command for initializing a [training config](/usage/training) file for a blank language or auto-filling a partial config. | | [`init config`](/api/cli#init-config) | CLI command for initializing a [training config](/usage/training) file with the recommended settings. |
| [`init fill-config`](/api/cli#init-fill-config) | CLI command for auto-filling a partial config with all defaults and missing values. |
| [`debug config`](/api/cli#debug-config) | CLI command for debugging a [training config](/usage/training) file and showing validation errors. | | [`debug config`](/api/cli#debug-config) | CLI command for debugging a [training config](/usage/training) file and showing validation errors. |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |

View File

@ -1,4 +1,6 @@
# Forked from: https://github.com/jonbretman/jinja-to-js # Forked from: https://github.com/jonbretman/jinja-to-js
# With additional functionality: in/not in, replace, pprint, round, + for lists,
# rendering empty dicts
# This script is mostly used to generate the JavaScript function for the # This script is mostly used to generate the JavaScript function for the
# training quickstart widget. # training quickstart widget.
import contextlib import contextlib
@ -315,7 +317,7 @@ class JinjaToJS(object):
if callable(handler): if callable(handler):
handler(node, **kwargs) handler(node, **kwargs)
else: else:
raise Exception("Unknown node %s" % node) raise Exception(f"Unknown node {node} ({node_name})")
def _process_extends(self, node, **kwargs): def _process_extends(self, node, **kwargs):
""" """
@ -431,6 +433,13 @@ class JinjaToJS(object):
self.output.write(node.name) self.output.write(node.name)
def _process_dict(self, node, **kwargs):
    """
    Processes a `Dict` node. Only the empty literal is supported, e.g. {{ {} }}

    Writes `{}` to the output inside an interpolation.

    Raises:
        ValueError: if the dict literal contains any items.
    """
    # Validate first so nothing is emitted for a node we can't translate.
    if node.items:
        raise ValueError(f"Can't process non-empty dict in expression: {node}")
    with self._interpolation():
        with self._python_bool_wrapper(**kwargs):
            self.output.write("{}")
def _process_getattr(self, node, **kwargs): def _process_getattr(self, node, **kwargs):
""" """
Processes a `GetAttr` node. e.g. {{ foo.bar }} Processes a `GetAttr` node. e.g. {{ foo.bar }}
@ -697,6 +706,27 @@ class JinjaToJS(object):
self._process_node(node.node, **new_kwargs) self._process_node(node.node, **new_kwargs)
self.output.write(")") self.output.write(")")
def _process_filter_replace(self, node, **kwargs):
    """
    Processes the `replace` filter. e.g. {{ foo|replace("a", "b") }}

    The pattern arrives from Python/Jinja as a quoted string. Replacing all
    occurrences in JS would typically need a regex, which would be annoying
    to convert, so the output uses `.split(old).join(new)` instead.
    """
    with self._interpolation(), self._python_bool_wrapper(**kwargs) as inner_kwargs:
        self._process_node(node.node, **inner_kwargs)
        # First filter argument goes into split(), second into join().
        for js_call, arg_index in ((".split(", 0), (").join(", 1)):
            self.output.write(js_call)
            self._process_node(node.args[arg_index], **inner_kwargs)
        self.output.write(")")
def _process_filter_pprint(self, node, **kwargs):
    """
    Processes the `pprint` filter. e.g. {{ foo|pprint }}

    Wraps the filtered expression in a JS `JSON.stringify(...)` call.
    """
    with self._interpolation(), self._python_bool_wrapper(**kwargs) as inner_kwargs:
        self.output.write("JSON.stringify(")
        self._process_node(node.node, **inner_kwargs)
        self.output.write(")")
def _process_filter_attr(self, node, **kwargs): def _process_filter_attr(self, node, **kwargs):
with self._interpolation(): with self._interpolation():
with self._python_bool_wrapper(**kwargs) as new_kwargs: with self._python_bool_wrapper(**kwargs) as new_kwargs:
@ -746,7 +776,10 @@ class JinjaToJS(object):
with self._python_bool_wrapper(**kwargs) as new_kwargs: with self._python_bool_wrapper(**kwargs) as new_kwargs:
self.output.write("Math.round((") self.output.write("Math.round((")
self._process_node(node.node, **new_kwargs) self._process_node(node.node, **new_kwargs)
self.output.write("+ Number.EPSILON) * 100) / 100") self.output.write("+ Number.EPSILON) * 10**")
self._process_node(node.args[0], **new_kwargs)
self.output.write(") / 10**")
self._process_node(node.args[0], **new_kwargs)
def _process_filter_last(self, node, **kwargs): def _process_filter_last(self, node, **kwargs):
with self._interpolation(): with self._interpolation():
@ -1029,7 +1062,18 @@ class JinjaToJS(object):
self.output.write(")") self.output.write(")")
def _process_add(self, node, **kwargs): def _process_add(self, node, **kwargs):
self._process_math(node, math_operator=" + ", **kwargs) # Handle + operator for lists, which behaves differently in JS. Currently
# only works if we have an explicit list node on either side (in which
# case we assume both are lists).
if isinstance(node.left, nodes.List) or isinstance(node.right, nodes.List):
with self._interpolation():
with self._python_bool_wrapper(**kwargs) as new_kwargs:
self._process_node(node.left, **new_kwargs)
self.output.write(".concat(")
self._process_node(node.right, **new_kwargs)
self.output.write(")")
else:
self._process_math(node, math_operator=" + ", **kwargs)
def _process_sub(self, node, **kwargs): def _process_sub(self, node, **kwargs):
self._process_math(node, math_operator=" - ", **kwargs) self._process_math(node, math_operator=" - ", **kwargs)
@ -1192,16 +1236,22 @@ def main(
# fmt: off # fmt: off
template_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to .jinja file"), template_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to .jinja file"),
output: Path = typer.Argument(None, help="Path to output module (stdout if unset)"), output: Path = typer.Argument(None, help="Path to output module (stdout if unset)"),
data_path: Path = typer.Option(None, "--data", help="Optional JSON file with additional data to be included as DATA")
# fmt: on # fmt: on
): ):
"""Convert a jinja2 template to a JavaScript module.""" """Convert a jinja2 template to a JavaScript module."""
data = "{}"
if data_path is not None:
with data_path.open("r", encoding="utf8") as f:
data = json.dumps(json.loads(f.read())) # dump and load for compactness
tpl_file = template_path.parts[-1] tpl_file = template_path.parts[-1]
compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6") compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6")
header = f"// This file was auto-generated by {__file__} based on {tpl_file}" header = f"// This file was auto-generated by {__file__} based on {tpl_file}"
data_str = f"export const DATA = {data}"
result = compiler.get_output() result = compiler.get_output()
if output is not None: if output is not None:
with output.open("w") as f: with output.open("w") as f:
f.write(f"{header}\n{result}") f.write(f"{header}\n{result}\n{data_str}")
print(f"Updated {output.parts[-1]}") print(f"Updated {output.parts[-1]}")
else: else:
print(result) print(result)

View File

@ -1 +1 @@
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js --data ../../spacy/cli/templates/quickstart_training_recommendations.json

View File

@ -125,9 +125,9 @@
display: block display: block
.small .small
font-size: var(--font-size-sm) font-size: var(--font-size-code)
line-height: 1.65 line-height: 1.65
white-space: pre white-space: pre-wrap
max-height: 400px max-height: 400px
overflow-y: auto overflow-y: auto

File diff suppressed because one or more lines are too long

View File

@ -2,14 +2,17 @@ import React, { useState } from 'react'
import { StaticQuery, graphql } from 'gatsby' import { StaticQuery, graphql } from 'gatsby'
import highlightCode from 'gatsby-remark-prismjs/highlight-code.js' import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
import { Quickstart, QS } from '../components/quickstart' import { Quickstart } from '../components/quickstart'
import generator from './quickstart-training-generator' import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator'
import { isString, htmlToReact } from '../components/util' import { isString, htmlToReact } from '../components/util'
const DEFAULT_LANG = 'en' const DEFAULT_LANG = 'en'
const DEFAULT_HARDWARE = 'gpu' const DEFAULT_HARDWARE = 'gpu'
const DEFAULT_OPT = 'efficiency' const DEFAULT_OPT = 'efficiency'
const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat'] const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg`
const DATA = [ const DATA = [
{ {
@ -61,14 +64,17 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
hardware: setHardware, hardware: setHardware,
optimize: setOptimize, optimize: setOptimize,
} }
const reco = GENERATOR_DATA[lang] || {}
const content = generator({ const content = generator({
lang, lang,
pipeline: stringify(components),
components, components,
optimize, optimize,
hardware, hardware,
transformer_data: reco.transformer,
word_vectors: reco.word_vectors,
}) })
const rawContent = content.trim().replace(/\n\n\n+/g, '\n\n') const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n')
const rawContent = `${COMMENT}\n${rawStr}`
const displayContent = highlightCode('ini', rawContent) const displayContent = highlightCode('ini', rawContent)
.split('\n') .split('\n')
.map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line)) .map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line))