Update init config and recommendations

- As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations
- Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date.
- Simplify jinja_to_js script and use fewer dependencies
This commit is contained in:
Ines Montani 2020-08-19 13:33:15 +02:00
parent 225f8866a1
commit e2f2ef3a5a
12 changed files with 166 additions and 71 deletions

1
.gitignore vendored
View File

@ -20,6 +20,7 @@ website/logs
npm-debug.log*
website/www/
website/_deploy.sh
quickstart-training-generator.js
# Cython / C extensions
cythonize.json

View File

@ -3,17 +3,17 @@ from enum import Enum
from pathlib import Path
from wasabi import Printer, diff_strings
from thinc.api import Config
from pydantic import BaseModel
import srsly
import re
from .. import util
from ..schemas import RecommendationSchema
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
TEMPLATE_ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja"
RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json"
ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")
class Optimizations(str, Enum):
@ -21,21 +21,6 @@ class Optimizations(str, Enum):
accuracy = "accuracy"
# One transformer recommendation: a model name plus an integer size factor,
# matching the "name" / "size_factor" keys of the recommendations data.
class RecommendationsTrfItem(BaseModel):
# Transformer model name, e.g. "roberta-base"
name: str
# Integer stored next to the model name in the recommendations data
# NOTE(review): how the template uses this factor is not visible here - confirm
size_factor: int
# Pair of transformer recommendations for one language, keyed by the
# optimization target. Both entries are required fields.
class RecommendationsTrf(BaseModel):
# Recommendation used when optimizing for efficiency
efficiency: RecommendationsTrfItem
# Recommendation used when optimizing for accuracy
accuracy: RecommendationsTrfItem
# Per-language recommendation entry. Both fields default to None, so the
# schema can be built from an empty dict when a language has no entry.
class RecommendationSchema(BaseModel):
# Name of the recommended word vectors package, or None if there is none
word_vectors: Optional[str] = None
# Recommended transformers (efficiency / accuracy), or None if there are none
transformer: Optional[RecommendationsTrf] = None
@init_cli.command("config")
def init_config_cli(
# fmt: off
@ -111,14 +96,11 @@ def init_config(
from jinja2 import Template
except ImportError:
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
lang_defaults = util.get_lang_class(lang).Defaults
has_letters = lang_defaults.writing_system.get("has_letters", True)
# Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
reco = RecommendationSchema(**recommendations.get(lang, {})).dict()
with TEMPLATE_PATH.open("r") as f:
template = Template(f.read())
# Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()
variables = {
"lang": lang,
"components": pipeline,
@ -126,7 +108,7 @@ def init_config(
"hardware": "cpu" if cpu else "gpu",
"transformer_data": reco["transformer"],
"word_vectors": reco["word_vectors"],
"has_letters": has_letters,
"has_letters": reco["has_letters"],
}
base_template = template.render(variables).strip()
# Giving up on getting the newlines right in jinja for now

View File

@ -1,13 +0,0 @@
{
"en": {
"word_vectors": "en_vectors_web_lg",
"transformer": {
"efficiency": { "name": "roberta-base", "size_factor": 3 },
"accuracy": { "name": "roberta-base", "size_factor": 3 }
}
},
"de": {
"word_vectors": null,
"transformer": null
}
}

View File

@ -0,0 +1,103 @@
# Recommended settings and available resources for each language, if available.
# Not all languages have recommended word vectors or transformers and for some,
# the recommended transformer for efficiency and accuracy may be the same.
en:
word_vectors: en_vectors_web_lg
transformer:
efficiency:
name: roberta-base
size_factor: 3
accuracy:
name: roberta-base
size_factor: 3
de:
word_vectors: null
transformer:
efficiency:
name: bert-base-german-cased
size_factor: 3
accuracy:
name: bert-base-german-cased
size_factor: 3
fr:
word_vectors: null
transformer:
efficiency:
name: camembert-base
size_factor: 3
accuracy:
name: camembert-base
size_factor: 3
es:
word_vectors: null
transformer:
efficiency:
name: mrm8488/RuPERTa-base
size_factor: 3
accuracy:
name: mrm8488/RuPERTa-base
size_factor: 3
sv:
word_vectors: null
transformer:
efficiency:
name: KB/bert-base-swedish-cased
size_factor: 3
accuracy:
name: KB/bert-base-swedish-cased
size_factor: 3
fi:
word_vectors: null
transformer:
efficiency:
name: TurkuNLP/bert-base-finnish-cased-v1
size_factor: 3
accuracy:
name: TurkuNLP/bert-base-finnish-cased-v1
size_factor: 3
el:
word_vectors: null
transformer:
efficiency:
name: nlpaueb/bert-base-greek-uncased-v1
size_factor: 3
accuracy:
name: nlpaueb/bert-base-greek-uncased-v1
size_factor: 3
tr:
word_vectors: null
transformer:
efficiency:
name: dbmdz/bert-base-turkish-cased
size_factor: 3
accuracy:
name: dbmdz/bert-base-turkish-cased
size_factor: 3
zh:
word_vectors: null
transformer:
efficiency:
name: bert-base-chinese
size_factor: 3
accuracy:
name: bert-base-chinese
size_factor: 3
has_letters: false
ar:
word_vectors: null
transformer:
efficiency:
name: asafaya/bert-base-arabic
size_factor: 3
accuracy:
name: asafaya/bert-base-arabic
size_factor: 3
pl:
word_vectors: null
transformer:
efficiency:
name: dkleczek/bert-base-polish-cased-v1
size_factor: 3
accuracy:
name: dkleczek/bert-base-polish-cased-v1
size_factor: 3

View File

@ -311,3 +311,22 @@ class ProjectConfigSchema(BaseModel):
class Config:
title = "Schema for project configuration file"
# Recommendations for init config workflows
# One transformer recommendation: a model name plus an integer size factor,
# matching the "name" / "size_factor" keys of each entry in the
# recommendations file.
class RecommendationTrfItem(BaseModel):
# Transformer model name, e.g. "roberta-base" or "bert-base-german-cased"
name: str
# Integer stored next to the model name in the recommendations file
# NOTE(review): how the template uses this factor is not visible here - confirm
size_factor: int
# Pair of transformer recommendations for one language, keyed by the
# optimization target. Both entries are required fields.
class RecommendationTrf(BaseModel):
# Recommendation used when optimizing for efficiency
efficiency: RecommendationTrfItem
# Recommendation used when optimizing for accuracy
accuracy: RecommendationTrfItem
# Per-language recommendation entry for the init config workflow. Every field
# has a default, so the schema can be built from an empty dict when a language
# has no entry in the recommendations file.
class RecommendationSchema(BaseModel):
# Name of the recommended word vectors package, or None if there is none
word_vectors: Optional[str] = None
# Recommended transformers (efficiency / accuracy), or None if there are none
transformer: Optional[RecommendationTrf] = None
# Value passed to the template as "has_letters"; True unless the language's
# entry overrides it (e.g. "zh" sets it to false)
has_letters: bool = True

View File

@ -2,10 +2,9 @@ import pytest
from spacy.gold import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, validate
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS_PATH
from spacy.cli.init_config import RecommendationSchema
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.util import get_lang_class
import srsly
@ -335,7 +334,5 @@ def test_init_config(lang, pipeline, optimize):
def test_model_recommendations():
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
for lang, data in recommendations.items():
assert get_lang_class(lang)
for lang, data in RECOMMENDATIONS.items():
assert RecommendationSchema(**data)

View File

@ -53,7 +53,7 @@
"remark-react": "^5.0.1"
},
"scripts": {
"build": "npm run python:setup && gatsby build",
"build": "npm run python:install && npm run python:setup && gatsby build",
"dev": "npm run python:setup && gatsby develop",
"dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
"lint": "eslint **",

View File

@ -11,7 +11,8 @@ from os import path
from io import StringIO
from jinja2 import Environment, FileSystemLoader, nodes
from pathlib import Path
import typer
import srsly
import sys
OPERANDS = {
@ -437,7 +438,8 @@ class JinjaToJS(object):
with self._interpolation():
with self._python_bool_wrapper(**kwargs):
if node.items:
raise ValueError(f"Can't process non-empty dict in epxression: {node}")
err = f"Can't process non-empty dict in expression: {node}"
raise ValueError(err)
self.output.write("{}")
def _process_getattr(self, node, **kwargs):
@ -1232,18 +1234,22 @@ class JinjaToJS(object):
self.output.write(")")
def main(
# fmt: off
template_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to .jinja file"),
output: Path = typer.Argument(None, help="Path to output module (stdout if unset)"),
data_path: Path = typer.Option(None, "--data", help="Optional JSON file with additional data to be included as DATA")
# fmt: on
):
"""Convert a jinja2 template to a JavaScript module."""
def main(template_path, output=None, data_path=None):
"""Convert a jinja2 template to a JavaScript module.
template_path (Path): Path to .jinja file.
output (Optional[Path]): Path to output .js module (stdout if unset).
data_path (Optional[Path]): Optional JSON or YAML file with additional data
to be included in the JS module as the exported variable DATA.
"""
data = "{}"
if data_path is not None:
with data_path.open("r", encoding="utf8") as f:
data = json.dumps(json.loads(f.read())) # dump and load for compactness
if data_path.suffix in (".yml", ".yaml"):
data = srsly.read_yaml(data_path)
else:
data = srsly.read_json(data_path)
data = srsly.json_dumps(data) # dump and load for compactness
template_path = Path(template_path)
tpl_file = template_path.parts[-1]
compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6")
header = f"// This file was auto-generated by {__file__} based on {tpl_file}"
@ -1258,4 +1264,10 @@ def main(
if __name__ == "__main__":
typer.run(main)
args = sys.argv[1:]
if not len(args):
raise ValueError("Need at least one argument: path to .jinja template")
template_path = Path(args[0])
output = Path(args[1]) if len(args) > 1 else None
data_path = Path(args[2]) if len(args) > 2 else None
main(template_path, output, data_path)

View File

@ -1,3 +1,3 @@
# These are used to compile the training quickstart config
jinja2
typer
srsly

View File

@ -1 +1 @@
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js --data ../../spacy/cli/templates/quickstart_training_recommendations.json
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js ../../spacy/cli/templates/quickstart_training_recommendations.yml

File diff suppressed because one or more lines are too long

View File

@ -4,7 +4,7 @@ import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
import { Quickstart } from '../components/quickstart'
import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator'
import { isString, htmlToReact } from '../components/util'
import { htmlToReact } from '../components/util'
const DEFAULT_LANG = 'en'
const DEFAULT_HARDWARE = 'gpu'
@ -47,13 +47,6 @@ const DATA = [
},
]
/**
 * Serialize a value for output.
 *
 * Strings that start with "${" are returned unchanged. Everything else is
 * passed through JSON.stringify; for arrays, every comma in the serialized
 * string is followed by a space.
 */
function stringify(value) {
    const isPassthrough = isString(value) && value.startsWith('${')
    if (isPassthrough) {
        return value
    }
    const serialized = JSON.stringify(value)
    return Array.isArray(value) ? serialized.replace(/,/g, ', ') : serialized
}
export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
const [lang, setLang] = useState(DEFAULT_LANG)
const [components, setComponents] = useState([])
@ -73,6 +66,7 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
hardware,
transformer_data: reco.transformer,
word_vectors: reco.word_vectors,
has_letters: reco.has_letters,
})
const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n')
const rawContent = `${COMMENT}\n${rawStr}`
@ -90,7 +84,7 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
id: code,
title: name,
}))
.sort((a, b) => a.id.localeCompare(b.id))
.sort((a, b) => a.title.localeCompare(b.title))
return (
<Quickstart
download={download}