Update init config and recommendations

- As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations
- Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date.
- Simplify jinja_to_js script and use fewer dependencies
This commit is contained in:
Ines Montani 2020-08-19 13:33:15 +02:00
parent 225f8866a1
commit e2f2ef3a5a
12 changed files with 166 additions and 71 deletions

1
.gitignore vendored
View File

@ -20,6 +20,7 @@ website/logs
npm-debug.log* npm-debug.log*
website/www/ website/www/
website/_deploy.sh website/_deploy.sh
quickstart-training-generator.js
# Cython / C extensions # Cython / C extensions
cythonize.json cythonize.json

View File

@ -3,17 +3,17 @@ from enum import Enum
from pathlib import Path from pathlib import Path
from wasabi import Printer, diff_strings from wasabi import Printer, diff_strings
from thinc.api import Config from thinc.api import Config
from pydantic import BaseModel
import srsly import srsly
import re import re
from .. import util from .. import util
from ..schemas import RecommendationSchema
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
TEMPLATE_ROOT = Path(__file__).parent / "templates" ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja" TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json" RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")
class Optimizations(str, Enum): class Optimizations(str, Enum):
@ -21,21 +21,6 @@ class Optimizations(str, Enum):
accuracy = "accuracy" accuracy = "accuracy"
class RecommendationsTrfItem(BaseModel):
name: str
size_factor: int
class RecommendationsTrf(BaseModel):
efficiency: RecommendationsTrfItem
accuracy: RecommendationsTrfItem
class RecommendationSchema(BaseModel):
word_vectors: Optional[str] = None
transformer: Optional[RecommendationsTrf] = None
@init_cli.command("config") @init_cli.command("config")
def init_config_cli( def init_config_cli(
# fmt: off # fmt: off
@ -111,14 +96,11 @@ def init_config(
from jinja2 import Template from jinja2 import Template
except ImportError: except ImportError:
msg.fail("This command requires jinja2", "pip install jinja2", exits=1) msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
lang_defaults = util.get_lang_class(lang).Defaults
has_letters = lang_defaults.writing_system.get("has_letters", True)
# Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
reco = RecommendationSchema(**recommendations.get(lang, {})).dict()
with TEMPLATE_PATH.open("r") as f: with TEMPLATE_PATH.open("r") as f:
template = Template(f.read()) template = Template(f.read())
# Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()
variables = { variables = {
"lang": lang, "lang": lang,
"components": pipeline, "components": pipeline,
@ -126,7 +108,7 @@ def init_config(
"hardware": "cpu" if cpu else "gpu", "hardware": "cpu" if cpu else "gpu",
"transformer_data": reco["transformer"], "transformer_data": reco["transformer"],
"word_vectors": reco["word_vectors"], "word_vectors": reco["word_vectors"],
"has_letters": has_letters, "has_letters": reco["has_letters"],
} }
base_template = template.render(variables).strip() base_template = template.render(variables).strip()
# Giving up on getting the newlines right in jinja for now # Giving up on getting the newlines right in jinja for now

View File

@ -1,13 +0,0 @@
{
"en": {
"word_vectors": "en_vectors_web_lg",
"transformer": {
"efficiency": { "name": "roberta-base", "size_factor": 3 },
"accuracy": { "name": "roberta-base", "size_factor": 3 }
}
},
"de": {
"word_vectors": null,
"transformer": null
}
}

View File

@ -0,0 +1,103 @@
# Recommended settings and available resources for each language, if available.
# Not all languages have recommended word vectors or transformers and for some,
# the recommended transformer for efficiency and accuracy may be the same.
en:
word_vectors: en_vectors_web_lg
transformer:
efficiency:
name: roberta-base
size_factor: 3
accuracy:
name: roberta-base
size_factor: 3
de:
word_vectors: null
transformer:
efficiency:
name: bert-base-german-cased
size_factor: 3
accuracy:
name: bert-base-german-cased
size_factor: 3
fr:
word_vectors: null
transformer:
efficiency:
name: camembert-base
size_factor: 3
accuracy:
name: camembert-base
size_factor: 3
es:
word_vectors: null
transformer:
efficiency:
name: mrm8488/RuPERTa-base
size_factor: 3
accuracy:
name: mrm8488/RuPERTa-base
size_factor: 3
sv:
word_vectors: null
transformer:
efficiency:
name: KB/bert-base-swedish-cased
size_factor: 3
accuracy:
name: KB/bert-base-swedish-cased
size_factor: 3
fi:
word_vectors: null
transformer:
efficiency:
name: TurkuNLP/bert-base-finnish-cased-v1
size_factor: 3
accuracy:
name: TurkuNLP/bert-base-finnish-cased-v1
size_factor: 3
el:
word_vectors: null
transformer:
efficiency:
name: nlpaueb/bert-base-greek-uncased-v1
size_factor: 3
accuracy:
name: nlpaueb/bert-base-greek-uncased-v1
size_factor: 3
tr:
word_vectors: null
transformer:
efficiency:
name: dbmdz/bert-base-turkish-cased
size_factor: 3
accuracy:
name: dbmdz/bert-base-turkish-cased
size_factor: 3
zh:
word_vectors: null
transformer:
efficiency:
name: bert-base-chinese
size_factor: 3
accuracy:
name: bert-base-chinese
size_factor: 3
has_letters: false
ar:
word_vectors: null
transformer:
efficiency:
name: asafaya/bert-base-arabic
size_factor: 3
accuracy:
name: asafaya/bert-base-arabic
size_factor: 3
pl:
word_vectors: null
transformer:
efficiency:
name: dkleczek/bert-base-polish-cased-v1
size_factor: 3
accuracy:
name: dkleczek/bert-base-polish-cased-v1
size_factor: 3

View File

@ -311,3 +311,22 @@ class ProjectConfigSchema(BaseModel):
class Config: class Config:
title = "Schema for project configuration file" title = "Schema for project configuration file"
# Recommendations for init config workflows
class RecommendationTrfItem(BaseModel):
name: str
size_factor: int
class RecommendationTrf(BaseModel):
efficiency: RecommendationTrfItem
accuracy: RecommendationTrfItem
class RecommendationSchema(BaseModel):
word_vectors: Optional[str] = None
transformer: Optional[RecommendationTrf] = None
has_letters: bool = True

View File

@ -2,10 +2,9 @@ import pytest
from spacy.gold import docs_to_json, biluo_tags_from_offsets from spacy.gold import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, validate from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.pretrain import make_docs from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS_PATH from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli.init_config import RecommendationSchema
from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.util import get_lang_class from spacy.util import get_lang_class
import srsly import srsly
@ -335,7 +334,5 @@ def test_init_config(lang, pipeline, optimize):
def test_model_recommendations(): def test_model_recommendations():
recommendations = srsly.read_json(RECOMMENDATIONS_PATH) for lang, data in RECOMMENDATIONS.items():
for lang, data in recommendations.items():
assert get_lang_class(lang)
assert RecommendationSchema(**data) assert RecommendationSchema(**data)

View File

@ -53,7 +53,7 @@
"remark-react": "^5.0.1" "remark-react": "^5.0.1"
}, },
"scripts": { "scripts": {
"build": "npm run python:setup && gatsby build", "build": "npm run python:install && npm run python:setup && gatsby build",
"dev": "npm run python:setup && gatsby develop", "dev": "npm run python:setup && gatsby develop",
"dev:nightly": "BRANCH=nightly.spacy.io npm run dev", "dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
"lint": "eslint **", "lint": "eslint **",

View File

@ -11,7 +11,8 @@ from os import path
from io import StringIO from io import StringIO
from jinja2 import Environment, FileSystemLoader, nodes from jinja2 import Environment, FileSystemLoader, nodes
from pathlib import Path from pathlib import Path
import typer import srsly
import sys
OPERANDS = { OPERANDS = {
@ -437,7 +438,8 @@ class JinjaToJS(object):
with self._interpolation(): with self._interpolation():
with self._python_bool_wrapper(**kwargs): with self._python_bool_wrapper(**kwargs):
if node.items: if node.items:
raise ValueError(f"Can't process non-empty dict in epxression: {node}") err = f"Can't process non-empty dict in expression: {node}"
raise ValueError(err)
self.output.write("{}") self.output.write("{}")
def _process_getattr(self, node, **kwargs): def _process_getattr(self, node, **kwargs):
@ -1232,18 +1234,22 @@ class JinjaToJS(object):
self.output.write(")") self.output.write(")")
def main( def main(template_path, output=None, data_path=None):
# fmt: off """Convert a jinja2 template to a JavaScript module.
template_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to .jinja file"),
output: Path = typer.Argument(None, help="Path to output module (stdout if unset)"), template_path (Path): Path to .jinja file.
data_path: Path = typer.Option(None, "--data", help="Optional JSON file with additional data to be included as DATA") output (Optional[Path]): Path to output .js module (stdout if unset).
# fmt: on data_path (Optional[Path]): Optional JSON or YAML file with additional data
): to be included in the JS module as the exported variable DATA.
"""Convert a jinja2 template to a JavaScript module.""" """
data = "{}" data = "{}"
if data_path is not None: if data_path is not None:
with data_path.open("r", encoding="utf8") as f: if data_path.suffix in (".yml", ".yaml"):
data = json.dumps(json.loads(f.read())) # dump and load for compactness data = srsly.read_yaml(data_path)
else:
data = srsly.read_json(data_path)
data = srsly.json_dumps(data) # dump and load for compactness
template_path = Path(template_path)
tpl_file = template_path.parts[-1] tpl_file = template_path.parts[-1]
compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6") compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6")
header = f"// This file was auto-generated by {__file__} based on {tpl_file}" header = f"// This file was auto-generated by {__file__} based on {tpl_file}"
@ -1258,4 +1264,10 @@ def main(
if __name__ == "__main__": if __name__ == "__main__":
typer.run(main) args = sys.argv[1:]
if not len(args):
raise ValueError("Need at least one argument: path to .jinja template")
template_path = Path(args[0])
output = Path(args[1]) if len(args) > 1 else None
data_path = Path(args[2]) if len(args) > 2 else None
main(template_path, output, data_path)

View File

@ -1,3 +1,3 @@
# These are used to compile the training quickstart config # These are used to compile the training quickstart config
jinja2 jinja2
typer srsly

View File

@ -1 +1 @@
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js --data ../../spacy/cli/templates/quickstart_training_recommendations.json python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js ../../spacy/cli/templates/quickstart_training_recommendations.yml

File diff suppressed because one or more lines are too long

View File

@ -4,7 +4,7 @@ import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
import { Quickstart } from '../components/quickstart' import { Quickstart } from '../components/quickstart'
import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator' import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator'
import { isString, htmlToReact } from '../components/util' import { htmlToReact } from '../components/util'
const DEFAULT_LANG = 'en' const DEFAULT_LANG = 'en'
const DEFAULT_HARDWARE = 'gpu' const DEFAULT_HARDWARE = 'gpu'
@ -47,13 +47,6 @@ const DATA = [
}, },
] ]
function stringify(value) {
if (isString(value) && value.startsWith('${')) return value
const string = JSON.stringify(value)
if (Array.isArray(value)) return string.replace(/,/g, ', ')
return string
}
export default function QuickstartTraining({ id, title, download = 'config.cfg' }) { export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
const [lang, setLang] = useState(DEFAULT_LANG) const [lang, setLang] = useState(DEFAULT_LANG)
const [components, setComponents] = useState([]) const [components, setComponents] = useState([])
@ -73,6 +66,7 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
hardware, hardware,
transformer_data: reco.transformer, transformer_data: reco.transformer,
word_vectors: reco.word_vectors, word_vectors: reco.word_vectors,
has_letters: reco.has_letters,
}) })
const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n') const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n')
const rawContent = `${COMMENT}\n${rawStr}` const rawContent = `${COMMENT}\n${rawStr}`
@ -90,7 +84,7 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
id: code, id: code,
title: name, title: name,
})) }))
.sort((a, b) => a.id.localeCompare(b.id)) .sort((a, b) => a.title.localeCompare(b.title))
return ( return (
<Quickstart <Quickstart
download={download} download={download}