mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Update init config and recommendations
- As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies
This commit is contained in:
parent
225f8866a1
commit
e2f2ef3a5a
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -20,6 +20,7 @@ website/logs
|
|||
npm-debug.log*
|
||||
website/www/
|
||||
website/_deploy.sh
|
||||
quickstart-training-generator.js
|
||||
|
||||
# Cython / C extensions
|
||||
cythonize.json
|
||||
|
|
|
@ -3,17 +3,17 @@ from enum import Enum
|
|||
from pathlib import Path
|
||||
from wasabi import Printer, diff_strings
|
||||
from thinc.api import Config
|
||||
from pydantic import BaseModel
|
||||
import srsly
|
||||
import re
|
||||
|
||||
from .. import util
|
||||
from ..schemas import RecommendationSchema
|
||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||
|
||||
|
||||
TEMPLATE_ROOT = Path(__file__).parent / "templates"
|
||||
TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja"
|
||||
RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json"
|
||||
ROOT = Path(__file__).parent / "templates"
|
||||
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
|
||||
RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")
|
||||
|
||||
|
||||
class Optimizations(str, Enum):
|
||||
|
@ -21,21 +21,6 @@ class Optimizations(str, Enum):
|
|||
accuracy = "accuracy"
|
||||
|
||||
|
||||
class RecommendationsTrfItem(BaseModel):
|
||||
name: str
|
||||
size_factor: int
|
||||
|
||||
|
||||
class RecommendationsTrf(BaseModel):
|
||||
efficiency: RecommendationsTrfItem
|
||||
accuracy: RecommendationsTrfItem
|
||||
|
||||
|
||||
class RecommendationSchema(BaseModel):
|
||||
word_vectors: Optional[str] = None
|
||||
transformer: Optional[RecommendationsTrf] = None
|
||||
|
||||
|
||||
@init_cli.command("config")
|
||||
def init_config_cli(
|
||||
# fmt: off
|
||||
|
@ -111,14 +96,11 @@ def init_config(
|
|||
from jinja2 import Template
|
||||
except ImportError:
|
||||
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
|
||||
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
|
||||
lang_defaults = util.get_lang_class(lang).Defaults
|
||||
has_letters = lang_defaults.writing_system.get("has_letters", True)
|
||||
# Filter out duplicates since tok2vec and transformer are added by template
|
||||
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
|
||||
reco = RecommendationSchema(**recommendations.get(lang, {})).dict()
|
||||
with TEMPLATE_PATH.open("r") as f:
|
||||
template = Template(f.read())
|
||||
# Filter out duplicates since tok2vec and transformer are added by template
|
||||
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
|
||||
reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()
|
||||
variables = {
|
||||
"lang": lang,
|
||||
"components": pipeline,
|
||||
|
@ -126,7 +108,7 @@ def init_config(
|
|||
"hardware": "cpu" if cpu else "gpu",
|
||||
"transformer_data": reco["transformer"],
|
||||
"word_vectors": reco["word_vectors"],
|
||||
"has_letters": has_letters,
|
||||
"has_letters": reco["has_letters"],
|
||||
}
|
||||
base_template = template.render(variables).strip()
|
||||
# Giving up on getting the newlines right in jinja for now
|
||||
|
|
|
@ -1,13 +0,0 @@
|
|||
{
|
||||
"en": {
|
||||
"word_vectors": "en_vectors_web_lg",
|
||||
"transformer": {
|
||||
"efficiency": { "name": "roberta-base", "size_factor": 3 },
|
||||
"accuracy": { "name": "roberta-base", "size_factor": 3 }
|
||||
}
|
||||
},
|
||||
"de": {
|
||||
"word_vectors": null,
|
||||
"transformer": null
|
||||
}
|
||||
}
|
103
spacy/cli/templates/quickstart_training_recommendations.yml
Normal file
103
spacy/cli/templates/quickstart_training_recommendations.yml
Normal file
|
@ -0,0 +1,103 @@
|
|||
# Recommended settings and available resources for each language, if available.
|
||||
# Not all languages have recommended word vectors or transformers and for some,
|
||||
# the recommended transformer for efficiency and accuracy may be the same.
|
||||
en:
|
||||
word_vectors: en_vectors_web_lg
|
||||
transformer:
|
||||
efficiency:
|
||||
name: roberta-base
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: roberta-base
|
||||
size_factor: 3
|
||||
de:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: bert-base-german-cased
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: bert-base-german-cased
|
||||
size_factor: 3
|
||||
fr:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: camembert-base
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: camembert-base
|
||||
size_factor: 3
|
||||
es:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: mrm8488/RuPERTa-base
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: mrm8488/RuPERTa-base
|
||||
size_factor: 3
|
||||
sv:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: KB/bert-base-swedish-cased
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: KB/bert-base-swedish-cased
|
||||
size_factor: 3
|
||||
fi:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: TurkuNLP/bert-base-finnish-cased-v1
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: TurkuNLP/bert-base-finnish-cased-v1
|
||||
size_factor: 3
|
||||
el:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: nlpaueb/bert-base-greek-uncased-v1
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: nlpaueb/bert-base-greek-uncased-v1
|
||||
size_factor: 3
|
||||
tr:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: dbmdz/bert-base-turkish-cased
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: dbmdz/bert-base-turkish-cased
|
||||
size_factor: 3
|
||||
zh:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: bert-base-chinese
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: bert-base-chinese
|
||||
size_factor: 3
|
||||
has_letters: false
|
||||
ar:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: asafaya/bert-base-arabic
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: asafaya/bert-base-arabic
|
||||
size_factor: 3
|
||||
pl:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: dkleczek/bert-base-polish-cased-v1
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: dkleczek/bert-base-polish-cased-v1
|
||||
size_factor: 3
|
|
@ -311,3 +311,22 @@ class ProjectConfigSchema(BaseModel):
|
|||
|
||||
class Config:
|
||||
title = "Schema for project configuration file"
|
||||
|
||||
|
||||
# Recommendations for init config workflows
|
||||
|
||||
|
||||
class RecommendationTrfItem(BaseModel):
|
||||
name: str
|
||||
size_factor: int
|
||||
|
||||
|
||||
class RecommendationTrf(BaseModel):
|
||||
efficiency: RecommendationTrfItem
|
||||
accuracy: RecommendationTrfItem
|
||||
|
||||
|
||||
class RecommendationSchema(BaseModel):
|
||||
word_vectors: Optional[str] = None
|
||||
transformer: Optional[RecommendationTrf] = None
|
||||
has_letters: bool = True
|
||||
|
|
|
@ -2,10 +2,9 @@ import pytest
|
|||
from spacy.gold import docs_to_json, biluo_tags_from_offsets
|
||||
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
|
||||
from spacy.lang.en import English
|
||||
from spacy.schemas import ProjectConfigSchema, validate
|
||||
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||
from spacy.cli.pretrain import make_docs
|
||||
from spacy.cli.init_config import init_config, RECOMMENDATIONS_PATH
|
||||
from spacy.cli.init_config import RecommendationSchema
|
||||
from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||
from spacy.util import get_lang_class
|
||||
import srsly
|
||||
|
@ -335,7 +334,5 @@ def test_init_config(lang, pipeline, optimize):
|
|||
|
||||
|
||||
def test_model_recommendations():
|
||||
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
|
||||
for lang, data in recommendations.items():
|
||||
assert get_lang_class(lang)
|
||||
for lang, data in RECOMMENDATIONS.items():
|
||||
assert RecommendationSchema(**data)
|
||||
|
|
|
@ -53,7 +53,7 @@
|
|||
"remark-react": "^5.0.1"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "npm run python:setup && gatsby build",
|
||||
"build": "npm run python:install && npm run python:setup && gatsby build",
|
||||
"dev": "npm run python:setup && gatsby develop",
|
||||
"dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
|
||||
"lint": "eslint **",
|
||||
|
|
|
@ -11,7 +11,8 @@ from os import path
|
|||
from io import StringIO
|
||||
from jinja2 import Environment, FileSystemLoader, nodes
|
||||
from pathlib import Path
|
||||
import typer
|
||||
import srsly
|
||||
import sys
|
||||
|
||||
|
||||
OPERANDS = {
|
||||
|
@ -437,7 +438,8 @@ class JinjaToJS(object):
|
|||
with self._interpolation():
|
||||
with self._python_bool_wrapper(**kwargs):
|
||||
if node.items:
|
||||
raise ValueError(f"Can't process non-empty dict in epxression: {node}")
|
||||
err = f"Can't process non-empty dict in expression: {node}"
|
||||
raise ValueError(err)
|
||||
self.output.write("{}")
|
||||
|
||||
def _process_getattr(self, node, **kwargs):
|
||||
|
@ -1232,18 +1234,22 @@ class JinjaToJS(object):
|
|||
self.output.write(")")
|
||||
|
||||
|
||||
def main(
|
||||
# fmt: off
|
||||
template_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to .jinja file"),
|
||||
output: Path = typer.Argument(None, help="Path to output module (stdout if unset)"),
|
||||
data_path: Path = typer.Option(None, "--data", help="Optional JSON file with additional data to be included as DATA")
|
||||
# fmt: on
|
||||
):
|
||||
"""Convert a jinja2 template to a JavaScript module."""
|
||||
def main(template_path, output=None, data_path=None):
|
||||
"""Convert a jinja2 template to a JavaScript module.
|
||||
|
||||
template_path (Path): Path to .jinja file.
|
||||
output (Optional[Path]): Path to output .js module (stdout if unset).
|
||||
data_path (Optional[Path]): Optional JSON or YAML file with additional data
|
||||
to be included in the JS module as the exported variable DATA.
|
||||
"""
|
||||
data = "{}"
|
||||
if data_path is not None:
|
||||
with data_path.open("r", encoding="utf8") as f:
|
||||
data = json.dumps(json.loads(f.read())) # dump and load for compactness
|
||||
if data_path.suffix in (".yml", ".yaml"):
|
||||
data = srsly.read_yaml(data_path)
|
||||
else:
|
||||
data = srsly.read_json(data_path)
|
||||
data = srsly.json_dumps(data) # dump and load for compactness
|
||||
template_path = Path(template_path)
|
||||
tpl_file = template_path.parts[-1]
|
||||
compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6")
|
||||
header = f"// This file was auto-generated by {__file__} based on {tpl_file}"
|
||||
|
@ -1258,4 +1264,10 @@ def main(
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
typer.run(main)
|
||||
args = sys.argv[1:]
|
||||
if not len(args):
|
||||
raise ValueError("Need at least one argument: path to .jinja template")
|
||||
template_path = Path(args[0])
|
||||
output = Path(args[1]) if len(args) > 1 else None
|
||||
data_path = Path(args[2]) if len(args) > 2 else None
|
||||
main(template_path, output, data_path)
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
# These are used to compile the training quickstart config
|
||||
jinja2
|
||||
typer
|
||||
srsly
|
||||
|
|
|
@ -1 +1 @@
|
|||
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js --data ../../spacy/cli/templates/quickstart_training_recommendations.json
|
||||
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js ../../spacy/cli/templates/quickstart_training_recommendations.yml
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -4,7 +4,7 @@ import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
|
|||
|
||||
import { Quickstart } from '../components/quickstart'
|
||||
import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator'
|
||||
import { isString, htmlToReact } from '../components/util'
|
||||
import { htmlToReact } from '../components/util'
|
||||
|
||||
const DEFAULT_LANG = 'en'
|
||||
const DEFAULT_HARDWARE = 'gpu'
|
||||
|
@ -47,13 +47,6 @@ const DATA = [
|
|||
},
|
||||
]
|
||||
|
||||
function stringify(value) {
|
||||
if (isString(value) && value.startsWith('${')) return value
|
||||
const string = JSON.stringify(value)
|
||||
if (Array.isArray(value)) return string.replace(/,/g, ', ')
|
||||
return string
|
||||
}
|
||||
|
||||
export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
|
||||
const [lang, setLang] = useState(DEFAULT_LANG)
|
||||
const [components, setComponents] = useState([])
|
||||
|
@ -73,6 +66,7 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
|
|||
hardware,
|
||||
transformer_data: reco.transformer,
|
||||
word_vectors: reco.word_vectors,
|
||||
has_letters: reco.has_letters,
|
||||
})
|
||||
const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n')
|
||||
const rawContent = `${COMMENT}\n${rawStr}`
|
||||
|
@ -90,7 +84,7 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
|
|||
id: code,
|
||||
title: name,
|
||||
}))
|
||||
.sort((a, b) => a.id.localeCompare(b.id))
|
||||
.sort((a, b) => a.title.localeCompare(b.title))
|
||||
return (
|
||||
<Quickstart
|
||||
download={download}
|
||||
|
|
Loading…
Reference in New Issue
Block a user