From 20b0ec5dcf5b97a3c406ec6bd7aa3f32223c63fa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 10:37:12 +0200 Subject: [PATCH 1/9] avoid logging performance of frozen components --- spacy/cli/train.py | 6 ++++-- spacy/training/loggers.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index bf3749c9e..811a3ba86 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -152,7 +152,8 @@ def train( exclude=frozen_components, ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") - print_row, finalize_logger = train_logger(nlp) + with nlp.select_pipes(disable=[*frozen_components]): + print_row, finalize_logger = train_logger(nlp) try: progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) @@ -163,7 +164,8 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - update_meta(T_cfg, nlp, info) + with nlp.select_pipes(disable=[*frozen_components]): + update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 92b598033..dddf20169 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -11,9 +11,11 @@ def console_logger(): def setup_printer( nlp: "Language", ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: + # we assume here that only components are enabled that should be trained & logged + logged_pipes = nlp.pipe_names score_cols = list(nlp.config["training"]["score_weights"]) score_widths = [max(len(col), 6) for col in score_cols] - loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] + loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] loss_widths = [max(len(col), 8) for col in loss_cols] table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] table_header = [col.upper() for col in table_header] @@ -26,7 +28,7 @@ def console_logger(): try: losses = [ "{0:.2f}".format(float(info["losses"][pipe_name])) - for pipe_name in nlp.pipe_names + for pipe_name in logged_pipes ] except KeyError as e: raise KeyError( From 6435458d517e1ca689d2bcf6f996df59218957bf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 12:12:38 +0200 Subject: [PATCH 2/9] simplify expression --- spacy/cli/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 811a3ba86..2900ef379 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -152,7 +152,7 @@ def train( exclude=frozen_components, ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") - with nlp.select_pipes(disable=[*frozen_components]): + with nlp.select_pipes(disable=frozen_components): print_row, finalize_logger = train_logger(nlp) try: @@ -164,7 +164,7 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - with nlp.select_pipes(disable=[*frozen_components]): + with nlp.select_pipes(disable=frozen_components): update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): nlp.to_disk(output_path / "model-best") From 02b69dd0d532fb4c8835868332268e2f6eead511 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 12:56:54 +0200 Subject: [PATCH 3/9] Update models directory [ci skip] --- website/src/templates/models.js | 108 +++++++++++++------------------- 1 file changed, 44 insertions(+), 64 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 5061972b8..5d705048b 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -12,7 +12,6 @@ import Tag from '../components/tag' import { H2, Label } from '../components/typography' import Icon from '../components/icon' import Link from '../components/link' -import Grid from '../components/grid' import Infobox from '../components/infobox' import Accordion from '../components/accordion' import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' @@ -31,10 +30,16 @@ const MODEL_META = { wiki: 'Wikipedia', uas: 'Unlabelled dependencies', las: 'Labelled dependencies', + token_acc: 'Tokenization', + tok: 'Tokenization', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', - ents_f: 'Entities (F-score)', - ents_p: 'Entities (precision)', - ents_r: 'Entities (recall)', + tag: 'Part-of-speech tags (fine grained tags, Token.tag)', + ents_f: 'Named entities (F-score)', + ents_p: 'Named entities (precision)', + ents_r: 'Named entities (recall)', + sent_f: 'Sentence segmentation (F-score)', + sent_p: 'Sentence segmentation (precision)', + sent_r: 'Sentence segmentation (recall)', cpu: 'words per second on CPU', gpu: 'words per second on GPU', pipeline: 'Active processing pipeline components in order', @@ -83,25 +88,19 @@ function formatVectors(data) { } function formatAccuracy(data) { - if (!data) return null - const labels = { - las: 'LAS', - uas: 'UAS', - tags_acc: 'TAG', - ents_f: 'NER F', - ents_p: 'NER P', - ents_r: 'NER R', - } - const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) - const isNer = key => key.startsWith('ents_') + if (!data) return [] return Object.keys(data) - .filter(key => labels[key]) - .map(key => ({ - label: labels[key], - value: data[key].toFixed(2), - help: MODEL_META[key], - type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, - })) + .map(label => { + const value = data[label] + return isNaN(value) + ? null + : { + label, + value: value.toFixed(2), + help: MODEL_META[label], + } + }) + .filter(item => item) } function formatModelMeta(data) { @@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl { label: 'Author', content: author }, { label: 'License', content: license }, ] - const accuracy = [ - { - label: 'Syntax Accuracy', - items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null, - }, - { - label: 'NER Accuracy', - items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null, - }, - ] const error = ( @@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl

) - return (

- - {accuracy && - accuracy.map(({ label, items }, i) => - !items ? null : ( - - - - - - - - {items.map((item, i) => ( - - - - - ))} - -
{label}
- - {item.value}
- ) - )} -
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)} {hasInteractiveCode && ( @@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl `import spacy`, `from spacy.lang.${langId}.examples import sentences `, ``, - `nlp = spacy.load('${name}')`, + `nlp = spacy.load("${name}")`, `doc = nlp(sentences[0])`, `print(doc.text)`, `for token in doc:`, @@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl ].join('\n')} )} + {meta.accuracy && ( + + + + {meta.accuracy.map(({ label, value, help }) => ( + + + + + + ))} + +
+ {label.toUpperCase()} + {help} + {value} +
+
+ )} {labels && (

@@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl const labelNames = labels[pipe] || [] const help = LABEL_SCHEME_META[pipe] return ( - +

- + ) } From 7745d77a38a131f6ffec9b4ae43da8ef799c228e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 13:21:42 +0200 Subject: [PATCH 5/9] Fix whitespace in template [ci skip] --- spacy/cli/templates/quickstart_training.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 7241c5116..53fd99ee8 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -280,7 +280,7 @@ vectors = "{{ word_vectors }}" {% endif -%} {% if use_transformer -%} accumulate_gradient = {{ transformer["size_factor"] }} -{% endif %} +{% endif -%} dev_corpus = "corpora.dev" train_corpus = "corpora.train" From e4e7f5b00d46b0a6f75e419c509fbd0c73927121 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 15:44:40 +0200 Subject: [PATCH 6/9] Update docs [ci skip] --- website/docs/usage/_benchmarks-models.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 33163f306..028746db0 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison. | System | POS | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | ---: | -| spaCy RoBERTa (2020) | | | | +| spaCy RoBERTa (2020) | 97.8 | 96.6 | 94.7 | | spaCy CNN (2020) | | | | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | @@ -37,7 +37,8 @@ on training Stanza on this corpus to allow direct comparison. **Accuracy on the Penn Treebank.** See [NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more -results. +results. For spaCy's evaluation, see the +[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank). From 76bbed3466519d384834715f48f240140c43e02e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 16:00:03 +0200 Subject: [PATCH 7/9] Use Literal type for nr_feature_tokens --- requirements.txt | 1 + setup.cfg | 1 + spacy/compat.py | 5 +++++ spacy/ml/models/parser.py | 3 ++- spacy/tests/serialize/test_serialize_config.py | 14 ++++++++++++-- 5 files changed, 21 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4d6c1dfd0..a8b237aa1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ pytokenizations setuptools packaging importlib_metadata>=0.20; python_version < "3.8" +typing_extensions>=3.7.4; python_version < "3.8" # Development dependencies cython>=0.25 pytest>=4.6.5 diff --git a/setup.cfg b/setup.cfg index dd0975800..9831402d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,7 @@ install_requires = setuptools packaging importlib_metadata>=0.20; python_version < "3.8" + typing_extensions>=3.7.4; python_version < "3.8" [options.entry_points] console_scripts = diff --git a/spacy/compat.py b/spacy/compat.py index 2d51ff0ae..6eca18b80 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -22,6 +22,11 @@ try: except ImportError: cupy = None +try: # Python 3.8+ + from typing import Literal +except ImportError: + from typing_extensions import Literal # noqa: F401 + from thinc.api import Optimizer # noqa: F401 pickle = pickle diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 868f9d6d2..68cc20e9b 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -2,6 +2,7 @@ from typing import Optional, List from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from thinc.types import Floats2d +from ...compat import Literal from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel @@ -11,7 +12,7 @@ from ...tokens import Doc @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model[List[Doc], List[Floats2d]], - nr_feature_tokens: int, + nr_feature_tokens: Literal[3, 6, 8, 13], hidden_width: int, maxout_pieces: int, use_upper: bool = True, diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1e17b3212..5f25cbfe1 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -67,7 +67,7 @@ width = ${components.tok2vec.model.width} parser_config_string = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 99 +nr_feature_tokens = 3 hidden_width = 66 maxout_pieces = 2 @@ -95,7 +95,7 @@ def my_parser(): MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2), ) parser = build_tb_parser_model( - tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 + tok2vec=tok2vec, nr_feature_tokens=8, hidden_width=65, maxout_pieces=5 ) return parser @@ -340,3 +340,13 @@ def test_config_auto_fill_extra_fields(): assert "extra" not in nlp.config["training"] # Make sure the config generated is valid load_model_from_config(nlp.config) + + +def test_config_validate_literal(): + nlp = English() + config = Config().from_str(parser_config_string) + config["model"]["nr_feature_tokens"] = 666 + with pytest.raises(ConfigValidationError): + nlp.add_pipe("parser", config=config) + config["model"]["nr_feature_tokens"] = 13 + nlp.add_pipe("parser", config=config) From 50a4425cdaed350653368c9c350f95717e9414d9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 16:03:32 +0200 Subject: [PATCH 8/9] Adjust docs --- spacy/ml/models/parser.py | 4 ++-- website/docs/api/architectures.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 68cc20e9b..5d091c590 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -42,8 +42,8 @@ def build_tb_parser_model( tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into vector representations. nr_feature_tokens (int): The number of tokens in the context to use to - construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The - 2, 8 and 13 feature sets are designed for the parser, while the 3 and 6 + construct the state vector. Valid choices are 3, 6, 8 and 13. The + 8 and 13 feature sets are designed for the parser, while the 3 and 6 feature sets are designed for the NER. The recommended feature sets are 3 for NER, and 8 for the dependency parser. diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 30d863b17..8797b2f31 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -449,7 +449,7 @@ consists of either two or three subnetworks: | Name | Description | | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | +| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `3`, `6`, `8` and `13`. The `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | | `hidden_width` | The width of the hidden layer. ~~int~~ | | `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | | `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | From 3c3863654e2804223a30c8ed3cae3d2e73147ca6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 16:54:43 +0200 Subject: [PATCH 9/9] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index b57bbeda2..b0cdd562c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a21" +__version__ = "3.0.0a22" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"