From b544dcb3c523d260c84431ea4fc07330e46b8790 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 12 Sep 2019 15:26:20 +0200 Subject: [PATCH] Document debug-data [ci skip] --- spacy/cli/debug_data.py | 5 + website/docs/api/cli.md | 223 ++++++++++++++++++++++++++++----- website/docs/usage/training.md | 99 ++++++++------- 3 files changed, 252 insertions(+), 75 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 259ef6d94..aac4d5b97 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -47,6 +47,11 @@ def debug_data( verbose=False, no_format=False, ): + """ + Analyze, debug and validate your training and development data, get useful + stats, and find problems like invalid entity annotations, cyclic + dependencies, low data labels and more. + """ msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings) # Make sure all files and paths exists if they are needed diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index d01637925..d13490a27 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -8,6 +8,7 @@ menu: - ['Info', 'info'] - ['Validate', 'validate'] - ['Convert', 'convert'] + - ['Debug data', 'debug-data'] - ['Train', 'train'] - ['Pretrain', 'pretrain'] - ['Init Model', 'init-model'] @@ -174,12 +175,172 @@ All output files generated by this command are compatible with -| ID | Description | -| ------------------------------ | --------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. 
| +| ID | Description | +| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | +| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | + +## Debug data {#debug-data new="2.2"} + +Analyze, debug and validate your training and development data, get useful +stats, and find problems like invalid entity annotations, cyclic dependencies, +low data labels and more. 
+ +```bash +$ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] +``` + +| Argument | Type | Description | +| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | +| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | +| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | +| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | +| `--verbose`, `-V` | flag | Print additional information and explanations. | +| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | + + + +``` +=========================== Data format validation =========================== +✔ Corpus is loadable + +=============================== Training stats =============================== +Training pipeline: tagger, parser, ner +Starting with blank model 'en' +18127 training docs +2939 evaluation docs +⚠ 34 training examples also in evaluation data + +============================== Vocab & Vectors ============================== +ℹ 2083156 total words in the data (56962 unique) +⚠ 13020 misaligned tokens in the training data +⚠ 2423 misaligned tokens in the dev data +10 most common words: 'the' (98429), ',' (91756), '.' 
(87073), 'to' (50058), +'of' (49559), 'and' (44416), 'a' (34010), 'in' (31424), 'that' (22792), 'is' +(18952) +ℹ No word vectors present in the model + +========================== Named Entity Recognition ========================== +ℹ 18 new labels, 0 existing labels +528978 missing values (tokens with '-' label) +New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' +(10490), 'NORP' (9033), 'MONEY' (5164), 'PERCENT' (3761), 'ORDINAL' (2122), +'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' +(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) +✔ Good amount of examples for all labels +✔ Examples without occurences available for all labels +✔ No entities consisting of or starting/ending with whitespace + +=========================== Part-of-speech Tagging =========================== +ℹ 49 labels in data (57 labels in tag map) +'NN' (266331), 'IN' (227365), 'DT' (185600), 'NNP' (164404), 'JJ' (119830), +'NNS' (110957), '.' (101482), ',' (92476), 'RB' (90090), 'PRP' (90081), 'VB' +(74538), 'VBD' (68199), 'CC' (62862), 'VBZ' (50712), 'VBP' (43420), 'VBN' +(42193), 'CD' (40326), 'VBG' (34764), 'TO' (31085), 'MD' (25863), 'PRP$' +(23335), 'HYPH' (13833), 'POS' (13427), 'UH' (13322), 'WP' (10423), 'WDT' +(9850), 'RP' (8230), 'WRB' (8201), ':' (8168), '''' (7392), '``' (6984), 'NNPS' +(5817), 'JJR' (5689), '$' (3710), 'EX' (3465), 'JJS' (3118), 'RBR' (2872), +'-RRB-' (2825), '-LRB-' (2788), 'PDT' (2078), 'XX' (1316), 'RBS' (1142), 'FW' +(794), 'NFP' (557), 'SYM' (440), 'WP$' (294), 'LS' (293), 'ADD' (191), 'AFX' +(24) +✔ All labels present in tag map for language 'en' + +============================= Dependency Parsing ============================= +ℹ Found 111703 sentences with an average length of 18.6 words. 
+ℹ Found 2251 nonprojective train sentences +ℹ Found 303 nonprojective dev sentences +ℹ 47 labels in train data +ℹ 211 labels in projectivized train data +'punct' (236796), 'prep' (188853), 'pobj' (182533), 'det' (172674), 'nsubj' +(169481), 'compound' (116142), 'ROOT' (111697), 'amod' (107945), 'dobj' (93540), +'aux' (86802), 'advmod' (86197), 'cc' (62679), 'conj' (59575), 'poss' (36449), +'ccomp' (36343), 'advcl' (29017), 'mark' (27990), 'nummod' (24582), 'relcl' +(21359), 'xcomp' (21081), 'attr' (18347), 'npadvmod' (17740), 'acomp' (17204), +'auxpass' (15639), 'appos' (15368), 'neg' (15266), 'nsubjpass' (13922), 'case' +(13408), 'acl' (12574), 'pcomp' (10340), 'nmod' (9736), 'intj' (9285), 'prt' +(8196), 'quantmod' (7403), 'dep' (4300), 'dative' (4091), 'agent' (3908), 'expl' +(3456), 'parataxis' (3099), 'oprd' (2326), 'predet' (1946), 'csubj' (1494), +'subtok' (1147), 'preconj' (692), 'meta' (469), 'csubjpass' (64), 'iobj' (1) +⚠ Low number of examples for label 'iobj' (1) +⚠ Low number of examples for 130 labels in the projectivized dependency +trees used for training. You may want to projectivize labels such as punct +before training in order to improve parser performance. 
+⚠ Projectivized labels with low numbers of examples: appos||attr: 12 +advmod||dobj: 13 prep||ccomp: 12 nsubjpass||ccomp: 15 pcomp||prep: 14 +amod||dobj: 9 attr||xcomp: 14 nmod||nsubj: 17 prep||advcl: 2 prep||prep: 5 +nsubj||conj: 12 advcl||advmod: 18 ccomp||advmod: 11 ccomp||pcomp: 5 acl||pobj: +10 npadvmod||acomp: 7 dobj||pcomp: 14 nsubjpass||pcomp: 1 nmod||pobj: 8 +amod||attr: 6 nmod||dobj: 12 aux||conj: 1 neg||conj: 1 dative||xcomp: 11 +pobj||dative: 3 xcomp||acomp: 19 advcl||pobj: 2 nsubj||advcl: 2 csubj||ccomp: 1 +advcl||acl: 1 relcl||nmod: 2 dobj||advcl: 10 advmod||advcl: 3 nmod||nsubjpass: 6 +amod||pobj: 5 cc||neg: 1 attr||ccomp: 16 advcl||xcomp: 3 nmod||attr: 4 +advcl||nsubjpass: 5 advcl||ccomp: 4 ccomp||conj: 1 punct||acl: 1 meta||acl: 1 +parataxis||acl: 1 prep||acl: 1 amod||nsubj: 7 ccomp||ccomp: 3 acomp||xcomp: 5 +dobj||acl: 5 prep||oprd: 6 advmod||acl: 2 dative||advcl: 1 pobj||agent: 5 +xcomp||amod: 1 dep||advcl: 1 prep||amod: 8 relcl||compound: 1 advcl||csubj: 3 +npadvmod||conj: 2 npadvmod||xcomp: 4 advmod||nsubj: 3 ccomp||amod: 7 +advcl||conj: 1 nmod||conj: 2 advmod||nsubjpass: 2 dep||xcomp: 2 appos||ccomp: 1 +advmod||dep: 1 advmod||advmod: 5 aux||xcomp: 8 dep||advmod: 1 dative||ccomp: 2 +prep||dep: 1 conj||conj: 1 dep||ccomp: 4 cc||ROOT: 1 prep||ROOT: 1 nsubj||pcomp: +3 advmod||prep: 2 relcl||dative: 1 acl||conj: 1 advcl||attr: 4 prep||npadvmod: 1 +nsubjpass||xcomp: 1 neg||advmod: 1 xcomp||oprd: 1 advcl||advcl: 1 dobj||dep: 3 +nsubjpass||parataxis: 1 attr||pcomp: 1 ccomp||parataxis: 1 advmod||attr: 1 +nmod||oprd: 1 appos||nmod: 2 advmod||relcl: 1 appos||npadvmod: 1 appos||conj: 1 +prep||expl: 1 nsubjpass||conj: 1 punct||pobj: 1 cc||pobj: 1 conj||pobj: 1 +punct||conj: 1 ccomp||dep: 1 oprd||xcomp: 3 ccomp||xcomp: 1 ccomp||nsubj: 1 +nmod||dep: 1 xcomp||ccomp: 1 acomp||advcl: 1 intj||advmod: 1 advmod||acomp: 2 +relcl||oprd: 1 advmod||prt: 1 advmod||pobj: 1 appos||nummod: 1 relcl||npadvmod: +3 mark||advcl: 1 aux||ccomp: 1 amod||nsubjpass: 1 
npadvmod||advmod: 1 conj||dep: +1 nummod||pobj: 1 amod||npadvmod: 1 intj||pobj: 1 nummod||npadvmod: 1 +xcomp||xcomp: 1 aux||dep: 1 advcl||relcl: 1 +⚠ The following labels were found only in the train data: xcomp||amod, +advcl||relcl, prep||nsubjpass, acl||nsubj, nsubjpass||conj, xcomp||oprd, +advmod||conj, advmod||advmod, iobj, advmod||nsubjpass, dobj||conj, ccomp||amod, +meta||acl, xcomp||xcomp, prep||attr, prep||ccomp, advcl||acomp, acl||dobj, +advcl||advcl, pobj||agent, prep||advcl, nsubjpass||xcomp, prep||dep, +acomp||xcomp, aux||ccomp, ccomp||dep, conj||dep, relcl||compound, +nsubjpass||ccomp, nmod||dobj, advmod||advcl, advmod||acl, dobj||advcl, +dative||xcomp, prep||nsubj, ccomp||ccomp, nsubj||ccomp, xcomp||acomp, +prep||acomp, dep||advmod, acl||pobj, appos||dobj, npadvmod||acomp, cc||ROOT, +relcl||nsubj, nmod||pobj, acl||nsubjpass, ccomp||advmod, pcomp||prep, +amod||dobj, advmod||attr, advcl||csubj, appos||attr, dobj||pcomp, prep||ROOT, +relcl||pobj, advmod||pobj, amod||nsubj, ccomp||xcomp, prep||oprd, +npadvmod||advmod, appos||nummod, advcl||pobj, neg||advmod, acl||attr, +appos||nsubjpass, csubj||ccomp, amod||nsubjpass, intj||pobj, dep||advcl, +cc||neg, xcomp||ccomp, dative||ccomp, nmod||oprd, pobj||dative, prep||dobj, +dep||ccomp, relcl||attr, ccomp||nsubj, advcl||xcomp, nmod||dep, advcl||advmod, +ccomp||conj, pobj||prep, advmod||acomp, advmod||relcl, attr||pcomp, +ccomp||parataxis, oprd||xcomp, intj||advmod, nmod||nsubjpass, prep||npadvmod, +parataxis||acl, prep||pobj, advcl||dobj, amod||pobj, prep||acl, conj||pobj, +advmod||dep, punct||pobj, ccomp||acomp, acomp||advcl, nummod||npadvmod, +dobj||dep, npadvmod||xcomp, advcl||conj, relcl||npadvmod, punct||acl, +relcl||dobj, dobj||xcomp, nsubjpass||parataxis, dative||advcl, relcl||nmod, +advcl||ccomp, appos||npadvmod, ccomp||pcomp, prep||amod, mark||advcl, +prep||advmod, prep||xcomp, appos||nsubj, attr||ccomp, advmod||prt, dobj||ccomp, +aux||conj, advcl||nsubj, conj||conj, advmod||ccomp, advcl||nsubjpass, 
+attr||xcomp, nmod||conj, npadvmod||conj, relcl||dative, prep||expl, +nsubjpass||pcomp, advmod||xcomp, advmod||dobj, appos||pobj, nsubj||conj, +relcl||nsubjpass, advcl||attr, appos||ccomp, advmod||prep, prep||conj, +nmod||attr, punct||conj, neg||conj, dep||xcomp, aux||xcomp, dobj||acl, +nummod||pobj, amod||npadvmod, nsubj||pcomp, advcl||acl, appos||nmod, +relcl||oprd, prep||prep, cc||pobj, nmod||nsubj, amod||attr, aux||dep, +appos||conj, advmod||nsubj, nsubj||advcl, acl||conj +To train a parser, your data should include at least 20 instances of each label. +⚠ Multiple root labels (ROOT, nsubj, aux, npadvmod, prep) found in +training data. spaCy's parser uses a single root label ROOT so this distinction +will not be available. + +================================== Summary ================================== +✔ 5 checks passed +⚠ 8 warnings +``` + + ## Train {#train} @@ -226,7 +387,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. | -| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. | +| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. | | `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. 
| @@ -291,26 +452,26 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--seed] [--n-iter] [--use-vectors] [--n-save_every] [--init-tok2vec] [--epoch-start] ``` -| Argument | Type | Description | -| ----------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------- | -| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. | -| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | -| `output_dir` | positional | Directory to write models to on each epoch. | -| `--width`, `-cw` | option | Width of CNN layers. | -| `--depth`, `-cd` | option | Depth of CNN layers. | -| `--embed-rows`, `-er` | option | Number of embedding rows. | -| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. | -| `--dropout`, `-d` | option | Dropout rate. | -| `--batch-size`, `-bs` | option | Number of words per training batch. | -| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. | -| `--min-length`, `-nw` | option | Minimum words per example. Shorter examples are discarded. | -| `--seed`, `-s` | option | Seed for random number generators. | -| `--n-iter`, `-i` | option | Number of iterations to pretrain. | -| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | -| `--n-save-every`, `-se` | option | Save model every X batches. | -| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.| -| `--epoch-start`, `-es` 2.1.5 | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. 
Prevents unintended overwriting of existing weight files.| -| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | +| Argument | Type | Description | +| ----------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. | +| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | +| `output_dir` | positional | Directory to write models to on each epoch. | +| `--width`, `-cw` | option | Width of CNN layers. | +| `--depth`, `-cd` | option | Depth of CNN layers. | +| `--embed-rows`, `-er` | option | Number of embedding rows. | +| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. | +| `--dropout`, `-d` | option | Dropout rate. | +| `--batch-size`, `-bs` | option | Number of words per training batch. | +| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. | +| `--min-length`, `-nw` | option | Minimum words per example. Shorter examples are discarded. | +| `--seed`, `-s` | option | Seed for random number generators. | +| `--n-iter`, `-i` | option | Number of iterations to pretrain. | +| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | +| `--n-save-every`, `-se` | option | Save model every X batches. | +| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | +| `--epoch-start`, `-es` 2.1.5 | option | The epoch to start counting at. 
Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. | +| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | ### JSONL format for raw text {#pretrain-jsonl} @@ -330,10 +491,10 @@ tokenization can be provided. > srsly.write_jsonl("/path/to/text.jsonl", data) > ``` -| Key | Type | Description | -| -------- | ------- | -------------------------------------------- | +| Key | Type | Description | +| -------- | ------- | ---------------------------------------------------------- | | `text` | unicode | The raw input text. Is not required if `tokens` available. | -| `tokens` | list | Optional tokenization, one string per token. | +| `tokens` | list | Optional tokenization, one string per token. | ```json ### Example @@ -424,7 +585,7 @@ pip install dist/en_model-0.0.0.tar.gz | `input_dir` | positional | Path to directory containing model data. | | `output_dir` | positional | Directory to create package folder in. | | `--meta-path`, `-m` 2 | option | Path to `meta.json` file (optional). | -| `--create-meta`, `-c` 2 | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. -| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. | +| `--create-meta`, `-c` 2 | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. | +| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | directory | A Python package containing the spaCy model. 
| diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index dd5cd8530..e3386a64f 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -10,9 +10,9 @@ menu: --- This guide describes how to train new statistical models for spaCy's -part-of-speech tagger, named entity recognizer, dependency parser, -text classifier and entity linker. Once the model is trained, -you can then [save and load](/usage/saving-loading#models) it. +part-of-speech tagger, named entity recognizer, dependency parser, text +classifier and entity linker. Once the model is trained, you can then +[save and load](/usage/saving-loading#models) it. ## Training basics {#basics} @@ -40,6 +40,19 @@ mkdir models python -m spacy train es models ancora-json/es_ancora-ud-train.json ancora-json/es_ancora-ud-dev.json ``` + + +If you're running spaCy v2.2 or above, you can use the +[`debug-data` command](/api/cli#debug-data) to analyze and validate your +training and development data, get useful stats, and find problems like invalid +entity annotations, cyclic dependencies, low data labels and more. + +```bash +$ python -m spacy debug-data en train.json dev.json --verbose +``` + + + You can also use the [`gold.docs_to_json`](/api/goldparse#docs_to_json) helper to convert a list of `Doc` objects to spaCy's JSON training format. @@ -222,11 +235,10 @@ of being dropped. > - [`begin_training()`](/api/language#begin_training): Start the training and > return an optimizer function to update the model's weights. Can take an -> optional function converting the training data to spaCy's training format. -> - [`update()`](/api/language#update): Update the model with the -> training example and gold data. -> - [`to_disk()`](/api/language#to_disk): Save -> the updated model to a directory. +> optional function converting the training data to spaCy's training format. +> - [`update()`](/api/language#update): Update the model with the training +> example and gold data. 
+> - [`to_disk()`](/api/language#to_disk): Save the updated model to a directory. ```python ### Example training loop @@ -405,19 +417,20 @@ referred to as the "catastrophic forgetting" problem. ## Entity linking {#entity-linker} -To train an entity linking model, you first need to define a knowledge base (KB). +To train an entity linking model, you first need to define a knowledge base +(KB). ### Creating a knowledge base {#kb} -A KB consists of a list of entities with unique identifiers. Each such entity -has an entity vector that will be used to measure similarity with the context in -which an entity is used. These vectors are pretrained and stored in the KB before -the entity linking model will be trained. +A KB consists of a list of entities with unique identifiers. Each such entity +has an entity vector that will be used to measure similarity with the context in +which an entity is used. These vectors are pretrained and stored in the KB +before the entity linking model will be trained. -The following example shows how to build a knowledge base from scratch, -given a list of entities and potential aliases. The script further demonstrates -how to pretrain and store the entity vectors. To run this example, the script -needs access to a `vocab` instance or an `nlp` model with pretrained word embeddings. +The following example shows how to build a knowledge base from scratch, given a +list of entities and potential aliases. The script further demonstrates how to +pretrain and store the entity vectors. To run this example, the script needs +access to a `vocab` instance or an `nlp` model with pretrained word embeddings. ```python https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py @@ -428,22 +441,22 @@ https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py 1. 
**Load the model** you want to start with, or create an **empty model** using [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language and a pre-defined [`vocab`](/api/vocab) object. -2. **Pretrain the entity embeddings** by running the descriptions of the entities - through a simple encoder-decoder network. The current implementation requires - the `nlp` model to have access to pre-trained word embeddings, but a custom - implementation of this enoding step can also be used. +2. **Pretrain the entity embeddings** by running the descriptions of the + entities through a simple encoder-decoder network. The current implementation + requires the `nlp` model to have access to pre-trained word embeddings, but a + custom implementation of this encoding step can also be used. 3. **Construct the KB** by defining all entities with their pretrained vectors, - and all aliases with their prior probabilities. + and all aliases with their prior probabilities. 4. **Save** the KB using [`kb.dump`](/api/kb#dump). 5. **Test** the KB to make sure the entities were added correctly. ### Training an entity linking model {#entity-linker-model} -This example shows how to create an entity linker pipe using a previously created -knowledge base. The entity linker pipe is then trained with your own -examples. To do so, you'll need to provide -**example texts**, and the **character offsets** and **knowledge base identifiers** -of each entity contained in the texts. +This example shows how to create an entity linker pipe using a previously +created knowledge base. The entity linker pipe is then trained with your own +examples. To do so, you'll need to provide **example texts**, and the +**character offsets** and **knowledge base identifiers** of each entity +contained in the texts. 
```python https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py @@ -451,25 +464,23 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_li #### Step by step guide {#step-by-step-entity-linker} -1. **Load the KB** you want to start with, and specify the path - to the `Vocab` object that was used to create this KB. - Then, create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. - Don't forget to add the KB to the entity linker, - and to add the entity linker to the pipeline. - In practical applications, you will want a more advanced pipeline including - also a component for [named entity recognition](/usage/training#ner). - If you're using a model with additional components, make sure to disable all other - pipeline components during training using - [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be - training the entity linker. +1. **Load the KB** you want to start with, and specify the path to the `Vocab` + object that was used to create this KB. Then, create an **empty model** using + [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. + Don't forget to add the KB to the entity linker, and to add the entity linker + to the pipeline. In practical applications, you will want a more advanced + pipeline including also a component for + [named entity recognition](/usage/training#ner). If you're using a model with + additional components, make sure to disable all other pipeline components + during training using [`nlp.disable_pipes`](/api/language#disable_pipes). + This way, you'll only be training the entity linker. 2. **Shuffle and loop over** the examples. For each example, **update the model** by calling [`nlp.update`](/api/language#update), which steps through - the annotated examples of the input. 
For each combination of a mention in text and - a potential KB identifier, the model makes a **prediction** whether or not - this is the correct match. It then - consults the annotations to see whether it was right. If it was wrong, it - adjusts its weights so that the correct combination will score higher next time. + the annotated examples of the input. For each combination of a mention in + text and a potential KB identifier, the model makes a **prediction** whether + or not this is the correct match. It then consults the annotations to see + whether it was right. If it was wrong, it adjusts its weights so that the + correct combination will score higher next time. 3. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). 4. **Test** the model to make sure the entities in the training data are recognized correctly.