Various updates/additions to CLI scripts (#5362)

* `debug-data`: determine coverage of provided vectors
* `evaluate`: support `blank:lg` model to make it possible to just evaluate
  tokenization
* `init-model`: add option to truncate vectors to N most frequent vectors
  from word2vec file
* `train`:
  * if training on GPU, only run evaluation/timing on CPU in the first
    iteration
  * if training is aborted, exit with a non-0 exit status
			
			
parent cfdaf99b80
commit bdff76dede
spacy/cli/debug_data.py

```diff
@@ -108,9 +108,11 @@ def debug_data(
     msg.good("Corpus is loadable")
 
     # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
-    gold_dev_data = _compile_gold(dev_docs, pipeline)
+    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_docs_unpreprocessed, pipeline, nlp
+    )
+    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
@@ -182,6 +184,16 @@ def debug_data(
                 nlp.vocab.vectors_length,
             )
         )
+        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
+        msg.warn(
+            "{} words in training data without vectors ({:0.2f}%)".format(
+                n_missing_vectors,
+                n_missing_vectors / gold_train_data["n_words"],
+            ),
+        )
+        msg.text(
+            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+        )
     else:
         msg.info("No word vectors present in the model")
 
@@ -562,7 +574,7 @@ def _load_file(file_path, msg):
     )
 
 
-def _compile_gold(train_docs, pipeline):
+def _compile_gold(train_docs, pipeline, nlp):
     data = {
         "ner": Counter(),
         "cats": Counter(),
@@ -574,6 +586,7 @@ def _compile_gold(train_docs, pipeline):
         "punct_ents": 0,
         "n_words": 0,
         "n_misaligned_words": 0,
+        "words_missing_vectors": Counter(),
         "n_sents": 0,
         "n_nonproj": 0,
         "n_cycles": 0,
@@ -586,6 +599,10 @@ def _compile_gold(train_docs, pipeline):
         data["n_words"] += len(valid_words)
         data["n_misaligned_words"] += len(gold.words) - len(valid_words)
         data["texts"].add(doc.text)
+        if len(nlp.vocab.vectors):
+            for word in valid_words:
+                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
+                    data["words_missing_vectors"].update([word])
         if "ner" in pipeline:
             for i, label in enumerate(gold.ner):
                 if label is None:
@@ -636,7 +653,11 @@ def _format_labels(labels, counts=False):
 def _get_examples_without_label(data, label):
     count = 0
     for doc, gold in data:
-        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        labels = [
+            label.split("-")[1]
+            for label in gold.ner
+            if label is not None and label not in ("O", "-")
+        ]
         if label not in labels:
             count += 1
     return count
```
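The `debug-data` hunks above thread the `nlp` object into `_compile_gold()` so it can count gold-standard words whose hashes are missing from the model's vectors table, then report the total and the ten most common offenders. Below is a minimal standalone sketch of that check, not the commit's code: the model name and word list are stand-in assumptions.

```python
from collections import Counter

import spacy

# Assumption: a model with vectors (e.g. en_core_web_md) is installed;
# `words` stands in for the valid gold-standard training words.
nlp = spacy.load("en_core_web_md")
words = ["apple", "banana", "qwxzptlk"]

words_missing_vectors = Counter()
if len(nlp.vocab.vectors):
    for word in words:
        # The vectors table is keyed by string hashes, so look up the
        # hash in the StringStore before testing membership
        if nlp.vocab.strings[word] not in nlp.vocab.vectors:
            words_missing_vectors.update([word])

n_missing = sum(words_missing_vectors.values())
print("{} words without vectors".format(n_missing))
print(words_missing_vectors.most_common(10))
```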
spacy/cli/evaluate.py

```diff
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals, division, print_function
 
 import plac
+import spacy
 from timeit import default_timer as timer
 from wasabi import msg
 
@@ -43,7 +44,10 @@ def evaluate(
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
-    nlp = util.load_model(model)
+    if model.startswith("blank:"):
+        nlp = spacy.blank(model.replace("blank:", ""))
+    else:
+        nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
     begin = timer()
     scorer = nlp.evaluate(dev_docs, verbose=False)
```
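With the `blank:` prefix, `evaluate` can score a pipeline that contains nothing but a tokenizer, which is what makes tokenization-only evaluation possible. A short sketch of the branching above; the model names are illustrative, and `en_core_web_sm` is assumed to be installed.

```python
import spacy
from spacy import util

for model in ("blank:en", "en_core_web_sm"):
    if model.startswith("blank:"):
        # "blank:en" -> untrained English pipeline with only a tokenizer
        nlp = spacy.blank(model.replace("blank:", ""))
    else:
        nlp = util.load_model(model)
    print(model, nlp.pipe_names)  # the blank model has no components
```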
spacy/cli/init_model.py

```diff
@@ -35,6 +35,12 @@ DEFAULT_OOV_PROB = -20
     jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
     clusters_loc=("Optional location of brown clusters data", "option", "c", str),
     vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
+    truncate_vectors=(
+        "Optional number of vectors to truncate to when reading in vectors file",
+        "option",
+        "t",
+        int,
+    ),
     prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
     vectors_name=(
         "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
@@ -51,6 +57,7 @@ def init_model(
     clusters_loc=None,
     jsonl_loc=None,
     vectors_loc=None,
+    truncate_vectors=0,
     prune_vectors=-1,
     vectors_name=None,
     model_name=None,
@@ -88,7 +95,7 @@ def init_model(
         nlp = create_model(lang, lex_attrs, name=model_name)
     msg.good("Successfully created model")
     if vectors_loc is not None:
-        add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
+        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
     msg.good(
@@ -169,7 +176,7 @@ def create_model(lang, lex_attrs, name=None):
     return nlp
 
 
-def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
+def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -179,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
     else:
         if vectors_loc:
             with msg.loading("Reading vectors from {}".format(vectors_loc)):
-                vectors_data, vector_keys = read_vectors(vectors_loc)
+                vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
             msg.good("Loaded vectors from {}".format(vectors_loc))
         else:
             vectors_data, vector_keys = (None, None)
@@ -199,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
         nlp.vocab.prune_vectors(prune_vectors)
 
 
-def read_vectors(vectors_loc):
+def read_vectors(vectors_loc, truncate_vectors=0):
     f = open_file(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
     vectors_data = numpy.zeros(shape=shape, dtype="f")
     vectors_keys = []
     for i, line in enumerate(tqdm(f)):
@@ -212,6 +221,8 @@ def read_vectors(vectors_loc):
             msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
         vectors_data[i] = numpy.asarray(pieces, dtype="f")
         vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
     return vectors_data, vectors_keys
 
 
```
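Because word2vec text files are conventionally sorted by descending frequency, stopping the read after the first N data rows keeps the N most frequent vectors. The sketch below isolates that truncation logic under stated assumptions: a plain-text file (no `.zip`/`.tar.gz` handling), no progress bar, and no malformed-line checks; it is not the commit's full implementation.

```python
import numpy

def read_vectors(vectors_loc, truncate_vectors=0):
    # A word2vec text file starts with a "<n_rows> <n_dims>" header line
    with open(vectors_loc) as f:
        shape = tuple(int(size) for size in next(f).split())
        if truncate_vectors >= 1:
            # Shrink the preallocated array to the first N rows
            shape = (truncate_vectors, shape[1])
        vectors_data = numpy.zeros(shape=shape, dtype="f")
        vectors_keys = []
        for i, line in enumerate(f):
            pieces = line.rstrip().rsplit(" ", shape[1])
            word = pieces.pop(0)
            vectors_data[i] = numpy.asarray(pieces, dtype="f")
            vectors_keys.append(word)
            if i == truncate_vectors - 1:
                break  # stop after the N most frequent entries
    return vectors_data, vectors_keys

# Hypothetical usage: keep only the 10,000 most frequent vectors
# data, keys = read_vectors("vectors.txt", truncate_vectors=10000)
```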
spacy/cli/train.py

```diff
@@ -454,22 +454,25 @@ def train(
                         cpu_wps = nwords / (end_time - start_time)
                     else:
                         gpu_wps = nwords / (end_time - start_time)
-                        with Model.use_device("cpu"):
-                            nlp_loaded = util.load_model_from_path(epoch_model_path)
-                            for name, component in nlp_loaded.pipeline:
-                                if hasattr(component, "cfg"):
-                                    component.cfg["beam_width"] = beam_width
-                            dev_docs = list(
-                                corpus.dev_docs(
-                                    nlp_loaded,
-                                    gold_preproc=gold_preproc,
-                                    ignore_misaligned=True,
-                                )
-                            )
-                            start_time = timer()
-                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
-                            end_time = timer()
-                            cpu_wps = nwords / (end_time - start_time)
+                        # Only evaluate on CPU in the first iteration (for
+                        # timing) if GPU is enabled
+                        if i >= 1:
+                            with Model.use_device("cpu"):
+                                nlp_loaded = util.load_model_from_path(epoch_model_path)
+                                for name, component in nlp_loaded.pipeline:
+                                    if hasattr(component, "cfg"):
+                                        component.cfg["beam_width"] = beam_width
+                                dev_docs = list(
+                                    corpus.dev_docs(
+                                        nlp_loaded,
+                                        gold_preproc=gold_preproc,
+                                        ignore_misaligned=True,
+                                    )
+                                )
+                                start_time = timer()
+                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+                                end_time = timer()
+                                cpu_wps = nwords / (end_time - start_time)
                     acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                     srsly.write_json(acc_loc, scorer.scores)
 
@@ -550,7 +553,8 @@ def train(
     except Exception as e:
         msg.warn(
             "Aborting and saving the final best model. "
-            "Encountered exception: {}".format(e)
+            "Encountered exception: {}".format(e),
+            exits=1,
         )
     finally:
         best_pipes = nlp.pipe_names
```
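Two things happen in the `train` hunks: the CPU re-evaluation used for timing on GPU runs is now gated on the iteration counter instead of running every epoch, and an aborted run now exits non-zero. The latter works because wasabi's printer methods accept an `exits` keyword and call `sys.exit()` with that code after printing, as the diff's own `msg.warn(..., exits=1)` shows. A small illustration with a simulated failure:

```python
from wasabi import msg

try:
    raise ValueError("simulated training failure")  # stand-in exception
except Exception as e:
    msg.warn(
        "Aborting and saving the final best model. "
        "Encountered exception: {}".format(e),
        exits=1,  # wasabi prints the warning, then calls sys.exit(1)
    )
```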
website/docs/api/cli.md

```diff
@@ -547,7 +547,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 | `output_dir`            | positional | Model output directory. Will be created if it doesn't exist.                                                 |
 | `--jsonl-loc`, `-j`     | option     | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
 | `--vectors-loc`, `-v`   | option     | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
-| `--prune-vectors`, `-V` | flag       | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning.                               |
+| `--truncate-vectors`, `-t` | option  | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation.            |
+| `--prune-vectors`, `-V` | option     | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning.                               |
 | `--vectors-name`, `-vn` | option     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`.                        |
 | **CREATES**             | model      | A spaCy model containing the vocab and vectors.                                                              |
```
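For example, a call along the lines of `python -m spacy init-model en /output --vectors-loc vectors.txt.gz --truncate-vectors 100000` would read only the first 100,000 rows of the vectors file, after which `--prune-vectors` can reduce the table further; the paths and count here are illustrative, not from the commit.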