spaCy/spacy/cli/vocab.py

# coding: utf8
from __future__ import unicode_literals

import plac
import json
import spacy
import numpy
from pathlib import Path

from ..vectors import Vectors
from ..util import prints, ensure_path


@plac.annotations(
    lang=("model language", "positional", None, str),
    output_dir=("model output directory", "positional", None, Path),
    lexemes_loc=("location of JSONL-formatted lexical data", "positional",
                 None, Path),
    vectors_loc=("optional: location of vectors data, as numpy .npz",
                 "positional", None, str),
    prune_vectors=("optional: number of vectors to prune to.",
                   "option", "V", int)
)
def make_vocab(cmd, lang, output_dir, lexemes_loc,
               vectors_loc=None, prune_vectors=-1):
    """Compile a vocabulary from a lexicon jsonl file and word vectors."""
    if not lexemes_loc.exists():
        prints(lexemes_loc, title="Can't find lexical data", exits=1)
    vectors_loc = ensure_path(vectors_loc)
    nlp = spacy.blank(lang)
    for word in nlp.vocab:
        word.rank = 0
    lex_added = 0
    with lexemes_loc.open() as file_:
        for line in file_:
            if line.strip():
                attrs = json.loads(line)
                if 'settings' in attrs:
                    nlp.vocab.cfg.update(attrs['settings'])
                else:
                    lex = nlp.vocab[attrs['orth']]
                    lex.set_attrs(**attrs)
                    assert lex.rank == attrs['id']
                lex_added += 1
    if vectors_loc is not None:
        vector_data = numpy.load(vectors_loc.open('rb'))
        nlp.vocab.vectors = Vectors(data=vector_data)
        for word in nlp.vocab:
            if word.rank:
                nlp.vocab.vectors.add(word.orth, row=word.rank)

    if prune_vectors >= 1:
        remap = nlp.vocab.prune_vectors(prune_vectors)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    vec_added = len(nlp.vocab.vectors)
    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
           title="Sucessfully compiled vocab and vectors, and saved model")
    return nlp
Wire up new vocab command 2017-10-30 18:14:50 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`

			`import plac`
			`import json`
			`import spacy`
			`import numpy`
Fix spacy vocab command 2017-10-30 20:38:41 +03:00			`from pathlib import Path`

Move vector pruning command into spacy vocab cli tool 2017-10-31 21:10:01 +03:00			`from ..vectors import Vectors`
Fix spacy vocab command 2017-10-30 20:38:41 +03:00			`from ..util import prints, ensure_path`
Wire up new vocab command 2017-10-30 18:14:50 +03:00

			`@plac.annotations(`
			`lang=("model language", "positional", None, str),`
Fix spacy vocab command 2017-10-30 20:38:41 +03:00			`output_dir=("model output directory", "positional", None, Path),`
Wire up new vocab command 2017-10-30 18:14:50 +03:00			`lexemes_loc=("location of JSONL-formatted lexical data", "positional",`
Fix spacy vocab command 2017-10-30 20:38:41 +03:00			`None, Path),`
			`vectors_loc=("optional: location of vectors data, as numpy .npz",`
Move vector pruning command into spacy vocab cli tool 2017-10-31 21:10:01 +03:00			`"positional", None, str),`
			`prune_vectors=("optional: number of vectors to prune to.",`
			`"option", "V", int)`
			`)`
			`def make_vocab(cmd, lang, output_dir, lexemes_loc,`
Make it easier to pass explicit no-pruning to vocab 2017-10-31 22:14:47 +03:00			`vectors_loc=None, prune_vectors=-1):`
Fix spacy vocab command 2017-10-30 20:38:41 +03:00			`"""Compile a vocabulary from a lexicon jsonl file and word vectors."""`
			`if not lexemes_loc.exists():`
			`prints(lexemes_loc, title="Can't find lexical data", exits=1)`
			`vectors_loc = ensure_path(vectors_loc)`
Wire up new vocab command 2017-10-30 18:14:50 +03:00			`nlp = spacy.blank(lang)`
			`for word in nlp.vocab:`
			`word.rank = 0`
Fix spacy vocab command 2017-10-30 20:38:41 +03:00			`lex_added = 0`
			`with lexemes_loc.open() as file_:`
Wire up new vocab command 2017-10-30 18:14:50 +03:00			`for line in file_:`
			`if line.strip():`
			`attrs = json.loads(line)`
			`if 'settings' in attrs:`
			`nlp.vocab.cfg.update(attrs['settings'])`
			`else:`
			`lex = nlp.vocab[attrs['orth']]`
			`lex.set_attrs(**attrs)`
			`assert lex.rank == attrs['id']`
Fix spacy vocab command 2017-10-30 20:38:41 +03:00			`lex_added += 1`
Wire up new vocab command 2017-10-30 18:14:50 +03:00			`if vectors_loc is not None:`
Move vector pruning command into spacy vocab cli tool 2017-10-31 21:10:01 +03:00			`vector_data = numpy.load(vectors_loc.open('rb'))`
			`nlp.vocab.vectors = Vectors(data=vector_data)`
Wire up new vocab command 2017-10-30 18:14:50 +03:00			`for word in nlp.vocab:`
			`if word.rank:`
Move vector pruning command into spacy vocab cli tool 2017-10-31 21:10:01 +03:00			`nlp.vocab.vectors.add(word.orth, row=word.rank)`

Make it easier to pass explicit no-pruning to vocab 2017-10-31 22:14:47 +03:00			`if prune_vectors >= 1:`
Move vector pruning command into spacy vocab cli tool 2017-10-31 21:10:01 +03:00			`remap = nlp.vocab.prune_vectors(prune_vectors)`
Fix spacy vocab command 2017-10-30 20:38:41 +03:00			`if not output_dir.exists():`
			`output_dir.mkdir()`
			`nlp.to_disk(output_dir)`
Move vector pruning command into spacy vocab cli tool 2017-10-31 21:10:01 +03:00			`vec_added = len(nlp.vocab.vectors)`
Fix spacy vocab command 2017-10-30 20:38:41 +03:00			`prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,`
			`title="Sucessfully compiled vocab and vectors, and saved model")`
Wire up new vocab command 2017-10-30 18:14:50 +03:00			`return nlp`