spaCy/spacy/cli/vocab.py

60 lines
2.1 KiB
Python
Raw Normal View History

2017-10-30 18:14:50 +03:00
# coding: utf8
from __future__ import unicode_literals
import plac
import json
import spacy
import numpy
2017-10-30 20:38:41 +03:00
from pathlib import Path
from ..vectors import Vectors
2017-10-30 20:38:41 +03:00
from ..util import prints, ensure_path
2017-10-30 18:14:50 +03:00
@plac.annotations(
    lang=("model language", "positional", None, str),
    output_dir=("model output directory", "positional", None, Path),
    lexemes_loc=("location of JSONL-formatted lexical data", "positional",
                 None, Path),
    vectors_loc=("optional: location of vectors data, as numpy .npz",
                 "positional", None, str),
    prune_vectors=("optional: number of vectors to prune to.",
                   "option", "V", int)
)
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None,
               prune_vectors=-1):
    """Compile a vocabulary from a lexicon jsonl file and word vectors.

    Creates a blank pipeline for `lang`, fills its vocab from the lexicon,
    optionally attaches (and prunes) a vector table, and saves the result
    to `output_dir`.

    lang (unicode): Language code passed to `spacy.blank`.
    output_dir (Path): Directory the model is written to. It is created,
        together with any missing parent directories, if it does not exist.
    lexemes_loc (Path): JSONL file with one JSON object per non-blank line.
        An object with a 'settings' key is merged into `nlp.vocab.cfg`;
        any other object is treated as a lexeme record and must provide at
        least 'orth' and 'id'.
    vectors_loc (unicode or Path): Optional location of the vectors data,
        read with `numpy.load`. If None, no vectors are attached.
    prune_vectors (int): If >= 1, `nlp.vocab.prune_vectors` is called with
        this value. The default of -1 disables pruning.
    RETURNS: The `nlp` object that was written to disk.
    """
    if not lexemes_loc.exists():
        # NOTE(review): `exits=1` presumably terminates the process here.
        prints(lexemes_loc, title="Can't find lexical data", exits=1)
    vectors_loc = ensure_path(vectors_loc)
    nlp = spacy.blank(lang)
    # Reset the rank of every lexeme the blank pipeline starts out with, so
    # that only entries whose rank is set from the lexicon are non-zero when
    # ranks are used as vector rows below.
    for word in nlp.vocab:
        word.rank = 0
    lex_added = 0
    # JSONL data is UTF-8; don't rely on the platform's default encoding.
    with lexemes_loc.open(encoding='utf8') as file_:
        for line in file_:
            if not line.strip():
                continue
            attrs = json.loads(line)
            if 'settings' in attrs:
                nlp.vocab.cfg.update(attrs['settings'])
            else:
                lex = nlp.vocab[attrs['orth']]
                lex.set_attrs(**attrs)
                # Sanity check: the record's id must end up as the rank.
                assert lex.rank == attrs['id']
            lex_added += 1
    if vectors_loc is not None:
        # Close the file handle once the vector table has been built
        # instead of leaking it.
        with vectors_loc.open('rb') as vectors_file:
            vector_data = numpy.load(vectors_file)
            nlp.vocab.vectors = Vectors(data=vector_data)
        for word in nlp.vocab:
            # The lexeme's rank is used as its row in the vector table.
            # NOTE(review): entries with rank 0 are skipped, so row 0 is
            # never mapped to a key here -- confirm this is intended.
            if word.rank:
                nlp.vocab.vectors.add(word.orth, row=word.rank)
    if prune_vectors >= 1:
        nlp.vocab.prune_vectors(prune_vectors)
    if not output_dir.exists():
        # parents=True so that a nested output path can be created in one go.
        output_dir.mkdir(parents=True)
    nlp.to_disk(output_dir)
    vec_added = len(nlp.vocab.vectors)
    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
           title="Sucessfully compiled vocab and vectors, and saved model")
    return nlp