Fix spacy vocab command

ines 2017-10-30 18:38:41 +01:00
parent d0cf12c8c7
commit 98c35d2585
2 changed files with 23 additions and 17 deletions


@@ -19,7 +19,7 @@ if __name__ == '__main__':
         'convert': convert,
         'package': package,
         'model': model,
-        'model': vocab,
+        'vocab': vocab,
         'profile': profile,
         'validate': validate
     }
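The one-line change above is the whole fix: in the old dict literal the key 'model' appeared twice, and because a later duplicate key silently wins in Python, the vocab function overwrote the model command and no 'vocab' entry existed at all. A minimal standalone illustration (the stub functions are placeholders, not the spaCy code):

    # Duplicate keys in a dict literal: only the last value survives.
    def model():
        return "model command"

    def vocab():
        return "vocab command"

    commands = {'model': model, 'model': vocab}
    print(list(commands))        # ['model'] -- a single key
    print(commands['model']())   # 'vocab command' -- model was shadowed
    print('vocab' in commands)   # False -- `spacy vocab` could not be dispatched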


@@ -1,31 +1,33 @@
-'''Compile a vocabulary from a lexicon jsonl file and word vectors.'''
 # coding: utf8
 from __future__ import unicode_literals
 
-from pathlib import Path
 import plac
 import json
 import spacy
 import numpy
-from spacy.util import ensure_path
+from pathlib import Path
+
+from ..util import prints, ensure_path
 
 
 @plac.annotations(
     lang=("model language", "positional", None, str),
-    output_dir=("output directory to store model in", "positional", None, str),
+    output_dir=("model output directory", "positional", None, Path),
     lexemes_loc=("location of JSONL-formatted lexical data", "positional",
-                 None, str),
-    vectors_loc=("location of vectors data, as numpy .npz (optional)",
-                 "positional", None, str),
-    version=("Model version", "option", "V", str),
-)
-def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None):
-    out_dir = ensure_path(output_dir)
-    jsonl_loc = ensure_path(lexemes_loc)
+                 None, Path),
+    vectors_loc=("optional: location of vectors data, as numpy .npz",
+                 "positional", None, str))
+def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
+    """Compile a vocabulary from a lexicon jsonl file and word vectors."""
+    if not lexemes_loc.exists():
+        prints(lexemes_loc, title="Can't find lexical data", exits=1)
+    vectors_loc = ensure_path(vectors_loc)
     nlp = spacy.blank(lang)
     for word in nlp.vocab:
         word.rank = 0
-    with jsonl_loc.open() as file_:
+    lex_added = 0
+    vec_added = 0
+    with lexemes_loc.open() as file_:
         for line in file_:
             if line.strip():
                 attrs = json.loads(line)
@@ -35,14 +37,18 @@ def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None):
                 lex = nlp.vocab[attrs['orth']]
                 lex.set_attrs(**attrs)
                 assert lex.rank == attrs['id']
+                lex_added += 1
     if vectors_loc is not None:
         vector_data = numpy.load(open(vectors_loc, 'rb'))
         nlp.vocab.clear_vectors(width=vector_data.shape[1])
-        added = 0
         for word in nlp.vocab:
             if word.rank:
                 nlp.vocab.vectors.add(word.orth_, row=word.rank,
                                       vector=vector_data[word.rank])
-                added += 1
-    nlp.to_disk(out_dir)
+                vec_added += 1
+    if not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)
+    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
+           title="Sucessfully compiled vocab and vectors, and saved model")
     return nlp
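As a rough usage sketch (the paths and the sample lexeme are assumptions; only the argument order and the 'orth'/'id' fields come from the code above), the renamed command should now be reachable as:

    # hypothetical invocation: language, output directory, lexeme JSONL, optional vectors
    python -m spacy vocab en /tmp/vocab_model lexemes.jsonl vectors.npz

Each non-empty line of the JSONL file is one JSON object, e.g. {"orth": "the", "id": 1}: every attribute is passed to lex.set_attrs(), the assert requires the resulting rank to equal "id", and rows of the optional .npz matrix are looked up by that same rank.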