mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Wire up new vocab command
This commit is contained in:
parent
aa64031751
commit
0fc1209421
|
@ -7,7 +7,7 @@ if __name__ == '__main__':
|
|||
import plac
|
||||
import sys
|
||||
from spacy.cli import download, link, info, package, train, convert, model
|
||||
from spacy.cli import profile, evaluate, validate
|
||||
from spacy.cli import vocab, profile, evaluate, validate
|
||||
from spacy.util import prints
|
||||
|
||||
commands = {
|
||||
|
@ -19,6 +19,7 @@ if __name__ == '__main__':
|
|||
'convert': convert,
|
||||
'package': package,
|
||||
'model': model,
|
||||
'model': vocab,
|
||||
'profile': profile,
|
||||
'validate': validate
|
||||
}
|
||||
|
|
|
@ -7,4 +7,5 @@ from .train import train
|
|||
from .evaluate import evaluate
|
||||
from .convert import convert
|
||||
from .model import model
|
||||
from .vocab import make_vocab as vocab
|
||||
from .validate import validate
|
||||
|
|
49
spacy/cli/vocab.py
Normal file
49
spacy/cli/vocab.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
'''Compile a vocabulary from a lexicon jsonl file and word vectors.'''
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import json
|
||||
import spacy
|
||||
import numpy
|
||||
from spacy.util import ensure_path
|
||||
|
||||
|
||||
@plac.annotations(
    lang=("model language", "positional", None, str),
    output_dir=("output directory to store model in", "positional", None, str),
    lexemes_loc=("location of JSONL-formatted lexical data", "positional",
                 None, str),
    vectors_loc=("location of vectors data, as numpy .npz (optional)",
                 "positional", None, str))
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None):
    '''Build a blank pipeline's vocab from a JSONL lexicon and save it.

    lang: Language code handed to spacy.blank().
    output_dir: Directory the resulting pipeline is written to.
    lexemes_loc: Path to the JSONL lexicon. Blank lines are skipped. A line
        holding a 'settings' key updates nlp.vocab.cfg; every other line
        must provide at least 'orth' and 'id' and is applied to the
        matching lexeme via set_attrs().
    vectors_loc: Optional path to the vectors data. The loaded object is
        used as a 2D array whose row index is the lexeme's rank.
    RETURNS: The pipeline object that was written to output_dir.
    '''
    # Only the arguments this function actually accepts are annotated above:
    # plac annotations for names missing from the signature (and a reference
    # to an un-imported Path) would break the module on import.
    out_dir = ensure_path(output_dir)
    jsonl_loc = ensure_path(lexemes_loc)
    nlp = spacy.blank(lang)
    # Reset the rank of every lexeme the blank pipeline already contains.
    # Presumably this lets a non-zero rank later stand for "was listed in
    # the lexicon file" -- see the vectors loop below.
    for word in nlp.vocab:
        word.rank = 0
    # JSONL is read as UTF-8 explicitly instead of relying on the platform's
    # default encoding.
    with jsonl_loc.open(encoding='utf8') as file_:
        for line in file_:
            if not line.strip():
                continue
            attrs = json.loads(line)
            if 'settings' in attrs:
                nlp.vocab.cfg.update(attrs['settings'])
            else:
                lex = nlp.vocab[attrs['orth']]
                lex.set_attrs(**attrs)
                # Sanity check: after set_attrs() the lexeme's rank is
                # expected to equal the 'id' given in the lexicon entry.
                assert lex.rank == attrs['id']
    if vectors_loc is not None:
        # Close the file handle deterministically once the data is loaded.
        # NOTE(review): the CLI help mentions .npz, but the code below needs
        # an object with .shape and row indexing (what numpy.load returns
        # for a plain .npy array) -- confirm the intended file format.
        with open(vectors_loc, 'rb') as vectors_file:
            vector_data = numpy.load(vectors_file)
        nlp.vocab.clear_vectors(width=vector_data.shape[1])
        for word in nlp.vocab:
            # Lexemes still at rank 0 get no vector.
            # NOTE(review): this also skips a lexicon entry whose id is 0 --
            # confirm that lexicon ids start at 1.
            if word.rank:
                nlp.vocab.vectors.add(word.orth_, row=word.rank,
                                      vector=vector_data[word.rank])
    nlp.to_disk(out_dir)
    return nlp
|
Loading…
Reference in New Issue
Block a user