Fix spacy vocab command
parent d0cf12c8c7
commit 98c35d2585
@@ -19,7 +19,7 @@ if __name__ == '__main__':
         'convert': convert,
         'package': package,
         'model': model,
-        'model': vocab,
+        'vocab': vocab,
         'profile': profile,
         'validate': validate
     }
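Why this fixes the command: Python dict literals silently collapse duplicate keys, keeping only the last value. Because the commands dict listed both 'model': model and 'model': vocab, the vocab handler overwrote the model handler, and no 'vocab' key existed at all, so spacy vocab was unreachable and spacy model was broken as well. A minimal sketch of the behaviour, using placeholder handlers rather than the real CLI functions:

    # Placeholder handlers standing in for the spacy.cli functions.
    def model(): return 'model handler'
    def vocab(): return 'vocab handler'

    # Duplicate keys in a dict literal: the later value silently wins.
    commands = {'model': model, 'model': vocab}
    assert list(commands) == ['model']
    assert commands['model'] is vocab  # 'model' now dispatches to vocab
    assert 'vocab' not in commands     # and 'vocab' cannot be dispatched

The remaining hunks, below, touch the vocab CLI module itself.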
@@ -1,31 +1,33 @@
-'''Compile a vocabulary from a lexicon jsonl file and word vectors.'''
 # coding: utf8
 from __future__ import unicode_literals
 
-from pathlib import Path
 import plac
 import json
 import spacy
 import numpy
-from spacy.util import ensure_path
+from pathlib import Path
+
+from ..util import prints, ensure_path
 
 
 @plac.annotations(
     lang=("model language", "positional", None, str),
-    output_dir=("output directory to store model in", "positional", None, str),
+    output_dir=("model output directory", "positional", None, Path),
     lexemes_loc=("location of JSONL-formatted lexical data", "positional",
-                 None, str),
-    vectors_loc=("location of vectors data, as numpy .npz (optional)",
-                 "positional", None, str),
-    version=("Model version", "option", "V", str),
-)
-def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None):
-    out_dir = ensure_path(output_dir)
-    jsonl_loc = ensure_path(lexemes_loc)
+                 None, Path),
+    vectors_loc=("optional: location of vectors data, as numpy .npz",
+                 "positional", None, str))
+def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
+    """Compile a vocabulary from a lexicon jsonl file and word vectors."""
+    if not lexemes_loc.exists():
+        prints(lexemes_loc, title="Can't find lexical data", exits=1)
+    vectors_loc = ensure_path(vectors_loc)
     nlp = spacy.blank(lang)
     for word in nlp.vocab:
         word.rank = 0
-    with jsonl_loc.open() as file_:
+    lex_added = 0
+    vec_added = 0
+    with lexemes_loc.open() as file_:
         for line in file_:
             if line.strip():
                 attrs = json.loads(line)
@@ -35,14 +37,18 @@ def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None):
                 lex = nlp.vocab[attrs['orth']]
                 lex.set_attrs(**attrs)
                 assert lex.rank == attrs['id']
+                lex_added += 1
     if vectors_loc is not None:
         vector_data = numpy.load(open(vectors_loc, 'rb'))
         nlp.vocab.clear_vectors(width=vector_data.shape[1])
-        added = 0
         for word in nlp.vocab:
             if word.rank:
                 nlp.vocab.vectors.add(word.orth_, row=word.rank,
                                       vector=vector_data[word.rank])
-                added += 1
-    nlp.to_disk(out_dir)
+                vec_added += 1
+    if not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)
+    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
+           title="Sucessfully compiled vocab and vectors, and saved model")
     return nlp
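For reference, each non-empty line of the lexemes file is a standalone JSON object. From the parsing loop above, it must carry at least 'orth' (the surface form used to look the lexeme up in nlp.vocab) and 'id' (asserted to equal the lexeme's rank, i.e. its row in the vectors table); any other keys are applied via lex.set_attrs(). A hypothetical two-line lexemes.jsonl, with 'prob' as an example of an extra attribute (ids start at 1 here because the vector loop skips entries with rank 0):

    {"id": 1, "orth": "the", "prob": -3.0}
    {"id": 2, "orth": "and", "prob": -4.2}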
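With the registry fixed, the command is reachable under its own name. A hedged invocation sketch against this version of the CLI, with made-up paths:

    python -m spacy vocab en ./my-vocab-model lexemes.jsonl vectors.npy

One caveat on the vectors file: the help text says numpy .npz, but numpy.load() on a true .npz archive returns an NpzFile, which has no .shape attribute and is indexed by array name rather than by row. The code above (vector_data.shape[1], vector_data[word.rank]) expects a single 2-D array, which is what numpy.save() produces as .npy. A sketch of building a matching vectors file for the two lexemes above:

    import numpy

    # Hypothetical 3 x 300 table: row index = lexeme 'id'/rank, and the
    # width is the dimensionality picked up by clear_vectors(). Row 0 is
    # a placeholder, since words with rank 0 are skipped.
    vectors = numpy.zeros((3, 300), dtype='float32')
    vectors[1] = numpy.random.uniform(-1, 1, 300)  # row for "the"
    vectors[2] = numpy.random.uniform(-1, 1, 300)  # row for "and"
    numpy.save('vectors.npy', vectors)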