mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-05 14:10:34 +03:00
* Write a supersenses.json file into a wsd directory in init_model
This commit is contained in:
parent
00c9acbf42
commit
4c6533a019
|
@ -22,6 +22,7 @@ from shutil import copyfile
|
|||
from shutil import copytree
|
||||
import codecs
|
||||
from collections import defaultdict
|
||||
import json
|
||||
|
||||
from spacy.en import get_lex_props
|
||||
from spacy.en.lemmatizer import Lemmatizer
|
||||
|
@ -31,6 +32,7 @@ from spacy.vocab import write_binary_vectors
|
|||
from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV
|
||||
|
||||
import spacy.senses
|
||||
from spacy.munge import read_wordnet
|
||||
|
||||
|
||||
def setup_tokenizer(lang_data_dir, tok_dir):
|
||||
|
@ -127,6 +129,7 @@ def setup_vocab(src_dir, dst_dir):
|
|||
vocab.strings.dump(str(dst_dir / 'strings.txt'))
|
||||
|
||||
|
||||
|
||||
def main(lang_data_dir, corpora_dir, model_dir):
|
||||
model_dir = Path(model_dir)
|
||||
lang_data_dir = Path(lang_data_dir)
|
||||
|
@ -142,6 +145,13 @@ def main(lang_data_dir, corpora_dir, model_dir):
|
|||
setup_vocab(corpora_dir, model_dir / 'vocab')
|
||||
if not (model_dir / 'wordnet').exists():
|
||||
copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
|
||||
ss_probs = read_wordnet.make_supersense_dict(str(corpora_dir / 'wordnet'))
|
||||
|
||||
wsd_dir = Path(model_dir, 'wsd')
|
||||
if not wsd_dir.exists():
|
||||
wsd_dir.mkdir()
|
||||
with codecs.open(str(wsd_dir / 'supersenses.json'), 'w', 'utf8') as file_:
|
||||
json.dump(ss_probs, file_)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in New Issue
Block a user