spaCy/bin/init_model.py

"""Set up a model directory.
Requires:
lang_data --- Rules for the tokenizer
* prefix.txt
* suffix.txt
* infix.txt
* morphs.json
* specials.json
corpora --- Data files
* WordNet
* words.sgt.prob --- Smoothed unigram probabilities
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
* vectors.tgz --- output of something like word2vec
"""
import plac
from pathlib import Path
from shutil import copyfile
from shutil import copytree
import codecs
from collections import defaultdict
import json
from spacy.en import get_lex_props
from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV
import spacy.senses
from spacy.munge import read_wordnet


def setup_tokenizer(lang_data_dir, tok_dir):
    if not tok_dir.exists():
        tok_dir.mkdir()

    for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
                     'suffix.txt'):
        src = lang_data_dir / filename
        dst = tok_dir / filename
        if not dst.exists():
            copyfile(str(src), str(dst))
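

# _read_clusters() below assumes the three-column, whitespace-separated output
# of a Brown-style clusterer ("e.g. Brown clusters" per the module docstring):
# a cluster bit-string, the word, and the word's frequency. Illustrative
# (made-up) example line:
#
#     0010110    apple    74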


def _read_clusters(loc):
    clusters = {}
    for line in codecs.open(str(loc), 'r', 'utf8'):
        try:
            cluster, word, freq = line.split()
        except ValueError:
            continue
        # If the clusterer has only seen the word a few times, its cluster is
        # unreliable.
        if int(freq) >= 3:
            clusters[word] = cluster
        else:
            clusters[word] = '0'
    # Expand clusters with re-casing
    for word, cluster in clusters.items():
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters
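

# _read_probs() below assumes one whitespace-separated "probability word" pair
# per line of words.sgt.prob. Judging by the -17.0 floor used in setup_vocab(),
# the values are smoothed log probabilities. Illustrative (made-up) line:
#
#     -8.3142    apple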


def _read_probs(loc):
    probs = {}
    for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
        prob, word = line.split()
        prob = float(prob)
        probs[word] = prob
    return probs
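

# _read_senses() below parses WordNet's index.sense file, where each line is
# "sense_key synset_offset sense_number tag_cnt" and the sense key itself
# encodes "lemma%ss_type:lex_filenum:lex_id:head_word:head_id". Only the lemma,
# the synset type and the lexicographer file number are used here.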


def _read_senses(loc):
    lexicon = defaultdict(lambda: defaultdict(list))
    pos_tags = [None, NOUN, VERB, ADJ, None, None]
    for line in codecs.open(str(loc), 'r', 'utf8'):
        sense_key, synset_offset, sense_number, tag_cnt = line.split()
        lemma, lex_sense = sense_key.split('%')
        ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
        pos = pos_tags[int(ss_type)]
        lexicon[lemma][pos].append(int(lex_filenum) + 1)
    return lexicon


def setup_vocab(src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))

    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    senses = _read_senses(src_dir / 'wordnet' / 'index.sense')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    # Words that only appear in the clusters or senses files get a low default
    # log probability, so they still make it into the lexicon below.
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0

    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    lexicon = []
    # Add entries in order of decreasing probability, keeping any word that has
    # a cluster, a WordNet sense, or a probability at or above the -17 floor.
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or word in senses or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string (first character = lowest bit),
            # so that we can do & 15 to get the first 4 bits of the path.
            # See _parse_features.pyx. E.g. "1011" -> int("1101", 2) == 13.
            entry['cluster'] = int(cluster[::-1], 2)
            orth_senses = set()
            orth_senses.update(senses[word.lower()][None])
            for pos in [NOUN, VERB, ADJ]:
                for lemma in lemmatizer(word.lower(), pos):
                    orth_senses.update(senses[lemma][pos])
            entry['senses'] = list(sorted(orth_senses))
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))


def main(lang_data_dir, corpora_dir, model_dir):
    model_dir = Path(model_dir)
    lang_data_dir = Path(lang_data_dir)
    corpora_dir = Path(corpora_dir)

    assert corpora_dir.exists()
    assert lang_data_dir.exists()
    if not model_dir.exists():
        model_dir.mkdir()

    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
    setup_vocab(corpora_dir, model_dir / 'vocab')
    if not (model_dir / 'wordnet').exists():
        copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
    ss_probs = read_wordnet.make_supersense_dict(str(corpora_dir / 'wordnet'))
    with codecs.open(str(model_dir / 'wordnet' / 'supersenses.json'), 'w', 'utf8') as file_:
        json.dump(ss_probs, file_)


if __name__ == '__main__':
    plac.call(main)
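

# Sketch of the directory layout this script produces under model_dir,
# inferred from setup_tokenizer(), setup_vocab() and main() above:
#
#     model_dir/
#         tokenizer/   prefix.txt, suffix.txt, infix.txt, morphs.json, specials.json
#         vocab/       lexemes.bin, strings.txt, vec.bin (only if vectors.tgz exists)
#         wordnet/     copy of the WordNet corpus, plus supersenses.json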