"""Set up a model directory. Requires: lang_data --- Rules for the tokenizer * prefix.txt * suffix.txt * infix.txt * morphs.json * specials.json corpora --- Data files * WordNet * words.sgt.prob --- Smoothed unigram probabilities * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters * vectors.tgz --- output of something like word2vec """ import plac from pathlib import Path from shutil import copyfile from shutil import copytree import codecs from collections import defaultdict import json from spacy.en import get_lex_props from spacy.en.lemmatizer import Lemmatizer from spacy.vocab import Vocab from spacy.vocab import write_binary_vectors from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV import spacy.senses from spacy.munge import read_wordnet def setup_tokenizer(lang_data_dir, tok_dir): if not tok_dir.exists(): tok_dir.mkdir() for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json', 'suffix.txt'): src = lang_data_dir / filename dst = tok_dir / filename if not dst.exists(): copyfile(str(src), str(dst)) def _read_clusters(loc): clusters = {} for line in codecs.open(str(loc), 'r', 'utf8'): try: cluster, word, freq = line.split() except ValueError: continue # If the clusterer has only seen the word a few times, its cluster is # unreliable. if int(freq) >= 3: clusters[word] = cluster else: clusters[word] = '0' # Expand clusters with re-casing for word, cluster in clusters.items(): if word.lower() not in clusters: clusters[word.lower()] = cluster if word.title() not in clusters: clusters[word.title()] = cluster if word.upper() not in clusters: clusters[word.upper()] = cluster return clusters def _read_probs(loc): probs = {} for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')): prob, word = line.split() prob = float(prob) probs[word] = prob return probs def _read_senses(loc): lexicon = defaultdict(lambda: defaultdict(list)) pos_tags = [None, NOUN, VERB, ADJ, None, None] for line in codecs.open(str(loc), 'r', 'utf8'): sense_key, synset_offset, sense_number, tag_cnt = line.split() lemma, lex_sense = sense_key.split('%') ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':') pos = pos_tags[int(ss_type)] lexicon[lemma][pos].append(int(lex_filenum) + 1) return lexicon def setup_vocab(src_dir, dst_dir): if not dst_dir.exists(): dst_dir.mkdir() vectors_src = src_dir / 'vectors.tgz' if vectors_src.exists(): write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) vocab = Vocab(data_dir=None, get_lex_props=get_lex_props) clusters = _read_clusters(src_dir / 'clusters.txt') senses = _read_senses(src_dir / 'wordnet' / 'index.sense') probs = _read_probs(src_dir / 'words.sgt.prob') for word in set(clusters).union(set(senses)): if word not in probs: probs[word] = -17.0 lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ) lexicon = [] for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])): entry = get_lex_props(word) if word in clusters or word in senses or float(prob) >= -17: entry['prob'] = float(prob) cluster = clusters.get(word, '0') # Decode as a little-endian string, so that we can do & 15 to get # the first 4 bits. 
            # See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
            # Collect WordNet sense file numbers: those stored under the None
            # POS for the surface form, plus those for each lemma the word
            # can reduce to as a noun, verb or adjective.
            orth_senses = set()
            orth_senses.update(senses[word.lower()][None])
            for pos in [NOUN, VERB, ADJ]:
                for lemma in lemmatizer(word.lower(), pos):
                    orth_senses.update(senses[lemma][pos])
            entry['senses'] = sorted(orth_senses)
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))


def main(lang_data_dir, corpora_dir, model_dir):
    model_dir = Path(model_dir)
    lang_data_dir = Path(lang_data_dir)
    corpora_dir = Path(corpora_dir)

    assert corpora_dir.exists()
    assert lang_data_dir.exists()

    if not model_dir.exists():
        model_dir.mkdir()

    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
    setup_vocab(corpora_dir, model_dir / 'vocab')

    if not (model_dir / 'wordnet').exists():
        copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
    ss_probs = read_wordnet.make_supersense_dict(str(corpora_dir / 'wordnet'))
    with codecs.open(str(model_dir / 'wordnet' / 'supersenses.json'),
                     'w', 'utf8') as file_:
        json.dump(ss_probs, file_)


if __name__ == '__main__':
    plac.call(main)
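
# Usage sketch (the directory names below are illustrative, not required by
# the script):
#
#     python init_model.py lang_data/en corpora/en models/en
#
# plac maps the three positional command-line arguments onto main()'s
# lang_data_dir, corpora_dir and model_dir parameters, in that order.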