"""Set up a model directory.

Requires:

    lang_data --- Rules for the tokenizer
        * prefix.txt
        * suffix.txt
        * infix.txt
        * morphs.json
        * specials.json

    corpora --- Data files
        * WordNet
        * words.sgt.prob --- Smoothed unigram probabilities
        * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
        * vectors.tgz --- output of something like word2vec
"""
import plac
from pathlib import Path

from shutil import copyfile
from shutil import copytree
import codecs
from collections import defaultdict

from spacy.en import get_lex_props
from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV

import spacy.senses


def setup_tokenizer(lang_data_dir, tok_dir):
    """Copy the tokenizer rule files from ``lang_data_dir`` into ``tok_dir``.

    Both arguments are ``pathlib.Path`` objects. ``tok_dir`` is created if it
    does not exist yet. Files that are already present in ``tok_dir`` are left
    untouched, so re-running the script does not overwrite them.
    """
    if not tok_dir.exists():
        tok_dir.mkdir()

    for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
                     'suffix.txt'):
        src = lang_data_dir / filename
        dst = tok_dir / filename
        if not dst.exists():
            copyfile(str(src), str(dst))


def _read_clusters(loc):
    """Read a ``cluster word freq`` file and return a ``{word: cluster}`` dict.

    Lines that do not split into exactly three whitespace-separated fields are
    skipped. After reading, every word is also registered under its lower-,
    title- and upper-cased forms, unless that form already has an entry.
    """
    clusters = {}
    with codecs.open(str(loc), 'r', 'utf8') as file_:
        for line in file_:
            try:
                cluster, word, freq = line.split()
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its cluster
            # is unreliable.
            if int(freq) >= 3:
                clusters[word] = cluster
            else:
                clusters[word] = '0'
    # Expand clusters with re-casing. Iterate over a snapshot: the loop adds
    # keys to ``clusters``, and mutating a dict while iterating over its live
    # items view raises RuntimeError on Python 3.
    for word, cluster in list(clusters.items()):
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters


def _read_probs(loc):
    """Read a ``prob word`` file and return a ``{word: float(prob)}`` dict.

    Blank lines are ignored. Any other line that does not consist of exactly
    two whitespace-separated fields raises ``ValueError``.
    """
    probs = {}
    with codecs.open(str(loc), 'r', 'utf8') as file_:
        for line in file_:
            if not line.strip():
                continue
            prob, word = line.split()
            probs[word] = float(prob)
    return probs


def _read_senses(loc):
    """Read a WordNet-style sense index into ``{lemma: {pos: [sense ids]}}``.

    Each non-blank line must have four whitespace-separated fields, the first
    being a sense key of the form
    ``lemma%ss_type:lex_filenum:lex_id:head_word:head_id``. ``ss_type`` selects
    the part of speech; types mapped to ``None`` below are skipped. The value
    stored for each sense is ``int(lex_filenum) + 1``.
    """
    lexicon = defaultdict(lambda: defaultdict(list))
    # Indexed by ss_type; entries 0 and 5 have no corresponding tag here.
    pos_tags = [None, NOUN, VERB, ADJ, ADV, None]
    with codecs.open(str(loc), 'r', 'utf8') as file_:
        for line in file_:
            if not line.strip():
                continue
            sense_key, synset_offset, sense_number, tag_cnt = line.split()
            lemma, lex_sense = sense_key.split('%')
            ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
            pos = pos_tags[int(ss_type)]
            if pos is not None:
                # NOTE(review): the +1 offset presumably reserves 0 for
                # "no sense" -- confirm against spacy.senses.
                lexicon[lemma][pos].append(int(lex_filenum) + 1)
    return lexicon


def setup_vocab(src_dir, dst_dir):
    """Build the vocabulary from the corpora in ``src_dir`` and dump it.

    Reads ``clusters.txt``, ``wordnet/index.sense`` and ``words.sgt.prob``
    from ``src_dir`` and writes ``lexemes.bin`` and ``strings.txt`` to
    ``dst_dir`` (created if missing). If ``vectors.tgz`` exists in
    ``src_dir``, it is also converted to ``vec.bin``. Both arguments are
    ``pathlib.Path`` objects.
    """
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    senses = _read_senses(src_dir / 'wordnet' / 'index.sense')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    # Words known only from the cluster or sense data get a default value so
    # that they still receive a lexicon entry.
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0
    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    # Insert in descending order of probability. reversed(sorted(...)) is kept
    # deliberately: sorted(..., reverse=True) would order ties differently and
    # so change the insertion order.
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
            orth_senses = set()
            for pos in [NOUN, VERB, ADJ]:
                for lemma in lemmatizer(word.lower(), pos):
                    orth_senses.update(senses[lemma][pos])
            entry['senses'] = list(sorted(orth_senses))
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))


def main(lang_data_dir, corpora_dir, model_dir):
    """Create model_dir from the tokenizer rules and corpora data files."""
    model_dir = Path(model_dir)
    lang_data_dir = Path(lang_data_dir)
    corpora_dir = Path(corpora_dir)

    # Explicit checks rather than assert, which is stripped under ``python -O``.
    if not corpora_dir.exists():
        raise IOError("Corpora directory not found: %s" % corpora_dir)
    if not lang_data_dir.exists():
        raise IOError("Language data directory not found: %s" % lang_data_dir)

    if not model_dir.exists():
        model_dir.mkdir()

    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
    setup_vocab(corpora_dir, model_dir / 'vocab')
    if not (model_dir / 'wordnet').exists():
        copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))


if __name__ == '__main__':
    plac.call(main)