spaCy/bin/init_model.py

"""Set up a model directory.
Requires:
lang_data --- Rules for the tokenizer
* prefix.txt
* suffix.txt
* infix.txt
* morphs.json
* specials.json
corpora --- Data files
* WordNet
* words.sgt.prob --- Smoothed unigram probabilities
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
* vectors.tgz --- output of something like word2vec
"""
import plac
from pathlib import Path
from shutil import copyfile
from shutil import copytree
import codecs
from collections import defaultdict
import json
from spacy.en import get_lex_props
from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV
import spacy.senses
from spacy.munge import read_wordnet


def setup_tokenizer(lang_data_dir, tok_dir):
    if not tok_dir.exists():
        tok_dir.mkdir()

    for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
                     'suffix.txt'):
        src = lang_data_dir / filename
        dst = tok_dir / filename
        if not dst.exists():
            copyfile(str(src), str(dst))
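

# _read_clusters() below assumes the three-column, whitespace-separated output
# of a Brown-style clusterer ("e.g. Brown clusters" per the module docstring):
# a cluster bit-string, the word, and the word's frequency. Illustrative
# (made-up) example line:
#
#     0010110    apple    74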


def _read_clusters(loc):
    clusters = {}
    for line in codecs.open(str(loc), 'r', 'utf8'):
        try:
            cluster, word, freq = line.split()
        except ValueError:
            continue
        # If the clusterer has only seen the word a few times, its cluster is
        # unreliable.
        if int(freq) >= 3:
            clusters[word] = cluster
        else:
            clusters[word] = '0'
    # Expand clusters with re-casing
    for word, cluster in clusters.items():
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters
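

# _read_probs() below assumes one whitespace-separated "probability word" pair
# per line of words.sgt.prob. Judging by the -17.0 floor used in setup_vocab(),
# the values are smoothed log probabilities. Illustrative (made-up) line:
#
#     -8.3142    apple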


def _read_probs(loc):
    probs = {}
    for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
        prob, word = line.split()
        prob = float(prob)
        probs[word] = prob
    return probs
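

# _read_senses() below parses WordNet's index.sense file, where each line is
# "sense_key synset_offset sense_number tag_cnt" and the sense key itself
# encodes "lemma%ss_type:lex_filenum:lex_id:head_word:head_id". Only the lemma,
# the synset type and the lexicographer file number are used here.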


def _read_senses(loc):
    lexicon = defaultdict(lambda: defaultdict(list))
    pos_tags = [None, NOUN, VERB, ADJ, None, None]
    for line in codecs.open(str(loc), 'r', 'utf8'):
        sense_key, synset_offset, sense_number, tag_cnt = line.split()
        lemma, lex_sense = sense_key.split('%')
        ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
        pos = pos_tags[int(ss_type)]
        lexicon[lemma][pos].append(int(lex_filenum) + 1)
    return lexicon


def setup_vocab(src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))

    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    senses = _read_senses(src_dir / 'wordnet' / 'index.sense')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    # Words that only appear in the clusters or senses files get a low default
    # log probability, so they still make it into the lexicon below.
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0

    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    lexicon = []
    # Add entries in order of decreasing probability, keeping any word that has
    # a cluster, a WordNet sense, or a probability at or above the -17 floor.
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or word in senses or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string (first character = lowest bit),
            # so that we can do & 15 to get the first 4 bits of the path.
            # See _parse_features.pyx. E.g. "1011" -> int("1101", 2) == 13.
            entry['cluster'] = int(cluster[::-1], 2)
            orth_senses = set()
            orth_senses.update(senses[word.lower()][None])
            for pos in [NOUN, VERB, ADJ]:
                for lemma in lemmatizer(word.lower(), pos):
                    orth_senses.update(senses[lemma][pos])
            entry['senses'] = list(sorted(orth_senses))
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))


def main(lang_data_dir, corpora_dir, model_dir):
    model_dir = Path(model_dir)
    lang_data_dir = Path(lang_data_dir)
    corpora_dir = Path(corpora_dir)

    assert corpora_dir.exists()
    assert lang_data_dir.exists()
    if not model_dir.exists():
        model_dir.mkdir()

    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
    setup_vocab(corpora_dir, model_dir / 'vocab')
    if not (model_dir / 'wordnet').exists():
        copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
    ss_probs = read_wordnet.make_supersense_dict(str(corpora_dir / 'wordnet'))
    with codecs.open(str(model_dir / 'wordnet' / 'supersenses.json'), 'w', 'utf8') as file_:
        json.dump(ss_probs, file_)


if __name__ == '__main__':
    plac.call(main)
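

# Sketch of the directory layout this script produces under model_dir,
# inferred from setup_tokenizer(), setup_vocab() and main() above:
#
#     model_dir/
#         tokenizer/   prefix.txt, suffix.txt, infix.txt, morphs.json, specials.json
#         vocab/       lexemes.bin, strings.txt, vec.bin (only if vectors.tgz exists)
#         wordnet/     copy of the WordNet corpus, plus supersenses.json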