spaCy/bin/init_model.py

"""Set up a model directory.

Requires:

    lang_data --- Rules for the tokenizer
        * prefix.txt
        * suffix.txt
        * infix.txt
        * morphs.json
        * specials.json

    corpora --- Data files
        * WordNet
        * words.sgt.prob --- Smoothed unigram probabilities
        * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
        * vectors.tgz --- output of something like word2vec
"""
import plac
from pathlib import Path

from shutil import copyfile
from shutil import copytree
import codecs
from collections import defaultdict

from spacy.en import get_lex_props
from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors

from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV

import spacy.senses


def setup_tokenizer(lang_data_dir, tok_dir):
    """Copy the tokenizer rule files into the model's tokenizer directory.

    ``tok_dir`` is created when it does not exist yet. Each of the five rule
    files is then copied over from ``lang_data_dir``; a file that is already
    present in ``tok_dir`` is left exactly as it is.

    Both arguments are path objects supporting ``/``, ``exists()`` and
    ``mkdir()``.
    """
    rule_files = ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
                  'suffix.txt')

    if not tok_dir.exists():
        tok_dir.mkdir()

    for name in rule_files:
        target = tok_dir / name
        # Never overwrite a file that is already in place.
        if target.exists():
            continue
        copyfile(str(lang_data_dir / name), str(target))


def _read_clusters(loc):
    clusters = {}
    for line in codecs.open(str(loc), 'r', 'utf8'):
        try:
            cluster, word, freq = line.split()
        except ValueError:
            continue
        # If the clusterer has only seen the word a few times, its cluster is
        # unreliable.
        if int(freq) >= 3:
            clusters[word] = cluster
        else:
            clusters[word] = '0'
    # Expand clusters with re-casing
    for word, cluster in clusters.items():
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters


def _read_probs(loc):
    probs = {}
    for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
        prob, word = line.split()
        prob = float(prob)
        probs[word] = prob
    return probs


def _read_senses(loc):
    """Read a sense index into a ``lemma -> pos -> [lex_filenum]`` mapping.

    Each line must hold four whitespace-separated fields, of which only the
    first (the sense key) is used. The sense key has the form
    ``lemma%ss_type:lex_filenum:lex_id:head_word:head_id``. ``ss_type``
    values 1-4 select NOUN, VERB, ADJ and ADV respectively; values 0 and 5
    select no part of speech and such lines are ignored.
    NOTE(review): the layout looks like WordNet's ``index.sense`` file, where
    ss_type 5 would be "adjective satellite" -- confirm that dropping it is
    intended.

    Args:
        loc: Location of the sense index; converted with ``str()`` before
            opening. The file is decoded as UTF-8.

    Returns:
        A nested ``defaultdict``: ``lexicon[lemma][pos]`` is the list of
        integer ``lex_filenum`` values seen for that lemma and part of
        speech. Looking up a missing lemma or pos yields (and inserts) an
        empty entry.

    Raises:
        ValueError: If a line or sense key does not have the expected number
            of fields, or a numeric field is not an integer.
        IndexError: If ``ss_type`` is greater than 5.
    """
    lexicon = defaultdict(lambda: defaultdict(list))
    # Indexed by ss_type; None marks sense types that are not recorded.
    pos_tags = [None, NOUN, VERB, ADJ, ADV, None]
    # The context manager closes the file even when parsing fails.
    with codecs.open(str(loc), 'r', 'utf8') as file_:
        for line in file_:
            sense_key, synset_offset, sense_number, tag_cnt = line.split()
            lemma, lex_sense = sense_key.split('%')
            ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
            pos = pos_tags[int(ss_type)]
            if pos is not None:
                lexicon[lemma][pos].append(int(lex_filenum))
    return lexicon


def setup_vocab(src_dir, dst_dir):
    """Build the vocabulary files in ``dst_dir`` from the data in ``src_dir``.

    Reads ``clusters.txt``, ``wordnet/index.sense`` and ``words.sgt.prob``
    from ``src_dir``, assembles one lexical entry per retained word, and
    dumps the result to ``lexemes.bin`` and ``strings.txt`` in ``dst_dir``
    (created if it does not exist). If ``src_dir/vectors.tgz`` exists it is
    handed to ``write_binary_vectors`` to produce ``vec.bin``.

    A word is retained when it has a cluster entry or its probability is at
    least -17. Words that appear in the clusters or senses but have no
    probability are given -17.0.

    Both arguments are path objects supporting ``/``, ``exists()`` and
    ``mkdir()``.
    """
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    senses = _read_senses(src_dir / 'wordnet' / 'index.sense')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    # Make sure every word with a cluster or a sense entry has a probability,
    # so that it is visited by the loop below.
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0
    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    # Visit words from most to least probable. reversed(sorted(...)) is kept
    # as-is: sorted(..., reverse=True) would order equal-probability words
    # differently, changing the order in which entries are added.
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
            # Collect the sense ids of every lemma of the lower-cased word,
            # over the three parts of speech the lemmatizer is called with.
            orth_senses = set()
            for pos in [NOUN, VERB, ADJ]:
                for lemma in lemmatizer(word.lower(), pos):
                    orth_senses.update(senses[lemma][pos])
            entry['senses'] = list(sorted(orth_senses))
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))


def main(lang_data_dir, corpora_dir, model_dir):
    """Set up a model directory from language data and corpora.

    Creates ``model_dir`` if needed, then fills its ``tokenizer`` and
    ``vocab`` sub-directories and copies ``corpora_dir/wordnet`` to
    ``model_dir/wordnet`` unless that copy already exists.

    Args:
        lang_data_dir: Directory holding the tokenizer rule files.
        corpora_dir: Directory holding the corpus-derived data files.
        model_dir: Output directory for the model.

    Raises:
        IOError: If ``corpora_dir`` or ``lang_data_dir`` does not exist.
    """
    model_dir = Path(model_dir)
    lang_data_dir = Path(lang_data_dir)
    corpora_dir = Path(corpora_dir)

    # Explicit checks instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would let a bad path slip through and fail
    # later with a less helpful error.
    if not corpora_dir.exists():
        raise IOError("Corpora directory does not exist: %s" % corpora_dir)
    if not lang_data_dir.exists():
        raise IOError("Language data directory does not exist: %s"
                      % lang_data_dir)

    if not model_dir.exists():
        model_dir.mkdir()

    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
    setup_vocab(corpora_dir, model_dir / 'vocab')
    if not (model_dir / 'wordnet').exists():
        copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))


# Script entry point: hand main() to plac, which supplies its three
# positional arguments from the command line.
if __name__ == '__main__':
    plac.call(main)
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00			`"""Set up a model directory.`

			`Requires:`

			`lang_data --- Rules for the tokenizer`
			`* prefix.txt`
			`* suffix.txt`
			`* infix.txt`
			`* morphs.json`
			`* specials.json`

			`corpora --- Data files`
			`* WordNet`
			`* words.sgt.prob --- Smoothed unigram probabilities`
			`* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters`
			`* vectors.tgz --- output of something like word2vec`
			`"""`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`import plac`
			`from pathlib import Path`

			`from shutil import copyfile`
* Fix copying of tokenizer data in init_model 2015-04-12 05:45:31 +03:00			`from shutil import copytree`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`import codecs`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`from collections import defaultdict`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00
			`from spacy.en import get_lex_props`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`from spacy.en.lemmatizer import Lemmatizer`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`from spacy.vocab import Vocab`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00			`from spacy.vocab import write_binary_vectors`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00
* Fix init_model to read supersenses from wordnet, not pre-computed supersenses file 2015-07-03 14:28:39 +03:00			`from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00
			`import spacy.senses`

* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00
			`def setup_tokenizer(lang_data_dir, tok_dir):`
			`if not tok_dir.exists():`
			`tok_dir.mkdir()`

			`for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',`
			`'suffix.txt'):`
			`src = lang_data_dir / filename`
			`dst = tok_dir / filename`
			`if not dst.exists():`
* Fix copying of tokenizer data in init_model 2015-04-12 05:45:31 +03:00			`copyfile(str(src), str(dst))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00

			`def _read_clusters(loc):`
			`clusters = {}`
			`for line in codecs.open(str(loc), 'r', 'utf8'):`
			`try:`
			`cluster, word, freq = line.split()`
			`except ValueError:`
			`continue`
* Exclude clusterings for words only seen 1 or 2 times, as their clusters are unreliable 2015-04-17 05:44:52 +03:00			`# If the clusterer has only seen the word a few times, its cluster is`
			`# unreliable.`
			`if int(freq) >= 3:`
			`clusters[word] = cluster`
* Add cluster=0 by default in init_model 2015-04-29 15:23:13 +03:00			`else:`
			`clusters[word] = '0'`
* Add case expansion to Brown clusters 2015-05-31 06:50:50 +03:00			`# Expand clusters with re-casing`
			`for word, cluster in clusters.items():`
			`if word.lower() not in clusters:`
			`clusters[word.lower()] = cluster`
			`if word.title() not in clusters:`
			`clusters[word.title()] = cluster`
* Fix cluster initialization 2015-05-31 16:21:28 +03:00			`if word.upper() not in clusters:`
* Add case expansion to Brown clusters 2015-05-31 06:50:50 +03:00			`clusters[word.upper()] = cluster`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`return clusters`


			`def _read_probs(loc):`
			`probs = {}`
			`for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):`
			`prob, word = line.split()`
			`prob = float(prob)`
			`probs[word] = prob`
			`return probs`


* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`def _read_senses(loc):`
			`lexicon = defaultdict(lambda: defaultdict(list))`
* Fix init_model to read supersenses from wordnet, not pre-computed supersenses file 2015-07-03 14:28:39 +03:00			`pos_tags = [None, NOUN, VERB, ADJ, ADV, None]`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`for line in codecs.open(str(loc), 'r', 'utf8'):`
* Fix init_model to read supersenses from wordnet, not pre-computed supersenses file 2015-07-03 14:28:39 +03:00			`sense_key, synset_offset, sense_number, tag_cnt = line.split()`
			`lemma, lex_sense = sense_key.split('%')`
			`ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')`
			`pos = pos_tags[int(ss_type)]`
			`if pos is not None:`
			`lexicon[lemma][pos].append(int(lex_filenum))`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`return lexicon`


* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`def setup_vocab(src_dir, dst_dir):`
			`if not dst_dir.exists():`
			`dst_dir.mkdir()`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00
			`vectors_src = src_dir / 'vectors.tgz'`
			`if vectors_src.exists():`
			`write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)`
			`clusters = _read_clusters(src_dir / 'clusters.txt')`
* Fix init_model to read supersenses from wordnet, not pre-computed supersenses file 2015-07-03 14:28:39 +03:00			`senses = _read_senses(src_dir / 'wordnet' / 'index.sense')`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`probs = _read_probs(src_dir / 'words.sgt.prob')`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`for word in set(clusters).union(set(senses)):`
* Ensure words in Brown clusters make it into the vocab, even if they're not in our probs list 2015-05-31 06:46:16 +03:00			`if word not in probs:`
			`probs[word] = -17.0`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`lexicon = []`
			`for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):`
			`entry = get_lex_props(word)`
			`if word in clusters or float(prob) >= -17:`
			`entry['prob'] = float(prob)`
			`cluster = clusters.get(word, '0')`
			`# Decode as a little-endian string, so that we can do & 15 to get`
			`# the first 4 bits. See _parse_features.pyx`
			`entry['cluster'] = int(cluster[::-1], 2)`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`orth_senses = set()`
			`lemmas = []`
			`for pos in [NOUN, VERB, ADJ]:`
			`for lemma in lemmatizer(word.lower(), pos):`
			`lemmas.append(lemma)`
			`orth_senses.update(senses[lemma][pos])`
			`entry['senses'] = list(sorted(orth_senses))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`vocab[word] = entry`
			`vocab.dump(str(dst_dir / 'lexemes.bin'))`
			`vocab.strings.dump(str(dst_dir / 'strings.txt'))`


* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00			`def main(lang_data_dir, corpora_dir, model_dir):`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`model_dir = Path(model_dir)`
			`lang_data_dir = Path(lang_data_dir)`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00			`corpora_dir = Path(corpora_dir)`

			`assert corpora_dir.exists()`
			`assert lang_data_dir.exists()`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00
			`if not model_dir.exists():`
			`model_dir.mkdir()`

			`setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00			`setup_vocab(corpora_dir, model_dir / 'vocab')`
			`if not (model_dir / 'wordnet').exists():`
			`copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00

			`if __name__ == '__main__':`
			`plac.call(main)`