spaCy/bin/init_model.py

"""Set up a model directory.

Requires:

    lang_data --- Rules for the tokenizer
        * prefix.txt
        * suffix.txt
        * infix.txt
        * morphs.json
        * specials.json

    corpora --- Data files
        * WordNet
        * words.sgt.prob --- Smoothed unigram probabilities
        * freqs.txt --- Tab-separated frequency counts, used as a fallback
          when words.sgt.prob is missing
        * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
        * vectors.tgz --- output of something like word2vec
"""
from __future__ import unicode_literals

from ast import literal_eval
import math
import gzip
import json

import plac
from pathlib import Path

from shutil import copyfile
from shutil import copytree
import codecs
from collections import defaultdict

from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.strings import hash_string
from preshed.counter import PreshCounter

from spacy.parts_of_speech import NOUN, VERB, ADJ

import spacy.en
import spacy.de
import spacy.fi
import spacy.it


def setup_tokenizer(lang_data_dir, tok_dir):
    """Copy the tokenizer rule files from lang_data_dir into tok_dir.

    tok_dir is created first when it does not exist yet. Both arguments are
    path objects (they must support ``/``, ``exists()`` and ``mkdir()``).
    A missing source file makes ``copyfile`` raise.
    """
    rule_files = ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
                  'suffix.txt')
    if not tok_dir.exists():
        tok_dir.mkdir()
    for name in rule_files:
        copyfile(str(lang_data_dir / name), str(tok_dir / name))


def _read_clusters(loc):
    if not loc.exists():
        print("Warning: Clusters file not found")
        return {}
    clusters = {}
    for line in codecs.open(str(loc), 'r', 'utf8'):
        try:
            cluster, word, freq = line.split()
        except ValueError:
            continue
        # If the clusterer has only seen the word a few times, its cluster is
        # unreliable.
        if int(freq) >= 3:
            clusters[word] = cluster
        else:
            clusters[word] = '0'
    # Expand clusters with re-casing
    for word, cluster in list(clusters.items()):
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters


def _read_probs(loc):
    if not loc.exists():
        print("Probabilities file not found. Trying freqs.")
        return {}, 0.0
    probs = {}
    for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
        prob, word = line.split()
        prob = float(prob)
        probs[word] = prob
    return probs, probs['-OOV-']


def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
    counts = PreshCounter()
    total = 0
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc))
    else:
        file_ = loc.open()
    for i, line in enumerate(file_):
        freq, doc_freq, key = line.split('\t', 2)
        freq = int(freq)
        counts.inc(i+1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    for line in loc.open():
        freq, doc_freq, key = line.split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            log_smooth_count = math.log(smooth_count)
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob


def _read_senses(loc):
    lexicon = defaultdict(lambda: defaultdict(list))
    if not loc.exists():
        print("Warning: WordNet senses not found")
        return lexicon
    sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
    pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
    for line in codecs.open(str(loc), 'r', 'utf8'):
        sense_strings = line.split()
        word = sense_strings.pop(0)
        for sense in sense_strings:
            pos, sense = sense[3:].split('.')
            sense_name = '%s_%s' % (pos[0].upper(), sense.lower())
            if sense_name != 'N_tops':
                sense_id = sense_names[sense_name]
                lexicon[word][pos_ids[pos]].append(sense_id)
    return lexicon


def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
    """Build the vocabulary files in dst_dir from the corpora in src_dir.

    Steps, in order:
      * create dst_dir if needed;
      * convert src_dir/vectors.tgz to dst_dir/vec.bin when present;
      * read clusters.txt and words.sgt.prob (falling back to freqs.txt);
      * give every clustered word that has no probability the OOV value;
      * create one lexeme per word, highest probability first, setting its
        prob, is_oov and cluster;
      * write lexemes.bin, strings.txt and oov_prob into dst_dir.

    get_lex_attr and tag_map are passed straight through to ``Vocab``.
    src_dir and dst_dir are path objects.
    """
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_loc = src_dir / 'vectors.tgz'
    if not vectors_loc.exists():
        print("Warning: Word vectors file not found")
    else:
        write_binary_vectors(str(vectors_loc), str(dst_dir / 'vec.bin'))

    vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
    # The OOV value returned by the readers is replaced here: the lowest
    # known probability, or -20 when no probabilities were found at all.
    oov_prob = min(probs.values()) if probs else -20
    for word in clusters:
        probs.setdefault(word, oov_prob)

    # Ascending sort, then walked backwards, so the most probable words are
    # entered into the vocab first.
    ascending = sorted(probs.items(), key=lambda item: item[1])
    for word, prob in reversed(ascending):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        lexeme.cluster = int(clusters[word][::-1], 2) if word in clusters else 0

    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
    with (dst_dir / 'oov_prob').open('w') as file_:
        file_.write('%f' % oov_prob)


def main(lang_id, lang_data_dir, corpora_dir, model_dir):
    """Set up the model directory for one language.

    lang_id selects the language ('en', 'de', 'fi' or 'it') and names the
    sub-directory used under both lang_data_dir and corpora_dir. model_dir is
    created if needed and receives:

      * tokenizer/ -- the tokenizer rule files;
      * vocab/ -- the lexicon files, plus gazetteer.json, tag_map.json and
        lemma_rules.json when they exist in the language data;
      * wordnet/ -- a copy of <corpora>/wordnet/dict, unless model_dir
        already has a wordnet directory.

    Raises AssertionError when the per-language corpora or language data
    directory is missing, and KeyError for an unsupported lang_id.
    """
    languages = {
        'en': spacy.en.English.default_lex_attrs(),
        'de': spacy.de.German.default_lex_attrs(),
        'fi': spacy.fi.Finnish.default_lex_attrs(),
        'it': spacy.it.Italian.default_lex_attrs(),
    }

    model_dir = Path(model_dir)
    lang_data_dir = Path(lang_data_dir) / lang_id
    corpora_dir = Path(corpora_dir) / lang_id

    assert corpora_dir.exists()
    assert lang_data_dir.exists()

    if not model_dir.exists():
        model_dir.mkdir()

    # Close the tag map file once it has been parsed instead of leaving the
    # handle to the garbage collector.
    with (lang_data_dir / 'tag_map.json').open() as file_:
        tag_map = json.load(file_)
    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
    setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab')

    # Optional language resources: copied into vocab/ only when present.
    for filename in ('gazetteer.json', 'tag_map.json', 'lemma_rules.json'):
        src = lang_data_dir / filename
        if src.exists():
            copyfile(str(src), str(model_dir / 'vocab' / filename))

    if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists():
        copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))


# Script entry point: hand main to plac, which supplies its arguments from
# the command line.
if __name__ == '__main__':
    plac.call(main)
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00			`"""Set up a model directory.`

			`Requires:`

			`lang_data --- Rules for the tokenizer`
			`* prefix.txt`
			`* suffix.txt`
			`* infix.txt`
			`* morphs.json`
			`* specials.json`

			`corpora --- Data files`
			`* WordNet`
			`* words.sgt.prob --- Smoothed unigram probabilities`
			`* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters`
			`* vectors.tgz --- output of something like word2vec`
			`"""`
* Fix structure of wordnet directory for init_model 2015-07-23 07:35:38 +03:00			`from __future__ import unicode_literals`

* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00			`from ast import literal_eval`
* Fix init_model 2015-07-25 23:56:35 +03:00			`import math`
* Support gzipped frequencies in init_model 2015-07-26 23:39:22 +03:00			`import gzip`
* Update init model 2015-09-06 18:51:30 +03:00			`import json`
* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`import plac`
			`from pathlib import Path`

			`from shutil import copyfile`
* Fix copying of tokenizer data in init_model 2015-04-12 05:45:31 +03:00			`from shutil import copytree`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`import codecs`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`from collections import defaultdict`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00
			`from spacy.vocab import Vocab`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00			`from spacy.vocab import write_binary_vectors`
* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00			`from spacy.strings import hash_string`
			`from preshed.counter import PreshCounter`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`from spacy.parts_of_speech import NOUN, VERB, ADJ`

* Cut down init_model to work on more languages 2015-08-24 02:05:20 +03:00			`import spacy.en`
			`import spacy.de`
* Add link for Finnish model 2015-08-27 11:26:02 +03:00			`import spacy.fi`
* Update init model 2015-09-06 18:51:30 +03:00			`import spacy.it`
* Cut down init_model to work on more languages 2015-08-24 02:05:20 +03:00


* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00
			`def setup_tokenizer(lang_data_dir, tok_dir):`
			`if not tok_dir.exists():`
			`tok_dir.mkdir()`

			`for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',`
			`'suffix.txt'):`
			`src = lang_data_dir / filename`
			`dst = tok_dir / filename`
* Ensure data files are copied for tokenizer in init_model 2015-07-26 02:36:19 +03:00			`copyfile(str(src), str(dst))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00

			`def _read_clusters(loc):`
* Update init_model, making language resources optional 2015-07-22 01:25:14 +03:00			`if not loc.exists():`
* Fix structure of wordnet directory for init_model 2015-07-23 07:35:38 +03:00			`print("Warning: Clusters file not found")`
* Update init_model, making language resources optional 2015-07-22 01:25:14 +03:00			`return {}`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`clusters = {}`
			`for line in codecs.open(str(loc), 'r', 'utf8'):`
			`try:`
			`cluster, word, freq = line.split()`
			`except ValueError:`
			`continue`
* Exclude clusterings for words only seen 1 or 2 times, as their clusters are unreliable 2015-04-17 05:44:52 +03:00			`# If the clusterer has only seen the word a few times, its cluster is`
			`# unreliable.`
			`if int(freq) >= 3:`
			`clusters[word] = cluster`
* Add cluster=0 by default in init_model 2015-04-29 15:23:13 +03:00			`else:`
			`clusters[word] = '0'`
* Add case expansion to Brown clusters 2015-05-31 06:50:50 +03:00			`# Expand clusters with re-casing`
Py3 compatibility tweak 2015-07-23 14:13:15 +03:00			`for word, cluster in list(clusters.items()):`
* Add case expansion to Brown clusters 2015-05-31 06:50:50 +03:00			`if word.lower() not in clusters:`
			`clusters[word.lower()] = cluster`
			`if word.title() not in clusters:`
			`clusters[word.title()] = cluster`
* Fix cluster initialization 2015-05-31 16:21:28 +03:00			`if word.upper() not in clusters:`
* Add case expansion to Brown clusters 2015-05-31 06:50:50 +03:00			`clusters[word.upper()] = cluster`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`return clusters`


			`def _read_probs(loc):`
* Update init_model, making language resources optional 2015-07-22 01:25:14 +03:00			`if not loc.exists():`
* Support gzipped frequencies in init_model 2015-07-26 23:39:22 +03:00			`print("Probabilities file not found. Trying freqs.")`
* Fix init_model 2015-07-26 00:33:02 +03:00			`return {}, 0.0`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`probs = {}`
			`for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):`
			`prob, word = line.split()`
			`prob = float(prob)`
			`probs[word] = prob`
* Fix init_model 2015-07-26 00:33:02 +03:00			`return probs, probs['-OOV-']`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00

* Tighten the frequency filter in init_model 2015-07-27 22:44:51 +03:00			`def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):`
* Fix init_model 2015-07-25 23:54:08 +03:00			`if not loc.exists():`
			`print("Warning: Frequencies file not found")`
* Fix init_model for travis tests 2015-07-26 15:03:30 +03:00			`return {}, 0.0`
* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00			`counts = PreshCounter()`
			`total = 0`
* Support gzipped frequencies in init_model 2015-07-26 23:39:22 +03:00			`if str(loc).endswith('gz'):`
			`file_ = gzip.open(str(loc))`
			`else:`
			`file_ = loc.open()`
			`for i, line in enumerate(file_):`
* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00			`freq, doc_freq, key = line.split('\t', 2)`
			`freq = int(freq)`
* Fix init_model 2015-07-25 23:56:35 +03:00			`counts.inc(i+1, freq)`
* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00			`total += freq`
			`counts.smooth()`
			`log_total = math.log(total)`
			`probs = {}`
* Fix init_model 2015-07-25 23:56:35 +03:00			`for line in loc.open():`
* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00			`freq, doc_freq, key = line.split('\t', 2)`
* Make heuristics more explicit in init_model 2015-07-26 01:22:19 +03:00			`doc_freq = int(doc_freq)`
			`freq = int(freq)`
			`if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:`
* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00			`word = literal_eval(key)`
			`smooth_count = counts.smoother(int(freq))`
			`log_smooth_count = math.log(smooth_count)`
			`probs[word] = math.log(smooth_count) - log_total`
* Pass OOV probability around 2015-07-26 00:29:51 +03:00			`oov_prob = math.log(counts.smoother(0)) - log_total`
			`return probs, oov_prob`
* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00

* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`def _read_senses(loc):`
			`lexicon = defaultdict(lambda: defaultdict(list))`
* Update init_model, making language resources optional 2015-07-22 01:25:14 +03:00			`if not loc.exists():`
* Fix structure of wordnet directory for init_model 2015-07-23 07:35:38 +03:00			`print("Warning: WordNet senses not found")`
* Update init_model, making language resources optional 2015-07-22 01:25:14 +03:00			`return lexicon`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 19:48:59 +03:00			`sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))`
			`pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}`
			`for line in codecs.open(str(loc), 'r', 'utf8'):`
			`sense_strings = line.split()`
			`word = sense_strings.pop(0)`
			`for sense in sense_strings:`
			`pos, sense = sense[3:].split('.')`
			`sense_name = '%s_%s' % (pos[0].upper(), sense.lower())`
			`if sense_name != 'N_tops':`
			`sense_id = sense_names[sense_name]`
			`lexicon[word][pos_ids[pos]].append(sense_id)`
			`return lexicon`


* Update init model 2015-09-06 18:51:30 +03:00			`def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`if not dst_dir.exists():`
			`dst_dir.mkdir()`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00
			`vectors_src = src_dir / 'vectors.tgz'`
			`if vectors_src.exists():`
			`write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))`
* Update init_model, making language resources optional 2015-07-22 01:25:14 +03:00			`else:`
* Fix structure of wordnet directory for init_model 2015-07-23 07:35:38 +03:00			`print("Warning: Word vectors file not found")`
* Update init model 2015-09-06 18:51:30 +03:00			`vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map)`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`clusters = _read_clusters(src_dir / 'clusters.txt')`
* Pass OOV probability around 2015-07-26 00:29:51 +03:00			`probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')`
* Add read_freqs function in init_model 2015-07-25 23:16:36 +03:00			`if not probs:`
* Tighten the frequency filter in init_model 2015-07-27 22:44:51 +03:00			`probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')`
* Add cluster words to probs in init_model 2015-07-23 10:27:07 +03:00			`if not probs:`
* Cut down init_model to work on more languages 2015-08-24 02:05:20 +03:00			`oov_prob = -20`
* Add cluster words to probs in init_model 2015-07-23 10:27:07 +03:00			`else:`
* Pass OOV probability around 2015-07-26 00:29:51 +03:00			`oov_prob = min(probs.values())`
* Add cluster words to probs in init_model 2015-07-23 10:27:07 +03:00			`for word in clusters:`
			`if word not in probs:`
* Pass OOV probability around 2015-07-26 00:29:51 +03:00			`probs[word] = oov_prob`
* Add cluster words to probs in init_model 2015-07-23 10:27:07 +03:00
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`lexicon = []`
Py3 compatibility tweak 2015-07-23 10:45:15 +03:00			`for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):`
* Cut down init_model to work on more languages 2015-08-24 02:05:20 +03:00			`lexeme = vocab[word]`
			`lexeme.prob = prob`
			`lexeme.is_oov = False`
* Pass OOV probability around 2015-07-26 00:29:51 +03:00			`# Decode as a little-endian string, so that we can do & 15 to get`
			`# the first 4 bits. See _parse_features.pyx`
* Cut down init_model to work on more languages 2015-08-24 02:05:20 +03:00			`if word in clusters:`
			`lexeme.cluster = int(clusters[word][::-1], 2)`
			`else:`
			`lexeme.cluster = 0`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`vocab.dump(str(dst_dir / 'lexemes.bin'))`
			`vocab.strings.dump(str(dst_dir / 'strings.txt'))`
* Pass OOV probability around 2015-07-26 00:29:51 +03:00			`with (dst_dir / 'oov_prob').open('w') as file_:`
			`file_.write('%f' % oov_prob)`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00

* Cut down init_model to work on more languages 2015-08-24 02:05:20 +03:00			`def main(lang_id, lang_data_dir, corpora_dir, model_dir):`
			`languages = {`
* Refactor init_model to accomodate other languages 2015-08-26 20:14:05 +03:00			`'en': spacy.en.English.default_lex_attrs(),`
* Rename Deutsch to German 2015-09-06 21:18:58 +03:00			`'de': spacy.de.German.default_lex_attrs(),`
* Update init model 2015-09-06 18:51:30 +03:00			`'fi': spacy.fi.Finnish.default_lex_attrs(),`
			`'it': spacy.it.Italian.default_lex_attrs(),`
* Cut down init_model to work on more languages 2015-08-24 02:05:20 +03:00			`}`

* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`model_dir = Path(model_dir)`
* Cut down init_model to work on more languages 2015-08-24 02:05:20 +03:00			`lang_data_dir = Path(lang_data_dir) / lang_id`
			`corpora_dir = Path(corpora_dir) / lang_id`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 09:20:15 +03:00
			`assert corpora_dir.exists()`
			`assert lang_data_dir.exists()`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00
			`if not model_dir.exists():`
			`model_dir.mkdir()`

* Update init model 2015-09-06 18:51:30 +03:00			`tag_map = json.load((lang_data_dir / 'tag_map.json').open())`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00			`setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')`
* Update init model 2015-09-06 18:51:30 +03:00			`setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab')`
* Copy gazetteer file in init_model 2015-08-06 17:07:23 +03:00
			`if (lang_data_dir / 'gazetteer.json').exists():`
			`copyfile(str(lang_data_dir / 'gazetteer.json'),`
			`str(model_dir / 'vocab' / 'gazetteer.json'))`
* Add link for Finnish model 2015-08-27 11:26:02 +03:00
* Copy tag_map.json in init_model 2015-09-12 06:54:02 +03:00			`if (lang_data_dir / 'tag_map.json').exists():`
			`copyfile(str(lang_data_dir / 'tag_map.json'),`
			`str(model_dir / 'vocab' / 'tag_map.json'))`

* Add link for Finnish model 2015-08-27 11:26:02 +03:00			`if (lang_data_dir / 'lemma_rules.json').exists():`
			`copyfile(str(lang_data_dir / 'lemma_rules.json'),`
			`str(model_dir / 'vocab' / 'lemma_rules.json'))`

* Cut down init_model to work on more languages 2015-08-24 02:05:20 +03:00			`if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists():`
* Fix structure of wordnet directory for init_model 2015-07-23 07:35:38 +03:00			`copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 08:46:53 +03:00

			`if __name__ == '__main__':`
			`plac.call(main)`