* Add new script to replace make_lexicon, that does full setup of data

2025-11-25 04:16:11 +03:00 · 2015-04-08 07:46:53 +02:00 · 2015-04-08 07:46:53 +02:00 · 156b70ed82
commit 156b70ed82
parent e775e05313
1 changed files with 77 additions and 0 deletions
--- a/bin/init_model.py
+++ b/bin/init_model.py
@ -0,0 +1,77 @@
+"""Set up a model dir, given the (committed) lang_data."""
+import plac
+from pathlib import Path
+
+from shutil import copyfile
+import codecs
+
+from spacy.en import get_lex_props
+from spacy.vocab import Vocab
+
+
+def setup_tokenizer(lang_data_dir, tok_dir):
+    """Copy the tokenizer data files from lang_data_dir into tok_dir.
+
+    tok_dir is created if it does not exist yet; its parent directory must
+    already exist. A file that is already present in tok_dir is never
+    overwritten, so re-running the script leaves existing copies alone.
+
+    Both arguments are pathlib.Path-like objects (they must support `/`,
+    `.exists()` and `.mkdir()`).
+    """
+    if not tok_dir.exists():
+        tok_dir.mkdir()
+
+    for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
+                     'suffix.txt'):
+        src = lang_data_dir / filename
+        dst = tok_dir / filename
+        if not dst.exists():
+            # Hand shutil plain strings, like every other file access in this
+            # script: copyfile only accepts path objects on interpreters that
+            # implement the os.PathLike protocol.
+            copyfile(str(src), str(dst))
+
+
+def _read_clusters(loc):
+    """Read a word -> cluster mapping from the UTF-8 text file at loc.
+
+    Every usable line holds exactly three whitespace-separated fields:
+    cluster, word, freq. Lines with any other number of fields (including
+    blank lines) are skipped. The freq field is ignored, and if a word occurs
+    more than once its last cluster wins.
+
+    Returns a dict mapping each word to its cluster, kept as a string.
+    """
+    clusters = {}
+    # The context manager closes the handle even if decoding fails midway;
+    # the bare codecs.open(...) iteration used before leaked it.
+    with codecs.open(str(loc), 'r', 'utf8') as file_:
+        for line in file_:
+            try:
+                cluster, word, freq = line.split()
+            except ValueError:
+                # Deliberate best-effort: skip lines that are not 3 fields.
+                continue
+            clusters[word] = cluster
+    return clusters
+
+
+def _read_probs(loc):
+    """Read a word -> probability mapping from the UTF-8 text file at loc.
+
+    Every non-blank line holds exactly two whitespace-separated fields:
+    prob, word. prob is parsed with float(). Blank lines are skipped; any
+    other line that does not have two fields, or whose prob is not a valid
+    float, raises ValueError. If a word occurs more than once its last
+    probability wins.
+
+    Returns a dict mapping each word to its probability as a float.
+    """
+    probs = {}
+    # Context manager so the file handle is always closed.
+    with codecs.open(str(loc), 'r', 'utf8') as file_:
+        for line in file_:
+            fields = line.split()
+            if not fields:
+                # Tolerate blank lines, e.g. an extra newline at end of file.
+                continue
+            prob, word = fields
+            probs[word] = float(prob)
+    return probs
+
+
+
+def setup_vocab(src_dir, dst_dir):
+    """Build the vocabulary from the data in src_dir and dump it to dst_dir.
+
+    Reads 'clusters.txt' and 'words.sgt.prob' from src_dir, fills a fresh
+    Vocab with one entry per retained word, then writes 'lexemes.bin' and
+    'strings.txt' into dst_dir (created if missing; its parent must exist).
+
+    A word is retained when it has a cluster or its probability is at
+    least -17 (presumably a log-probability cut-off -- TODO confirm).
+    Words are inserted from highest to lowest probability.
+    """
+    if not dst_dir.exists():
+        dst_dir.mkdir()
+
+    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
+    clusters = _read_clusters(src_dir / 'clusters.txt')
+    probs = _read_probs(src_dir / 'words.sgt.prob')
+
+    # Sort ascending, then flip in place, so that ties come out in exactly
+    # the order a reversed ascending sort gives them.
+    by_prob = sorted(probs.items(), key=lambda item: item[1])
+    by_prob.reverse()
+
+    for word, prob in by_prob:
+        entry = get_lex_props(word)
+        keep = word in clusters or float(prob) >= -17
+        if not keep:
+            continue
+        entry['prob'] = float(prob)
+        bits = clusters.get(word, '0')
+        # The bit-string is read back to front (little-endian), so that
+        # `& 15` yields its first 4 bits. See _parse_features.pyx
+        entry['cluster'] = int(bits[::-1], 2)
+        vocab[word] = entry
+
+    vocab.dump(str(dst_dir / 'lexemes.bin'))
+    vocab.strings.dump(str(dst_dir / 'strings.txt'))
+
+
+def main(lang_data_dir, model_dir):
+    """Create model_dir and populate it from the data in lang_data_dir.
+
+    Both arguments are filesystem paths. model_dir is created if it does
+    not exist (its parent must exist); the tokenizer files go into its
+    'tokenizer' sub-directory and the vocabulary into its 'vocab'
+    sub-directory.
+    """
+    source_root = Path(lang_data_dir)
+    target_root = Path(model_dir)
+
+    if not target_root.exists():
+        target_root.mkdir()
+
+    tokenizer_dir = target_root / 'tokenizer'
+    vocab_dir = target_root / 'vocab'
+    setup_tokenizer(source_root, tokenizer_dir)
+    setup_vocab(source_root, vocab_dir)
+
+
+if __name__ == '__main__':
+    # Script entry point: plac fills main's two positional parameters
+    # (lang_data_dir, model_dir) from the command-line arguments.
+    plac.call(main)