* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization.

2025-07-21 13:39:48 +03:00 · 2015-07-01 18:48:59 +02:00 · 2015-07-01 18:48:59 +02:00 · 62cfcd76fe
commit 62cfcd76fe
parent aa3d06857e
1 changed files with 36 additions and 1 deletions
--- a/bin/init_model.py
+++ b/bin/init_model.py
@ -21,11 +21,17 @@ from pathlib import Path
 from shutil import copyfile
 from shutil import copytree
 import codecs
+from collections import defaultdict

 from spacy.en import get_lex_props
+from spacy.en.lemmatizer import Lemmatizer
 from spacy.vocab import Vocab
 from spacy.vocab import write_binary_vectors

+from spacy.parts_of_speech import NOUN, VERB, ADJ
+
+import spacy.senses
+

 def setup_tokenizer(lang_data_dir, tok_dir):
    if not tok_dir.exists():
@ -72,6 +78,22 @@ def _read_probs(loc):
    return probs


+def _read_senses(loc):
+    """Read the supersense lexicon stored at `loc`.
+
+    Each non-blank line holds a word followed by whitespace-separated sense
+    tags. The first three characters of every tag are dropped and the rest is
+    split on '.' into a part of speech ('noun', 'verb' or 'adjective') and a
+    sense, which are combined into a name such as 'N_animal'.
+
+    Returns a nested defaultdict mapping word -> part-of-speech id -> list of
+    sense ids, where a sense id is the index of the sense name in
+    spacy.senses.STRINGS. Tags that resolve to 'N_tops' are skipped.
+
+    Raises KeyError if a sense name or part of speech is not recognised, and
+    ValueError if a tag does not contain exactly one '.' after the prefix.
+    """
+    lexicon = defaultdict(lambda: defaultdict(list))
+    sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
+    pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
+    # Open via a context manager so the handle is closed even on error;
+    # iterating over a bare codecs.open() call leaked the file object.
+    with codecs.open(str(loc), 'r', 'utf8') as file_:
+        for line in file_:
+            sense_strings = line.split()
+            # A blank line yields no fields; pop(0) would raise IndexError.
+            if not sense_strings:
+                continue
+            word = sense_strings.pop(0)
+            for sense in sense_strings:
+                # NOTE(review): presumably every tag carries a fixed
+                # 3-character prefix -- confirm against supersenses.txt.
+                pos, sense = sense[3:].split('.')
+                sense_name = '%s_%s' % (pos[0].upper(), sense.lower())
+                if sense_name != 'N_tops':
+                    sense_id = sense_names[sense_name]
+                    lexicon[word][pos_ids[pos]].append(sense_id)
+    return lexicon
+
+
 def setup_vocab(src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()
@ -81,10 +103,12 @@ def setup_vocab(src_dir, dst_dir):
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
+    senses = _read_senses(src_dir / 'supersenses.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
-    for word in clusters:
+    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0
+    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    lexicon = []
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
@ -94,6 +118,17 @@ def setup_vocab(src_dir, dst_dir):
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
+            orth_senses = set()
+            lemmas = []
+            for pos in [NOUN, VERB, ADJ]:
+                for lemma in lemmatizer(word.lower(), pos):
+                    lemmas.append(lemma)
+                    orth_senses.update(senses[lemma][pos])
+            if word.lower() == 'dogging':
+                print word
+                print lemmas
+                print [spacy.senses.STRINGS[si] for si in orth_senses]
+            entry['senses'] = list(sorted(orth_senses))
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))