* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization.

Matthew Honnibal 2015-07-01 18:48:59 +02:00
parent aa3d06857e
commit 62cfcd76fe


@@ -21,11 +21,17 @@ from pathlib import Path
 from shutil import copyfile
 from shutil import copytree
 import codecs
+from collections import defaultdict
 
 from spacy.en import get_lex_props
+from spacy.en.lemmatizer import Lemmatizer
 from spacy.vocab import Vocab
 from spacy.vocab import write_binary_vectors
+from spacy.parts_of_speech import NOUN, VERB, ADJ
+import spacy.senses
 
 
 def setup_tokenizer(lang_data_dir, tok_dir):
     if not tok_dir.exists():
@@ -72,6 +78,22 @@ def _read_probs(loc):
     return probs
 
 
+def _read_senses(loc):
+    lexicon = defaultdict(lambda: defaultdict(list))
+    sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
+    pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
+    for line in codecs.open(str(loc), 'r', 'utf8'):
+        sense_strings = line.split()
+        word = sense_strings.pop(0)
+        for sense in sense_strings:
+            pos, sense = sense[3:].split('.')
+            sense_name = '%s_%s' % (pos[0].upper(), sense.lower())
+            if sense_name != 'N_tops':
+                sense_id = sense_names[sense_name]
+                lexicon[word][pos_ids[pos]].append(sense_id)
+    return lexicon
+
+
 def setup_vocab(src_dir, dst_dir):
     if not dst_dir.exists():
         dst_dir.mkdir()
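
Note on _read_senses: it builds a two-level mapping, word -> POS id -> list of supersense ids, where each id is the index of the supersense name in spacy.senses.STRINGS. A standalone sketch of the same parsing step follows (Python 3 here, though the committed script is Python 2; the input format, a word followed by tags like 'ss=noun.animal' whose 3-character prefix is stripped by sense[3:], is inferred from the code rather than documented):

    from collections import defaultdict

    def parse_senses_line(line):
        # Assumed line shape: 'word ss=pos.sense ss=pos.sense ...'
        parts = line.split()
        word = parts.pop(0)
        by_pos = defaultdict(list)
        for tag in parts:
            pos, sense = tag[3:].split('.')   # strip the assumed 'ss=' prefix
            by_pos[pos].append('%s_%s' % (pos[0].upper(), sense.lower()))
        return word, by_pos

    word, by_pos = parse_senses_line('dog ss=noun.animal ss=verb.motion')
    print(word, dict(by_pos))   # dog {'noun': ['N_animal'], 'verb': ['V_motion']}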
@@ -81,10 +103,12 @@ def setup_vocab(src_dir, dst_dir):
         write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
     vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
     clusters = _read_clusters(src_dir / 'clusters.txt')
+    senses = _read_senses(src_dir / 'supersenses.txt')
     probs = _read_probs(src_dir / 'words.sgt.prob')
-    for word in clusters:
+    for word in set(clusters).union(set(senses)):
         if word not in probs:
             probs[word] = -17.0
+    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
     lexicon = []
     for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
         entry = get_lex_props(word)
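
Note: iterating over set(clusters).union(set(senses)) rather than clusters alone means a word that appears only in supersenses.txt still gets a lexeme entry, with -17.0 as what appears to be a floor log-probability for words missing from words.sgt.prob. A toy illustration with made-up values:

    # Made-up data: 'frisbee' has senses but no cluster, 'cat' the reverse.
    clusters = {'dog': '1010', 'cat': '1011'}
    senses = {'dog': {}, 'frisbee': {}}
    probs = {'dog': -8.2}
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0   # fallback for words with no estimated prob
    print(sorted(probs.items()))  # cat and frisbee both get -17.0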
@@ -94,6 +118,17 @@ def setup_vocab(src_dir, dst_dir):
             # Decode as a little-endian string, so that we can do & 15 to get
             # the first 4 bits. See _parse_features.pyx
             entry['cluster'] = int(cluster[::-1], 2)
+            orth_senses = set()
+            lemmas = []
+            for pos in [NOUN, VERB, ADJ]:
+                for lemma in lemmatizer(word.lower(), pos):
+                    lemmas.append(lemma)
+                    orth_senses.update(senses[lemma][pos])
+            if word.lower() == 'dogging':
+                print word
+                print lemmas
+                print [spacy.senses.STRINGS[si] for si in orth_senses]
+            entry['senses'] = list(sorted(orth_senses))
             vocab[word] = entry
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))
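
Note on the per-word lookup: each surface form is lemmatized under NOUN, VERB and ADJ, and the supersense sets of all resulting lemmas are unioned into entry['senses']; the bare print statements are Python 2 debug output for the form 'dogging'. A self-contained sketch of the lookup shape, with a toy lemmatizer standing in for spaCy's WordNet-backed one and made-up sense ids:

    from collections import defaultdict

    NOUN, VERB, ADJ = 1, 2, 3                 # placeholder POS ids
    senses = defaultdict(lambda: defaultdict(list))
    senses['dog'][NOUN] = [3]                 # e.g. the id of N_animal
    senses['dog'][VERB] = [41]                # e.g. the id of V_motion

    def lemmatize(word, pos):                 # toy rule, not spaCy's lemmatizer
        if pos == VERB and word.endswith('ging'):
            return [word[:-4]]                # 'dogging' -> 'dog'
        return [word]

    orth_senses = set()
    for pos in [NOUN, VERB, ADJ]:
        for lemma in lemmatize('dogging', pos):
            orth_senses.update(senses[lemma][pos])
    print(sorted(orth_senses))                # [41]: the verb sense via 'dog'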