"""
Get a mapping of (lemma, pos, sense_number)-->supersense, and a mapping of
(lemma, pos, onto_group)-->[(lemma, pos, sense_number)].

Then we can read the OntoNotes token-->(lemma, onto_group) annotations, and
resolve them to the token-->supersense annotations we want to train from.

Glossary:

supersense: A WordNet lexical file name, e.g. noun.animal.
sense_number: A WordNet sense number, as listed in WordNet's index.sense file.
lex_filenum: A WordNet lexical file number, which identifies a supersense.
onto_group: An OntoNotes sense grouping, which dominates zero or more WN senses.
"""
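# Roughly, the data flow (the keys below are illustrative, not real entries):
#
#   og_to_sense:     (lemma, pos, onto_group)   --> [(lemma, pos, sense_number), ...]
#   sense_to_ssense: (lemma, pos, sense_number) --> lex_filenum
#   lexnames:        lex_filenum                --> supersense name, e.g. 5 --> 'noun.animal'
#
# get_og_to_ssenses() below composes the three into
# (lemma, pos, onto_group) --> set of supersense names.
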
from __future__ import division, print_function

from os import path
import os
import re
import codecs


def get_sense_to_ssense(index_dot_sense_loc):
    mapping = {}
    # Indexed by ss_type: WordNet uses 1=noun, 2=verb, 3=adjective, 4=adverb,
    # 5=adjective satellite.
    pos_tags = [None, 'n', 'v', 'j', 'a', 's']
    for line in codecs.open(index_dot_sense_loc, 'r', 'utf8'):
        sense_key, synset_offset, sense_number, tag_cnt = line.split()
        lemma, lex_sense = sense_key.split('%')
        ss_type, lex_filenum, lex_id, head_word, head_id = lex_sense.split(':')
        pos = pos_tags[int(ss_type)]
        mapping[(lemma, pos, int(sense_number))] = int(lex_filenum)
    return mapping

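# A sketch of what get_sense_to_ssense reads. Each index.sense line has the
# form "sense_key synset_offset sense_number tag_cnt", where the sense_key
# packs "lemma%ss_type:lex_filenum:lex_id:head_word:head_id". A made-up but
# format-correct example line:
#
#   dog%1:05:00:: 02084071 1 42
#
# parses to lemma='dog', ss_type=1 (noun), lex_filenum=5, sense_number=1,
# i.e. the entry {('dog', 'n', 1): 5}.

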
sense_group_re = re.compile(r'<sense .*?</sense>', re.DOTALL)
wn_mapping_re = re.compile(r'version="3.0">([^<]+)<')


def get_og_to_sense(sense_inv_dir):
    mapping = {}
    for filename in os.listdir(sense_inv_dir):
        if not filename.endswith('.xml'):
            continue
        if '-' not in filename:
            continue
        lemma, pos = filename.split('-')[:2]
        pos = pos[0]
        # Word is, these often don't validate, because of course. So, just
        # parse with regex...
        xml_str = open(path.join(sense_inv_dir, filename)).read()
        for sense_grouping in sense_group_re.findall(xml_str):
            group_num = sense_grouping.split('n="')[1].split('"')[0]
            if not group_num:
                continue

            group_num = int(float(group_num))
            key = (lemma, pos, group_num)
            mapping.setdefault(key, [])
            wn_elem = wn_mapping_re.search(sense_grouping)
            if wn_elem is not None:
                sense_num_str = wn_elem.groups()[0].replace('.', ',')
                sense_ids = [(lemma, pos, int(n))
                             for n in sense_num_str.strip().split(',')]
                mapping[key].extend(sense_ids)
    return mapping

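# The regexes above target OntoNotes sense-inventory XML roughly of this
# shape (abbreviated and illustrative; the real files carry more attributes
# and child elements):
#
#   <sense n="1" type="Event" name="..." group="1">
#     <mappings>
#       <wn version="3.0">1,2</wn>
#     </mappings>
#   </sense>
#
# sense_group_re grabs each whole <sense> element, the n="..." attribute
# becomes the group number, and wn_mapping_re captures "1,2", the WordNet
# sense numbers this grouping dominates.

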
def get_lexnames(loc):
    names = {}
    for line in open(loc):
        id_, name, syn_type = line.split()
        names[int(id_)] = name
    return names

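# The lexnames file shipped with WordNet maps each two-digit lexical file
# number to its name and a syntactic-category code, one whitespace-separated
# entry per line, e.g.:
#
#   00  adj.all      3
#   05  noun.animal  1
#
# get_lexnames keeps just the number-->name part of that table.

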
def get_og_to_ssenses(wordnet_dir, onto_dir):
    sense_inv_dir = path.join(onto_dir, 'data', 'english', 'metadata',
                              'sense-inventories')
    og_to_sense = get_og_to_sense(sense_inv_dir)
    sense_to_ssense = get_sense_to_ssense(path.join(wordnet_dir, 'index.sense'))
    lexnames = get_lexnames(path.join(wordnet_dir, 'lexnames'))

    mapping = {}
    for key, senses in og_to_sense.items():
        if senses is not None:
            mapping[key] = set([lexnames[sense_to_ssense[s_key]]
                                for s_key in senses if s_key in sense_to_ssense])
    return mapping

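# Composing the tables gives entries such as (illustrative, not checked
# against the real corpora):
#
#   {('dog', 'n', 1): set(['noun.animal'])}
#
# i.e. OntoNotes group 1 of the noun "dog" resolves to the WordNet
# supersense noun.animal.

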
def main(wordnet_dir, onto_dir):
    mapping = get_og_to_ssenses(wordnet_dir, onto_dir)
    print(mapping[('dog', 'v', 1)])
    print(mapping[('dog', 'n', 1)])
    print(mapping[('abandon', 'v', 1)])
    print(mapping[('abandon', 'n', 1)])


if __name__ == '__main__':
    import plac
    plac.call(main)
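# Example invocation via plac, which maps the two positional arguments onto
# main's parameters. The script name and paths below are placeholders for
# your local copies:
#
#   python map_og_to_ssenses.py /path/to/WordNet-3.0/dict /path/to/ontonotes-release-5.0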