* Read in gold WSD (word sense disambiguation) data, as supersenses

This commit is contained in:
Matthew Honnibal 2015-07-03 04:47:23 +02:00
parent c60cc22390
commit 2be517ba6d
2 changed files with 16 additions and 2 deletions

View File

@ -1,6 +1,8 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from .structs cimport TokenC from .structs cimport TokenC
from .typedefs cimport flags_t
from .syntax.transition_system cimport Transition from .syntax.transition_system cimport Transition
cimport numpy cimport numpy
@ -10,6 +12,7 @@ cdef struct GoldParseC:
int* tags int* tags
int* heads int* heads
int* labels int* labels
flags_t* ssenses
int** brackets int** brackets
Transition* ner Transition* ner
@ -25,6 +28,7 @@ cdef class GoldParse:
cdef readonly list heads cdef readonly list heads
cdef readonly list labels cdef readonly list labels
cdef readonly dict orths cdef readonly dict orths
cdef readonly list ssenses
cdef readonly list ner cdef readonly list ner
cdef readonly list ents cdef readonly list ents
cdef readonly dict brackets cdef readonly dict brackets

View File

@ -9,6 +9,8 @@ from os import path
from libc.string cimport memset from libc.string cimport memset
from .typedefs cimport flags_t
def tags_to_entities(tags): def tags_to_entities(tags):
entities = [] entities = []
@ -153,7 +155,7 @@ def read_json_file(loc, docs_filter=None):
if labels[-1].lower() == 'root': if labels[-1].lower() == 'root':
labels[-1] = 'ROOT' labels[-1] = 'ROOT'
ner.append(token.get('ner', '-')) ner.append(token.get('ner', '-'))
wsd.append(token.get('senses', [])) wsd.append(token.get('ssenses', []))
sents.append(( sents.append((
(ids, words, tags, heads, labels, ner, wsd), (ids, words, tags, heads, labels, ner, wsd),
sent.get('brackets', []))) sent.get('brackets', [])))
@ -204,6 +206,7 @@ cdef class GoldParse:
self.c.tags = <int*>self.mem.alloc(len(tokens), sizeof(int)) self.c.tags = <int*>self.mem.alloc(len(tokens), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(tokens), sizeof(int)) self.c.heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
self.c.labels = <int*>self.mem.alloc(len(tokens), sizeof(int)) self.c.labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
self.c.ssenses = <flags_t*>self.mem.alloc(len(tokens), sizeof(flags_t))
self.c.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition)) self.c.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
self.c.brackets = <int**>self.mem.alloc(len(tokens), sizeof(int*)) self.c.brackets = <int**>self.mem.alloc(len(tokens), sizeof(int*))
for i in range(len(tokens)): for i in range(len(tokens)):
@ -213,21 +216,27 @@ cdef class GoldParse:
self.heads = [None] * len(tokens) self.heads = [None] * len(tokens)
self.labels = [''] * len(tokens) self.labels = [''] * len(tokens)
self.ner = ['-'] * len(tokens) self.ner = ['-'] * len(tokens)
self.ssenses = [[] for _ in range(len(tokens))]
self.cand_to_gold = align([t.orth_ for t in tokens], annot_tuples[1]) self.cand_to_gold = align([t.orth_ for t in tokens], annot_tuples[1])
self.gold_to_cand = align(annot_tuples[1], [t.orth_ for t in tokens]) self.gold_to_cand = align(annot_tuples[1], [t.orth_ for t in tokens])
self.orig_annot = zip(*annot_tuples) self.orig_annot = zip(*annot_tuples)
# Iterate over the candidate tokens: i is the index of a word in the
# candidate, and gold_i is its aligned index in the gold sentence (None if
# unaligned). Assign the tag, label, NER tag and word senses from the gold.
# The gold head is an index into the gold sentence, so it has to be
# translated into a candidate index.
for i, gold_i in enumerate(self.cand_to_gold): for i, gold_i in enumerate(self.cand_to_gold):
if gold_i is None: if gold_i is None:
# TODO: What do we do for missing values again? # Missing values handled in the various oracle functions
pass pass
else: else:
self.tags[i] = annot_tuples[2][gold_i] self.tags[i] = annot_tuples[2][gold_i]
self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]] self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]]
self.labels[i] = annot_tuples[4][gold_i] self.labels[i] = annot_tuples[4][gold_i]
self.ner[i] = annot_tuples[5][gold_i] self.ner[i] = annot_tuples[5][gold_i]
self.ssenses[i] = annot_tuples[6][gold_i]
# If we have any non-projective arcs, i.e. crossing brackets, consider # If we have any non-projective arcs, i.e. crossing brackets, consider
# the heads for those words missing in the gold-standard. # the heads for those words missing in the gold-standard.
@ -246,6 +255,7 @@ cdef class GoldParse:
self.labels[w1] = '' self.labels[w1] = ''
self.heads[w2] = None self.heads[w2] = None
self.labels[w2] = '' self.labels[w2] = ''
self.ssenses[w2] = []
# Check there are no cycles in the dependencies, i.e. we are a tree # Check there are no cycles in the dependencies, i.e. we are a tree
for w in range(self.length): for w in range(self.length):