mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
* Read in gold WSD (word-sense disambiguation) data, as supersenses
This commit is contained in:
parent
c60cc22390
commit
2be517ba6d
|
@ -1,6 +1,8 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .structs cimport TokenC
|
||||
from .typedefs cimport flags_t
|
||||
|
||||
from .syntax.transition_system cimport Transition
|
||||
|
||||
cimport numpy
|
||||
|
@ -10,6 +12,7 @@ cdef struct GoldParseC:
|
|||
int* tags
|
||||
int* heads
|
||||
int* labels
|
||||
flags_t* ssenses
|
||||
int** brackets
|
||||
Transition* ner
|
||||
|
||||
|
@ -25,6 +28,7 @@ cdef class GoldParse:
|
|||
cdef readonly list heads
|
||||
cdef readonly list labels
|
||||
cdef readonly dict orths
|
||||
cdef readonly list ssenses
|
||||
cdef readonly list ner
|
||||
cdef readonly list ents
|
||||
cdef readonly dict brackets
|
||||
|
|
|
@ -9,6 +9,8 @@ from os import path
|
|||
|
||||
from libc.string cimport memset
|
||||
|
||||
from .typedefs cimport flags_t
|
||||
|
||||
|
||||
def tags_to_entities(tags):
|
||||
entities = []
|
||||
|
@ -153,7 +155,7 @@ def read_json_file(loc, docs_filter=None):
|
|||
if labels[-1].lower() == 'root':
|
||||
labels[-1] = 'ROOT'
|
||||
ner.append(token.get('ner', '-'))
|
||||
wsd.append(token.get('senses', []))
|
||||
wsd.append(token.get('ssenses', []))
|
||||
sents.append((
|
||||
(ids, words, tags, heads, labels, ner, wsd),
|
||||
sent.get('brackets', [])))
|
||||
|
@ -204,6 +206,7 @@ cdef class GoldParse:
|
|||
self.c.tags = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||
self.c.heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||
self.c.labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||
self.c.ssenses = <flags_t*>self.mem.alloc(len(tokens), sizeof(flags_t))
|
||||
self.c.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
|
||||
self.c.brackets = <int**>self.mem.alloc(len(tokens), sizeof(int*))
|
||||
for i in range(len(tokens)):
|
||||
|
@ -213,21 +216,27 @@ cdef class GoldParse:
|
|||
self.heads = [None] * len(tokens)
|
||||
self.labels = [''] * len(tokens)
|
||||
self.ner = ['-'] * len(tokens)
|
||||
self.ssenses = [[] for _ in range(len(tokens))]
|
||||
|
||||
self.cand_to_gold = align([t.orth_ for t in tokens], annot_tuples[1])
|
||||
self.gold_to_cand = align(annot_tuples[1], [t.orth_ for t in tokens])
|
||||
|
||||
self.orig_annot = zip(*annot_tuples)
|
||||
|
||||
# This iterates i over 0...n-1 for the n words in the candidate, with an index
|
||||
# gold_i aligned into the gold. Assign tag, label, ner and word sense.
|
||||
# For the head, the value is an index into the gold sentence, so we
|
||||
# have to translate it across into the candidate.
|
||||
for i, gold_i in enumerate(self.cand_to_gold):
|
||||
if gold_i is None:
|
||||
# TODO: What do we do for missing values again?
|
||||
# Missing values handled in the various oracle functions
|
||||
pass
|
||||
else:
|
||||
self.tags[i] = annot_tuples[2][gold_i]
|
||||
self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]]
|
||||
self.labels[i] = annot_tuples[4][gold_i]
|
||||
self.ner[i] = annot_tuples[5][gold_i]
|
||||
self.ssenses[i] = annot_tuples[6][gold_i]
|
||||
|
||||
# If we have any non-projective arcs, i.e. crossing brackets, consider
|
||||
# the heads for those words missing in the gold-standard.
|
||||
|
@ -246,6 +255,7 @@ cdef class GoldParse:
|
|||
self.labels[w1] = ''
|
||||
self.heads[w2] = None
|
||||
self.labels[w2] = ''
|
||||
self.ssenses[w2] = []
|
||||
|
||||
# Check there are no cycles in the dependencies, i.e. we are a tree
|
||||
for w in range(self.length):
|
||||
|
|
Loading…
Reference in New Issue
Block a user