mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-25 19:33:42 +03:00
* Read in gold wsd data, as supersenses
This commit is contained in:
parent
c60cc22390
commit
2be517ba6d
|
@ -1,6 +1,8 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .structs cimport TokenC
|
from .structs cimport TokenC
|
||||||
|
from .typedefs cimport flags_t
|
||||||
|
|
||||||
from .syntax.transition_system cimport Transition
|
from .syntax.transition_system cimport Transition
|
||||||
|
|
||||||
cimport numpy
|
cimport numpy
|
||||||
|
@ -10,6 +12,7 @@ cdef struct GoldParseC:
|
||||||
int* tags
|
int* tags
|
||||||
int* heads
|
int* heads
|
||||||
int* labels
|
int* labels
|
||||||
|
flags_t* ssenses
|
||||||
int** brackets
|
int** brackets
|
||||||
Transition* ner
|
Transition* ner
|
||||||
|
|
||||||
|
@ -25,6 +28,7 @@ cdef class GoldParse:
|
||||||
cdef readonly list heads
|
cdef readonly list heads
|
||||||
cdef readonly list labels
|
cdef readonly list labels
|
||||||
cdef readonly dict orths
|
cdef readonly dict orths
|
||||||
|
cdef readonly list ssenses
|
||||||
cdef readonly list ner
|
cdef readonly list ner
|
||||||
cdef readonly list ents
|
cdef readonly list ents
|
||||||
cdef readonly dict brackets
|
cdef readonly dict brackets
|
||||||
|
|
|
@ -9,6 +9,8 @@ from os import path
|
||||||
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
|
||||||
|
from .typedefs cimport flags_t
|
||||||
|
|
||||||
|
|
||||||
def tags_to_entities(tags):
|
def tags_to_entities(tags):
|
||||||
entities = []
|
entities = []
|
||||||
|
@ -153,7 +155,7 @@ def read_json_file(loc, docs_filter=None):
|
||||||
if labels[-1].lower() == 'root':
|
if labels[-1].lower() == 'root':
|
||||||
labels[-1] = 'ROOT'
|
labels[-1] = 'ROOT'
|
||||||
ner.append(token.get('ner', '-'))
|
ner.append(token.get('ner', '-'))
|
||||||
wsd.append(token.get('senses', []))
|
wsd.append(token.get('ssenses', []))
|
||||||
sents.append((
|
sents.append((
|
||||||
(ids, words, tags, heads, labels, ner, wsd),
|
(ids, words, tags, heads, labels, ner, wsd),
|
||||||
sent.get('brackets', [])))
|
sent.get('brackets', [])))
|
||||||
|
@ -204,6 +206,7 @@ cdef class GoldParse:
|
||||||
self.c.tags = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
self.c.tags = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||||
self.c.heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
self.c.heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||||
self.c.labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
self.c.labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||||
|
self.c.ssenses = <flags_t*>self.mem.alloc(len(tokens), sizeof(flags_t))
|
||||||
self.c.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
|
self.c.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
|
||||||
self.c.brackets = <int**>self.mem.alloc(len(tokens), sizeof(int*))
|
self.c.brackets = <int**>self.mem.alloc(len(tokens), sizeof(int*))
|
||||||
for i in range(len(tokens)):
|
for i in range(len(tokens)):
|
||||||
|
@ -213,21 +216,27 @@ cdef class GoldParse:
|
||||||
self.heads = [None] * len(tokens)
|
self.heads = [None] * len(tokens)
|
||||||
self.labels = [''] * len(tokens)
|
self.labels = [''] * len(tokens)
|
||||||
self.ner = ['-'] * len(tokens)
|
self.ner = ['-'] * len(tokens)
|
||||||
|
self.ssenses = [[] for _ in range(len(tokens))]
|
||||||
|
|
||||||
self.cand_to_gold = align([t.orth_ for t in tokens], annot_tuples[1])
|
self.cand_to_gold = align([t.orth_ for t in tokens], annot_tuples[1])
|
||||||
self.gold_to_cand = align(annot_tuples[1], [t.orth_ for t in tokens])
|
self.gold_to_cand = align(annot_tuples[1], [t.orth_ for t in tokens])
|
||||||
|
|
||||||
self.orig_annot = zip(*annot_tuples)
|
self.orig_annot = zip(*annot_tuples)
|
||||||
|
|
||||||
|
# This iterates 0...n for n words in the candidate, with an index
|
||||||
|
# gold_i aligned into the gold. Assign tag, label, ner and word sense.
|
||||||
|
# For the head, the value is an index into the gold sentence, so we
|
||||||
|
# have to translate it across into the candidate.
|
||||||
for i, gold_i in enumerate(self.cand_to_gold):
|
for i, gold_i in enumerate(self.cand_to_gold):
|
||||||
if gold_i is None:
|
if gold_i is None:
|
||||||
# TODO: What do we do for missing values again?
|
# Missing values handled in the various oracle functions
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
self.tags[i] = annot_tuples[2][gold_i]
|
self.tags[i] = annot_tuples[2][gold_i]
|
||||||
self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]]
|
self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]]
|
||||||
self.labels[i] = annot_tuples[4][gold_i]
|
self.labels[i] = annot_tuples[4][gold_i]
|
||||||
self.ner[i] = annot_tuples[5][gold_i]
|
self.ner[i] = annot_tuples[5][gold_i]
|
||||||
|
self.ssenses[i] = annot_tuples[6][gold_i]
|
||||||
|
|
||||||
# If we have any non-projective arcs, i.e. crossing brackets, consider
|
# If we have any non-projective arcs, i.e. crossing brackets, consider
|
||||||
# the heads for those words missing in the gold-standard.
|
# the heads for those words missing in the gold-standard.
|
||||||
|
@ -246,6 +255,7 @@ cdef class GoldParse:
|
||||||
self.labels[w1] = ''
|
self.labels[w1] = ''
|
||||||
self.heads[w2] = None
|
self.heads[w2] = None
|
||||||
self.labels[w2] = ''
|
self.labels[w2] = ''
|
||||||
|
self.ssenses[w2] = []
|
||||||
|
|
||||||
# Check there are no cycles in the dependencies, i.e. we are a tree
|
# Check there are no cycles in the dependencies, i.e. we are a tree
|
||||||
for w in range(self.length):
|
for w in range(self.length):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user