* Fix POS and dependency label tag names. Add parse and string navigation functions.

This commit is contained in:
Matthew Honnibal 2015-01-24 17:29:04 +11:00
parent cb6a526fcd
commit a97bed9359
6 changed files with 64 additions and 10 deletions

View File

@ -252,7 +252,7 @@ cdef class EnPosTagger:
scores = self.model.score(context) scores = self.model.score(context)
tokens.data[i].tag = arg_max(scores, self.model.n_classes) tokens.data[i].tag = arg_max(scores, self.model.n_classes)
self.set_morph(i, tokens.data) self.set_morph(i, tokens.data)
tokens.pos_scheme = self.tag_map tokens._tag_strings = self.tag_names
def train(self, Tokens tokens, object golds): def train(self, Tokens tokens, object golds):
cdef int i cdef int i

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
from ._state cimport State from ._state cimport State
from ._state cimport has_head, get_idx, get_s0, get_n0 from ._state cimport has_head, get_idx, get_s0, get_n0
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
@ -106,12 +108,14 @@ cdef class TransitionSystem:
self.label_ids = {'ROOT': 0} self.label_ids = {'ROOT': 0}
cdef int label_id cdef int label_id
for label_str in left_labels: for label_str in left_labels:
label_str = unicode(label_str)
label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
moves[i].move = LEFT moves[i].move = LEFT
moves[i].label = label_id moves[i].label = label_id
moves[i].clas = i moves[i].clas = i
i += 1 i += 1
for label_str in right_labels: for label_str in right_labels:
label_str = unicode(label_str)
label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
moves[i].move = RIGHT moves[i].move = RIGHT
moves[i].label = label_id moves[i].label = label_id

View File

@ -79,6 +79,10 @@ cdef class GreedyParser:
scores = self.model.score(context) scores = self.model.score(context)
guess = self.moves.best_valid(scores, state) guess = self.moves.best_valid(scores, state)
self.moves.transition(state, &guess) self.moves.transition(state, &guess)
# Messily tell Tokens object the string names of the dependency labels
tokens._dep_strings = [None] * len(self.moves.label_ids)
for label, id_ in self.moves.label_ids.items():
tokens._dep_strings[id_] = label
return 0 return 0
def train_sent(self, Tokens tokens, list gold_heads, list gold_labels): def train_sent(self, Tokens tokens, list gold_heads, list gold_labels):

View File

@ -1,6 +1,7 @@
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from numpy cimport ndarray from numpy cimport ndarray
cimport numpy
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t from thinc.typedefs cimport atom_t
@ -29,11 +30,13 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
cdef class Tokens: cdef class Tokens:
cdef Pool mem cdef Pool mem
cdef Vocab vocab cdef Vocab vocab
cdef list tag_names
cdef dict pos_scheme
cdef TokenC* data cdef TokenC* data
cdef unicode _string cdef unicode _string
cdef list _tag_strings
cdef list _dep_strings
cdef int length cdef int length
cdef int max_length cdef int max_length

View File

@ -1,5 +1,4 @@
# cython: embedsignature=True # cython: embedsignature=True
from cython.view cimport array as cvarray
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter from preshed.counter cimport PreshCounter
@ -9,6 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA from .typedefs cimport LEMMA
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA from .typedefs cimport POS, LEMMA
from .typedefs import UNIV_TAG_NAMES
from unidecode import unidecode from unidecode import unidecode
@ -84,6 +84,8 @@ cdef class Tokens:
self.data = data_start + PADDING self.data = data_start + PADDING
self.max_length = size self.max_length = size
self.length = 0 self.length = 0
self._tag_strings = [] # These will be set by the POS tagger and parser
self._dep_strings = [] # The strings are arbitrary and model-specific.
def sentences(self): def sentences(self):
cdef int i cdef int i
@ -148,7 +150,7 @@ cdef class Tokens:
return idx + t.lex.length return idx + t.lex.length
@cython.boundscheck(False) @cython.boundscheck(False)
cpdef long[:,:] to_array(self, object attr_ids): cpdef long[:,:] to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray """Given a list of M attribute IDs, export the tokens to a numpy ndarray
of shape N*M, where N is the length of the sentence. of shape N*M, where N is the length of the sentence.
@ -162,8 +164,11 @@ cdef class Tokens:
""" """
cdef int i, j cdef int i, j
cdef attr_id_t feature cdef attr_id_t feature
cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)), cdef numpy.ndarray[long, ndim=2] output
itemsize=sizeof(long), format="l") # Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef numpy.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int)
for i in range(self.length): for i in range(self.length):
for j, feature in enumerate(attr_ids): for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.data[i], feature) output[i, j] = get_token_attr(&self.data[i], feature)
@ -232,6 +237,7 @@ cdef class Token:
self.sentiment = t.lex.sentiment self.sentiment = t.lex.sentiment
self.flags = t.lex.flags self.flags = t.lex.flags
self.lemma = t.lemma self.lemma = t.lemma
self.pos = t.pos
self.tag = t.tag self.tag = t.tag
self.dep = t.dep self.dep = t.dep
self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec) self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
@ -248,6 +254,24 @@ cdef class Token:
""" """
return self._seq.data[self.i].lex.length return self._seq.data[self.i].lex.length
def nbor(self, int i=1):
    """Return the token `i` positions away from this one in the same sequence.

    `i` defaults to 1 (the following token); a negative `i` yields a lower
    index. This method itself performs no range check on the resulting index.
    """
    neighbour_index = self.i + i
    return Token(self._seq, neighbour_index)
def child(self, int i=1):
    """Return a child of this token, selected by a signed position `i`.

    * `i == 0` returns this token itself.
    * `i >= 1` selects the `i`-th entry among the right children recorded
      in the token's `r_kids` bit field.
    * `i <= -1` selects the `|i|`-th entry among the left children recorded
      in the token's `l_kids` bit field.

    Returns None when the token has no children on the requested side.
    """
    cdef const TokenC* t = &self._seq.data[self.i]
    cdef uint32_t kids
    cdef int n
    if i == 0:
        return self
    if i >= 1:
        kids = t.r_kids
        n = i
    else:
        kids = t.l_kids
        # Pass the magnitude: the helper treats every n < 1 like n == 1, so
        # forwarding the negative `i` would always pick the same child.
        n = -i
    if kids == 0:
        return None
    # NOTE(review): _nth_significant_bit returns a bit position and also uses
    # 0 to report "fewer than n bits set". Confirm whether the bit position is
    # meant as an absolute token index or as an offset from self.i, and
    # whether asking for more children than exist should return None instead.
    return Token(self._seq, _nth_significant_bit(kids, n))
property head: property head:
"""The token predicted by the parser to be the head of the current token.""" """The token predicted by the parser to be the head of the current token."""
def __get__(self): def __get__(self):
@ -290,10 +314,26 @@ cdef class Token:
cdef unicode py_ustr = self._seq.vocab.strings[t.lemma] cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
return py_ustr return py_ustr
property pos_:
    """The string name of this token's `pos` value, looked up in UNIV_TAG_NAMES."""
    def __get__(self):
        # UNIV_TAG_NAMES maps name -> id, so invert it to look up by id.
        # Raises KeyError if `self.pos` is not one of its values.
        names_by_id = dict((tag_id, name) for name, tag_id in UNIV_TAG_NAMES.items())
        return names_by_id[self.pos]
property tag_: property tag_:
def __get__(self): def __get__(self):
return self._seq.tag_names[self.tag] return self._seq._tag_strings[self.tag]
property dep_: property dep_:
def __get__(self): def __get__(self):
return self._seq.dep_names[self.dep] return self._seq._dep_strings[self.dep]
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
    # Return the zero-based position of the n-th set bit of `bits`, counting
    # upward from the least significant bit (n == 1 selects the lowest set
    # bit). Any n < 1 behaves like n == 1.
    #
    # Returns 0 when `bits` has fewer than n set bits; callers cannot tell
    # that apart from "the answer is bit 0" and must check `bits` themselves.
    cdef int i
    cdef uint32_t one = 1
    for i in range(32):
        # Shift an unsigned 1: `1 << 31` on a signed C int overflows.
        if bits & (one << i):
            n -= 1
            if n < 1:
                return i
    return 0

View File

@ -1,3 +1,6 @@
from __future__ import unicode_literals
UNIV_TAG_NAMES = { UNIV_TAG_NAMES = {
"NO_TAG": NO_TAG, "NO_TAG": NO_TAG,
"ADJ": ADJ, "ADJ": ADJ,