* Add merge() method to Tokens, with fairly brittle/hacky implementation, but quite easy to test. Passing minimal tests. Still need to fix left/right deps in C data

This commit is contained in:
Matthew Honnibal 2015-03-30 01:37:41 +02:00
parent 557856e84c
commit e70b87efeb

View File

@ -1,8 +1,10 @@
# cython: embedsignature=True # cython: embedsignature=True
from libc.string cimport memset
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter from preshed.counter cimport PreshCounter
from .strings cimport slice_unicode
from .vocab cimport EMPTY_LEXEME from .vocab cimport EMPTY_LEXEME
from .typedefs cimport attr_id_t, attr_t from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA from .typedefs cimport LEMMA
@ -11,6 +13,7 @@ from .typedefs cimport POS, LEMMA
from .parts_of_speech import UNIV_POS_NAMES from .parts_of_speech import UNIV_POS_NAMES
from .lexeme cimport check_flag from .lexeme cimport check_flag
from .spans import Span from .spans import Span
from .structs cimport UniStr
from unidecode import unidecode from unidecode import unidecode
@ -253,6 +256,88 @@ cdef class Tokens:
for i in range(self.length): for i in range(self.length):
self.data[i] = parsed[i] self.data[i] = parsed[i]
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
          unicode ent_type):
    """Merge the tokens spanning [start_idx, end_idx) into one token, in place.

    The span is located by matching ``start_idx`` against a token's ``idx``
    and ``end_idx`` against a token's ``idx + lex.length``. The merged token
    is stored at the position of the first token in the span, the remaining
    tokens are shifted down, and ``self.length`` shrinks accordingly.

    Args:
        start_idx: Offset at which the first token to merge begins.
        end_idx: Offset at which the last token to merge ends.
        tag: Tag string for the merged token (interned via vocab.strings).
        lemma: Lemma string for the merged token (interned via
            vocab.strings).
        ent_type: Entity type string; the literal 'O' clears the entity type.

    Returns:
        The merged token (``self[start]``), or None when the offsets do not
        line up with token boundaries.

    Note:
        Left/right dependency data is not yet updated (see TODO below).
    """
    cdef int i
    cdef int start = -1
    cdef int end = -1
    for i in range(self.length):
        if self.data[i].idx == start_idx:
            start = i
        if (self.data[i].idx + self.data[i].lex.length) == end_idx:
            if start == -1:
                # A token ends at end_idx but none begins at start_idx.
                # Without this guard we would index self.data[-1] below.
                return None
            end = i + 1
            break
    else:
        # No token ends at end_idx: the span is not on token boundaries.
        return None
    # Get LexemeC for newly merged token
    cdef UniStr new_orth_c
    slice_unicode(&new_orth_c, self._string, start_idx, end_idx)
    cdef const LexemeC* lex = self.vocab.get(self.mem, &new_orth_c)
    # House the new merged token where it starts
    cdef TokenC* token = &self.data[start]
    # Update fields
    token.lex = lex
    # What to do about morphology??
    # TODO: token.morph = ???
    token.tag = self.vocab.strings[tag]
    token.lemma = self.vocab.strings[lemma]
    if ent_type == 'O':
        # NOTE(review): 2 is presumably the "outside" IOB code -- confirm.
        token.ent_iob = 2
        token.ent_type = 0
    else:
        # NOTE(review): 3 is presumably the "begin" IOB code -- confirm.
        token.ent_iob = 3
        token.ent_type = self.vocab.strings[ent_type]
    # Fix dependencies
    # Begin by setting all the head indices to absolute token positions
    # This is easier to work with for now than the offsets
    for i in range(self.length):
        self.data[i].head += i
    # Find the head of the merged token, and its dep relation
    outer_heads = {}
    for i in range(start, end):
        head_idx = self.data[i].head
        if head_idx == i or head_idx < start or head_idx >= end:
            # Don't consider "heads" which are actually dominated by a word
            # in the region we're merging
            gp = head_idx
            while self.data[gp].head != gp:
                if start <= gp < end:
                    break
                gp = self.data[gp].head
            else:
                # Reached a root without passing back through the span.
                # If we have multiple words attaching to the same head,
                # but with different dep labels, we're preferring the last
                # occurring dep label. Shrug. What else could we do, I guess?
                outer_heads[head_idx] = self.data[i].dep
    # Of the candidate (head, dep) pairs, keep the one with the largest
    # head index (tuples compare on the head index first).
    token.head, token.dep = max(outer_heads.items())
    # Adjust deps before shrinking tokens
    # Tokens which point into the merged token should now point to it
    # Subtract the offset from all tokens which point to >= end
    offset = (end - start) - 1
    for i in range(self.length):
        head_idx = self.data[i].head
        if start <= head_idx < end:
            self.data[i].head = start
        elif head_idx >= end:
            self.data[i].head -= offset
    # TODO: Fix left and right deps
    # Now compress the token array
    for i in range(end, self.length):
        self.data[i - offset] = self.data[i]
    # Blank out the now-unused slots at the tail of the array
    for i in range(self.length - offset, self.length):
        memset(&self.data[i], 0, sizeof(TokenC))
        self.data[i].lex = &EMPTY_LEXEME
    self.length -= offset
    for i in range(self.length):
        # ...And, set heads back to a relative position
        self.data[i].head -= i
    # Clear cached Python objects
    self._py_tokens = [None] * self.length
    # Return the merged Python object
    return self[start]
cdef class Token: cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created """An individual token --- i.e. a word, a punctuation symbol, etc. Created