Improve attribute handling in doc.merge(). Still unsatisfying

This commit is contained in:
Matthew Honnibal 2017-03-31 13:59:58 +02:00
parent cfff4e0f61
commit 9720103428

View File

@ -668,12 +668,15 @@ cdef class Doc:
attributes[LEMMA] = self.vocab.strings[lemma] attributes[LEMMA] = self.vocab.strings[lemma]
attributes[ENT_TYPE] = self.vocab.strings[ent_type] attributes[ENT_TYPE] = self.vocab.strings[ent_type]
elif not args: elif not args:
if "label" in attributes and ENT_TYPE not in attributes: # TODO: This code makes little sense overall. We're still
# ignoring most of the attributes?
if "label" in attributes and 'ent_type' not in attributes:
if type(attributes["label"]) == int: if type(attributes["label"]) == int:
attributes[ENT_TYPE] = attributes["label"] attributes[ENT_TYPE] = attributes["label"]
else: else:
attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]] attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]]
if 'ent_type' in attributes:
attributes[ENT_TYPE] = attributes['ent_type']
elif args: elif args:
raise ValueError( raise ValueError(
"Doc.merge received %d non-keyword arguments. " "Doc.merge received %d non-keyword arguments. "
@ -693,6 +696,9 @@ cdef class Doc:
tag = self.vocab.strings[attributes.get(TAG, span.root.tag)] tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)] lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)] ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
ent_id = attributes.get('ent_id', span.root.ent_id)
if not isinstance(ent_id, int):
ent_id = self.vocab.strings[ent_id]
# Get LexemeC for newly merged token # Get LexemeC for newly merged token
new_orth = ''.join([t.text_with_ws for t in span]) new_orth = ''.join([t.text_with_ws for t in span])
@ -713,6 +719,7 @@ cdef class Doc:
else: else:
token.ent_iob = 3 token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type] token.ent_type = self.vocab.strings[ent_type]
token.ent_id = ent_id
# Begin by setting all the head indices to absolute token positions # Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets # This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency # Before thinking of something simpler, beware the case where a dependency