Improve attribute handling in doc.merge(). Still unsatisfying

2025-08-02 03:10:22 +03:00 · 2017-03-31 13:59:58 +02:00 · 2017-03-31 13:59:58 +02:00 · 9720103428
commit 9720103428
parent cfff4e0f61
1 changed files with 9 additions and 2 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -668,12 +668,15 @@ cdef class Doc:
            attributes[LEMMA] = self.vocab.strings[lemma]
            attributes[ENT_TYPE] = self.vocab.strings[ent_type]
        elif not args:
-            if "label" in attributes and ENT_TYPE not in attributes:
+            # TODO: This code makes little sense overall. We're still
+            # ignoring most of the attributes?
+            if "label" in attributes and 'ent_type' not in attributes:
                if type(attributes["label"]) == int:
                    attributes[ENT_TYPE] = attributes["label"]
                else:
                    attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]]
-
+            if 'ent_type' in attributes:
+                attributes[ENT_TYPE] = attributes['ent_type']
        elif args:
            raise ValueError(
                "Doc.merge received %d non-keyword arguments. "
@ -693,6 +696,9 @@ cdef class Doc:
        tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
        lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
        ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
+        ent_id = attributes.get('ent_id', span.root.ent_id)
+        if not isinstance(ent_id, int):
+            ent_id = self.vocab.strings[ent_id]

        # Get LexemeC for newly merged token
        new_orth = ''.join([t.text_with_ws for t in span])
@ -713,6 +719,7 @@ cdef class Doc:
        else:
            token.ent_iob = 3
            token.ent_type = self.vocab.strings[ent_type]
+        token.ent_id = ent_id
        # Begin by setting all the head indices to absolute token positions
        # This is easier to work with for now than the offsets
        # Before thinking of something simpler, beware the case where a dependency