Improve API for doc.merge() and span.merge(), to use keyword arguments.

2025-08-01 19:00:20 +03:00 · 2016-10-17 14:02:13 +02:00 · 2016-10-17 14:02:13 +02:00 · b67697a97b
commit b67697a97b
parent fbb7f3f15c
2 changed files with 21 additions and 5 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -593,9 +593,22 @@ cdef class Doc:
                keep_reading = False
            yield n_bytes_str + data

-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
+    def merge(self, int start_idx, int end_idx, *args, **attributes):
        """Merge a multi-word expression into a single token."""
+        cdef unicode tag, lemma, ent_type
+        if len(args) == 3:
+            # TODO: Warn deprecation
+            tag, lemma, ent_type = args
+            attributes[TAG] = self.strings[tag]
+            attributes[LEMMA] = self.strings[lemma]
+            attributes[ENT_TYPE] = self.strings[ent_type]
+        elif args:
+            raise ValueError(
+                "Doc.merge received %d non-keyword arguments. "
+                "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
+                "Arguments supplied:\n%s\n"
+                "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
+ 
        cdef int start = token_by_start(self.c, self.length, start_idx)
        if start == -1:
            return None
@ -604,8 +617,11 @@ cdef class Doc:
            return None
        # Currently we have the token index, we want the range-end index
        end += 1
-        
        cdef Span span = self[start:end]
+        tag = self.strings[attributes.get(TAG, span.root.tag)]
+        lemma = self.strings[attributes.get(LEMMA, span.root.lemma)]
+        ent_type = self.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
+
        # Get LexemeC for newly merged token
        new_orth = ''.join([t.text_with_ws for t in span])
        if span[-1].whitespace_:
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -77,8 +77,8 @@ cdef class Span:
        for i in range(self.start, self.end):
            yield self.doc[i]

-    def merge(self, unicode tag, unicode lemma, unicode ent_type):
-        self.doc.merge(self.start_char, self.end_char, tag, lemma, ent_type)
+    def merge(self, *args, **attributes):
+        self.doc.merge(self.start_char, self.end_char, *args, **attributes)

    def similarity(self, other):
        if 'similarity' in self.doc.getters_for_spans: