💫 Support lexical attributes in retokenizer attrs (closes #2390) (#3325)

* Fix formatting and whitespace * Add support for lexical attributes (closes #2390) * Document lexical attribute setting during retokenization * Assign variable outside of nested loop
2025-12-08 02:34:17 +03:00 · 2019-02-24 21:13:51 +01:00 · 2019-02-24 21:13:51 +01:00 · 62b558ab72
commit 62b558ab72
parent a48deb4081
6 changed files with 98 additions and 18 deletions
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -89,7 +89,7 @@ cdef class Lexeme:
            return lex.lang
        else:
            return 0
-    
+
    @staticmethod
    cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
        cdef flags_t one = 1
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -289,3 +289,29 @@ def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
+
+
+def test_doc_retokenizer_merge_lex_attrs(en_vocab):
+    """Test that retokenization also sets attributes on the lexeme if they're
+    lexical attributes. For example, if a user sets IS_STOP, it should mean that
+    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
+    here is acceptable. Also see #2390.
+    """
+    # Test regular merging
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    assert not any(t.is_stop for t in doc)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True})
+    assert doc[0].lemma_ == "hello world"
+    assert doc[0].is_stop
+    # Test bulk merging
+    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
+    assert not any(t.like_num for t in doc)
+    assert not any(t.is_stop for t in doc)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2], attrs={"like_num": True})
+        retokenizer.merge(doc[2:4], attrs={"is_stop": True})
+    assert doc[0].like_num
+    assert doc[1].is_stop
+    assert not doc[0].is_stop
+    assert not doc[1].like_num
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@ -165,3 +165,21 @@ def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
+
+
+def test_doc_retokenizer_split_lex_attrs(en_vocab):
+    """Test that retokenization also sets attributes on the lexeme if they're
+    lexical attributes. For example, if a user sets IS_STOP, it should mean that
+    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
+    here is acceptable. Also see #2390.
+    """
+    assert not Doc(en_vocab, words=["Los"])[0].is_stop
+    assert not Doc(en_vocab, words=["Angeles"])[0].is_stop
+    doc = Doc(en_vocab, words=["LosAngeles", "start"])
+    assert not doc[0].is_stop
+    with doc.retokenize() as retokenizer:
+        attrs = {"is_stop": [True, False]}
+        heads = [(doc[0], 1), doc[1]]
+        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
+    assert doc[0].is_stop
+    assert not doc[1].is_stop
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -66,9 +66,13 @@ cdef class Retokenizer:
            for extension in extensions:
                _validate_extensions(extension)
            attrs = {key: value for key, value in attrs.items() if key != "_"}
+            # NB: Since we support {"KEY": [value, value]} syntax here, this
+            # will only "intify" the keys, not the values
            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
            attrs["_"] = extensions
        else:
+            # NB: Since we support {"KEY": [value, value]} syntax here, this
+            # will only "intify" the keys, not the values
            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
        head_offsets = []
        for head in heads:
@ -153,7 +157,11 @@ def _merge(Doc doc, int start, int end, attributes):
        elif attr_name == TAG:
            doc.vocab.morphology.assign_tag(token, attr_value)
        else:
+            # Set attributes on both token and lexeme to take care of token
+            # attribute vs. lexical attribute without having to enumerate them.
+            # If an attribute name is not valid, set_struct_attr will ignore it.
            Token.set_struct_attr(token, attr_name, attr_value)
+            Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
    # Make sure ent_iob remains consistent
    if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
        if token.ent_type == doc.c[end].ent_type:
@ -216,6 +224,7 @@ def _bulk_merge(Doc doc, merges):
    """
    cdef Span span
    cdef const LexemeC* lex
+    cdef TokenC* token
    cdef Pool mem = Pool()
    tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
    spans = []
@ -231,15 +240,6 @@ def _bulk_merge(Doc doc, merges):
        # House the new merged token where it starts
        token = &doc.c[start]
        tokens[merge_index] = token
-        # Assign attributes
-        for attr_name, attr_value in attributes.items():
-            if attr_name == "_":  # Set extension attributes
-                for ext_attr_key, ext_attr_value in attr_value.items():
-                    doc[start]._.set(ext_attr_key, ext_attr_value)
-            elif attr_name == TAG:
-                doc.vocab.morphology.assign_tag(token, attr_value)
-            else:
-                Token.set_struct_attr(token, attr_name, attr_value)
    # Resize the doc.tensor, if it's set. Let the last row for each token stand
    # for the merged region. To do this, we create a boolean array indicating
    # whether the row is to be deleted, then use numpy.delete
@ -255,14 +255,30 @@ def _bulk_merge(Doc doc, merges):
    # We update token.lex after keeping span root and dep, since
    # setting token.lex will change span.start and span.end properties
    # as it modifies the character offsets in the doc
-    for token_index in range(len(merges)):
+    for token_index, (span, attributes) in enumerate(merges):
        new_orth = ''.join([t.text_with_ws for t in spans[token_index]])
        if spans[token_index][-1].whitespace_:
            new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)]
+        token = tokens[token_index]
        lex = doc.vocab.get(doc.mem, new_orth)
-        tokens[token_index].lex = lex
+        token.lex = lex
        # We set trailing space here too
-        tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy
+        token.spacy = doc.c[spans[token_index].end-1].spacy
+        py_token = span[0]
+        # Assign attributes
+        for attr_name, attr_value in attributes.items():
+            if attr_name == "_":  # Set extension attributes
+                for ext_attr_key, ext_attr_value in attr_value.items():
+                    py_token._.set(ext_attr_key, ext_attr_value)
+            elif attr_name == TAG:
+                doc.vocab.morphology.assign_tag(token, attr_value)
+            else:
+                # Set attributes on both token and lexeme to take care of token
+                # attribute vs. lexical attribute without having to enumerate
+                # them. If an attribute name is not valid, set_struct_attr will
+                # ignore it.
+                Token.set_struct_attr(token, attr_name, attr_value)
+                Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
    # Begin by setting all the head indices to absolute token positions
    # This is easier to work with for now than the offsets
    # Before thinking of something simpler, beware the case where a
@ -281,7 +297,7 @@ def _bulk_merge(Doc doc, merges):
    current_offset = 0
    for i in range(doc.length):
        if current_span_index < len(spans) and i == spans[current_span_index].end:
-            #last token was the last of the span
+            # Last token was the last of the span
            current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1
            current_span_index += 1
        if current_span_index < len(spans) and \
@ -405,10 +421,17 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
            if attr_name == "_":
                for ext_attr_key, ext_attr_value in attr_value.items():
                    doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
+            # NB: We need to call get_string_id here because only the keys are
+            # "intified" (since we support "KEY": [value, value] syntax here).
            elif attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
            else:
+                # Set attributes on both token and lexeme to take care of token
+                # attribute vs. lexical attribute without having to enumerate
+                # them. If an attribute name is not valid, set_struct_attr will
+                # ignore it.
                Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
+                Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
    # Assign correct dependencies to the inner token
    for i, head in enumerate(heads):
        doc.c[token_index + i].head = head
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -402,9 +402,11 @@ invalidated, although they may accidentally continue to work.

 ### Retokenizer.merge {#retokenizer.merge tag="method"}

-Mark a span for merging. The `attrs` will be applied to the resulting token.
-Writable custom extension attributes can be provided as a dictionary mapping
-attribute names to values as the `"_"` key.
+Mark a span for merging. The `attrs` will be applied to the resulting token (if
+they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
+underlying lexeme (if they're context-independent lexical attributes like
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
+dictionary mapping attribute names to values as the `"_"` key.

 > #### Example
 >
@ -431,7 +433,10 @@ second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other
 newly created subtokens, without having to keep track of the changing token
 indices. If the specified head token will be split within the retokenizer block
 and no subtoken index is specified, it will default to `0`. Attributes to set on
-subtokens can be provided as a list of values.
+subtokens can be provided as a list of values. They'll be applied to the
+resulting token (if they're context-dependent token attributes like `LEMMA` or
+`DEP`) or to the underlying lexeme (if they're context-independent lexical
+attributes like `LOWER` or `IS_STOP`).

 > #### Example
 >
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@ -995,6 +995,14 @@ with doc.retokenize() as retokenizer:
 print("After:", [token.text for token in doc])
 ```

+If an attribute in the `attrs` is a context-dependent token attribute, it will
+be applied to the underlying [`Token`](/api/token). For example `LEMMA`, `POS`
+or `DEP` only apply to a word in context, so they're token attributes. If an
+attribute is a context-independent lexical attribute, it will be applied to the
+underlying [`Lexeme`](/api/lexeme), the entry in the vocabulary. For example,
+`LOWER` or `IS_STOP` apply to all words of the same spelling, regardless of the
+context.
+
 <Infobox title="Tip: merging entities and noun phrases">

 If you need to merge named entities or noun chunks, check out the built-in