Filter subtoken matches in merge_subtokens() (#4539)

The `Matcher` in `merge_subtokens()` returns all possible subsequences
of `subtok` tokens, so for sequences of two or more subtoks the matches
have to be filtered so that the retokenizer only merges the longest,
non-overlapping spans.
adrianeboyd authored on 2019-10-28 15:40:28 +01:00, committed by Ines Montani
parent d5509e0989
commit f2bfaa1b38
2 changed files with 26 additions and 1 deletion
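
Why the filtering matters: with the "+" operator, the Matcher reports every contiguous subsequence of a matching run, so three consecutive subtok tokens produce six overlapping candidate spans, and filter_spans() keeps the longest spans while discarding any that overlap them. Below is a minimal standalone sketch (mine, not part of the commit) of that behavior, building the overlapping spans by hand rather than through the Matcher:

    from spacy.lang.en import English
    from spacy.util import filter_spans

    nlp = English()
    doc = nlp("And a third .")
    # All contiguous subsequences of the three-token run, as the Matcher
    # would report them for three consecutive "subtok" dependencies:
    spans = [doc[0:1], doc[1:2], doc[2:3], doc[0:2], doc[1:3], doc[0:3]]
    print([s.text for s in filter_spans(spans)])  # ['And a third']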

spacy/pipeline/functions.py

@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from ..language import component
 from ..matcher import Matcher
+from ..util import filter_spans
 
 
 @component(
@@ -60,7 +61,7 @@ def merge_subtokens(doc, label="subtok"):
     merger = Matcher(doc.vocab)
     merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
     matches = merger(doc)
-    spans = [doc[start : end + 1] for _, start, end in matches]
+    spans = filter_spans([doc[start : end + 1] for _, start, end in matches])
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
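
A note on end + 1 (my reading, not stated in the commit): a token's subtok dependency attaches it to the head that follows it, so the match covers only the dependent tokens, and the span is extended by one token to pull the head into the merge. A hedged usage sketch, assuming spaCy v3 semantics and a hand-annotated parse:

    from spacy.lang.en import English
    from spacy.pipeline.functions import merge_subtokens

    nlp = English()
    doc = nlp("This is a sentence .")
    # Hand-annotate: mark "a" as a subtok whose head is the following "sentence".
    doc[2].dep_ = "subtok"
    doc[2].head = doc[3]
    doc = merge_subtokens(doc)
    print([t.text for t in doc])  # ['This', 'is', 'a sentence', '.']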

spacy/tests/pipeline/test_functions.py (new file)

@@ -0,0 +1,24 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.pipeline.functions import merge_subtokens
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    # fmt: off
+    text = "This is a sentence. This is another sentence. And a third."
+    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
+    deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
+            "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
+    # fmt: on
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_merge_subtokens(doc):
+    doc = merge_subtokens(doc)
+    # get_doc() doesn't set spaces, so the result is "And a third ."
+    assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."]
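
Note on the fixture (my annotation, not part of the commit): the test helper get_doc() takes heads as offsets relative to each token, so 1 points to the next token and -2 two tokens back. The three trailing subtok tokens in "And a third." each point one token ahead, ending at the final period, which is why the longest filtered match plus the end + 1 extension merges "And a third ." into a single token, exactly as the assert expects.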