Filter subtoken matches in merge_subtokens() (#4539)

The `Matcher` in `merge_subtokens()` returns all possible subsequences
of `subtok` tokens, so for sequences of two or more subtoks the matches
have to be filtered so that the retokenizer only merges the longest,
non-overlapping spans.
adrianeboyd authored on 2019-10-28 15:40:28 +01:00, committed by Ines Montani
parent d5509e0989
commit f2bfaa1b38
2 changed files with 26 additions and 1 deletion
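
Why the filtering matters: with the "+" operator, the Matcher reports every contiguous subsequence of a matching run, so three consecutive subtok tokens produce six overlapping candidate spans, and filter_spans() keeps the longest spans while discarding any that overlap them. Below is a minimal standalone sketch (mine, not part of the commit) of that behavior, building the overlapping spans by hand rather than through the Matcher:

    from spacy.lang.en import English
    from spacy.util import filter_spans

    nlp = English()
    doc = nlp("And a third .")
    # All contiguous subsequences of the three-token run, as the Matcher
    # would report them for three consecutive "subtok" dependencies:
    spans = [doc[0:1], doc[1:2], doc[2:3], doc[0:2], doc[1:3], doc[0:3]]
    print([s.text for s in filter_spans(spans)])  # ['And a third']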

spacy/pipeline/functions.py

@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from ..language import component
 from ..matcher import Matcher
+from ..util import filter_spans
 
 
 @component(
@@ -60,7 +61,7 @@ def merge_subtokens(doc, label="subtok"):
     merger = Matcher(doc.vocab)
     merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
     matches = merger(doc)
-    spans = [doc[start : end + 1] for _, start, end in matches]
+    spans = filter_spans([doc[start : end + 1] for _, start, end in matches])
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
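
A note on end + 1 (my reading, not stated in the commit): a token's subtok dependency attaches it to the head that follows it, so the match covers only the dependent tokens, and the span is extended by one token to pull the head into the merge. A hedged usage sketch, assuming spaCy v3 semantics and a hand-annotated parse:

    from spacy.lang.en import English
    from spacy.pipeline.functions import merge_subtokens

    nlp = English()
    doc = nlp("This is a sentence .")
    # Hand-annotate: mark "a" as a subtok whose head is the following "sentence".
    doc[2].dep_ = "subtok"
    doc[2].head = doc[3]
    doc = merge_subtokens(doc)
    print([t.text for t in doc])  # ['This', 'is', 'a sentence', '.']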

spacy/tests/pipeline/test_functions.py (new file)

@@ -0,0 +1,24 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.pipeline.functions import merge_subtokens
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    # fmt: off
+    text = "This is a sentence. This is another sentence. And a third."
+    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
+    deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
+            "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
+    # fmt: on
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_merge_subtokens(doc):
+    doc = merge_subtokens(doc)
+    # get_doc() doesn't set spaces, so the result is "And a third ."
+    assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."]
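
Note on the fixture (my annotation, not part of the commit): the test helper get_doc() takes heads as offsets relative to each token, so 1 points to the next token and -2 two tokens back. The three trailing subtok tokens in "And a third." each point one token ahead, ending at the final period, which is why the longest filtered match plus the end + 1 extension merges "And a third ." into a single token, exactly as the assert expects.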