diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 9562dcbdb..69e638da2 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from ..language import component
 from ..matcher import Matcher
+from ..util import filter_spans
 
 
 @component(
@@ -60,7 +61,7 @@ def merge_subtokens(doc, label="subtok"):
     merger = Matcher(doc.vocab)
     merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
     matches = merger(doc)
-    spans = [doc[start : end + 1] for _, start, end in matches]
+    spans = filter_spans([doc[start : end + 1] for _, start, end in matches])
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py
new file mode 100644
index 000000000..fbb88ade2
--- /dev/null
+++ b/spacy/tests/pipeline/test_functions.py
@@ -0,0 +1,24 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.pipeline.functions import merge_subtokens
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    # fmt: off
+    text = "This is a sentence. This is another sentence. And a third."
+    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
+    deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
+            "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
+    # fmt: on
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_merge_subtokens(doc):
+    doc = merge_subtokens(doc)
+    # get_doc() doesn't set spaces, so the result is "And a third ."
+    assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."]
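
Context for the fix: with the `"op": "+"` operator, the Matcher can return several overlapping matches over one run of `subtok` tokens, and `Retokenizer.merge()` rejects non-disjoint spans. Wrapping the list in `filter_spans` keeps only the longest non-overlapping spans before merging. A minimal sketch of that behaviour (not part of the diff; assumes a spaCy v2.1+ install where `spacy.util.filter_spans` is available, and uses a hand-picked set of overlapping spans rather than actual Matcher output):

```python
import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("And a third .")

# Overlapping candidate spans, similar to what the "+" operator can produce
# for a chain of subtokens.
spans = [doc[0:3], doc[0:2], doc[1:3]]

# filter_spans keeps the longest non-overlapping spans, so these can be
# passed to Retokenizer.merge() without raising on overlaps.
print([s.text for s in filter_spans(spans)])  # ['And a third']
```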