spaCy/spacy/tests/pipeline/test_functions.py

# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.pipeline.functions import merge_subtokens

from ..util import get_doc


@pytest.fixture
def doc(en_tokenizer):
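    # Fixture: three sentences in which adjacent tokens joined by "subtok"
    # arcs ("a sentence", "another sentence", "And a third .") are the spans
    # that merge_subtokens() is expected to merge.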
    # fmt: off
    text = "This is a sentence. This is another sentence. And a third."
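    # Heads are given as offsets relative to each token's own index, as
    # expected by the get_doc() test helper (0 = the token heads itself).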
    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
    deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
            "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
    # fmt: on
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


def test_merge_subtokens(doc):
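    # merge_subtokens() retokenizes the doc, merging runs of tokens linked by
    # "subtok" dependency arcs into single tokens.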
    doc = merge_subtokens(doc)
    # get_doc() doesn't set spaces, so the result is "And a third ."
    assert [t.text for t in doc] == [
        "This",
        "is",
        "a sentence",
        ".",
        "This",
        "is",
        "another sentence",
        ".",
        "And a third .",
    ]