spaCy/spacy/tests/lang/en/test_parser.py

from spacy.tokens import Doc


def test_en_parser_noun_chunks_standard(en_vocab):
    """Check that a passive sentence yields a single noun chunk.

    The Doc is constructed directly from hand-written words, POS tags,
    dependency labels and head indices, and ``doc.noun_chunks`` is expected
    to produce exactly one chunk, "A base phrase ".
    """
    tokens = ["A", "base", "phrase", "should", "be", "recognized", "."]
    pos_tags = ["DET", "ADJ", "NOUN", "AUX", "VERB", "VERB", "PUNCT"]
    dep_labels = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"]
    # One head index per token; the ROOT token ("recognized") points at itself.
    head_indices = [2, 2, 5, 5, 5, 5, 5]
    parsed = Doc(
        en_vocab, words=tokens, heads=head_indices, pos=pos_tags, deps=dep_labels
    )
    # Comparing the full list checks both the chunk count and each chunk text.
    chunk_texts = [chunk.text_with_ws for chunk in parsed.noun_chunks]
    assert chunk_texts == ["A base phrase "]


def test_en_parser_noun_chunks_coordinated(en_vocab):
    """Check that two coordinated subject phrases yield two noun chunks.

    The Doc is constructed directly from hand-written annotations. Exactly
    two chunks are expected, in order: "A base phrase " and "a good phrase ".
    """
    # fmt: off
    tokens = ["A", "base", "phrase", "and", "a", "good", "phrase", "are", "often", "the", "same", "."]
    pos_tags = ["DET", "NOUN", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "VERB", "ADV", "DET", "ADJ", "PUNCT"]
    dep_labels = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]
    # One head index per token; the ROOT token ("are") points at itself.
    head_indices = [2, 2, 7, 2, 6, 6, 2, 7, 7, 10, 7, 7]
    # fmt: on
    parsed = Doc(
        en_vocab, words=tokens, heads=head_indices, pos=pos_tags, deps=dep_labels
    )
    # Comparing the full list checks both the chunk count and each chunk text.
    chunk_texts = [chunk.text_with_ws for chunk in parsed.noun_chunks]
    assert chunk_texts == ["A base phrase ", "a good phrase "]


def test_en_parser_noun_chunks_pp_chunks(en_vocab):
    """Check that a subject and a prepositional object each form a chunk.

    The Doc is constructed directly from hand-written annotations. Exactly
    two chunks are expected, in order: "A phrase " and "another phrase ".
    """
    tokens = ["A", "phrase", "with", "another", "phrase", "occurs", "."]
    pos_tags = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB", "PUNCT"]
    dep_labels = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"]
    # One head index per token; the ROOT token ("occurs") points at itself.
    head_indices = [1, 5, 1, 4, 2, 5, 5]
    parsed = Doc(
        en_vocab, words=tokens, heads=head_indices, pos=pos_tags, deps=dep_labels
    )
    # Comparing the full list checks both the chunk count and each chunk text.
    chunk_texts = [chunk.text_with_ws for chunk in parsed.noun_chunks]
    assert chunk_texts == ["A phrase ", "another phrase "]


def test_en_parser_noun_chunks_appositional_modifiers(en_vocab):
    """Check noun chunks for a sentence containing an appositional modifier.

    The Doc is constructed directly from hand-written annotations. Exactly
    three chunks are expected, in order: "Sam ", "my brother " and
    "the house ".
    """
    # fmt: off
    tokens = ["Sam", ",", "my", "brother", ",", "arrived", "to", "the", "house", "."]
    pos_tags = ["PROPN", "PUNCT", "DET", "NOUN", "PUNCT", "VERB", "ADP", "DET", "NOUN", "PUNCT"]
    dep_labels = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"]
    # One head index per token; the ROOT token ("arrived") points at itself.
    head_indices = [5, 0, 3, 0, 0, 5, 5, 8, 6, 5]
    # fmt: on
    parsed = Doc(
        en_vocab, words=tokens, heads=head_indices, pos=pos_tags, deps=dep_labels
    )
    # Comparing the full list checks both the chunk count and each chunk text.
    chunk_texts = [chunk.text_with_ws for chunk in parsed.noun_chunks]
    assert chunk_texts == ["Sam ", "my brother ", "the house "]


def test_en_parser_noun_chunks_dative(en_vocab):
    """Check noun chunks for a sentence with a dative and a direct object.

    The Doc is constructed directly from hand-written annotations. Exactly
    three chunks are expected, in order: "She ", "Bob " and "a raise ".
    """
    tokens = ["She", "gave", "Bob", "a", "raise", "."]
    pos_tags = ["PRON", "VERB", "PROPN", "DET", "NOUN", "PUNCT"]
    dep_labels = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]
    # One head index per token; the ROOT token ("gave") points at itself.
    head_indices = [1, 1, 1, 4, 1, 1]
    parsed = Doc(
        en_vocab, words=tokens, heads=head_indices, pos=pos_tags, deps=dep_labels
    )
    # Comparing the full list checks both the chunk count and each chunk text.
    chunk_texts = [chunk.text_with_ws for chunk in parsed.noun_chunks]
    assert chunk_texts == ["She ", "Bob ", "a raise "]
Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`from spacy.tokens import Doc`
Modernise noun chunks tests and don't depend on models 2017-01-13 04:01:00 +03:00

Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`def test_en_parser_noun_chunks_standard(en_vocab):`
			`words = ["A", "base", "phrase", "should", "be", "recognized", "."]`
			`heads = [2, 2, 5, 5, 5, 5, 5]`
Remove corpus-specific tag maps Remove corpus-specific tag maps from the language data for languages without custom tokenizers. For languages with custom word segmenters that also provide tags (Japanese and Korean), the tag maps for the custom tokenizers are kept as the default. The default tag maps for languages without custom tokenizers are now the default tag map from `lang/tag_map/py`, UPOS -> UPOS. 2020-07-15 15:13:58 +03:00			`pos = ["DET", "ADJ", "NOUN", "AUX", "VERB", "VERB", "PUNCT"]`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"]`
Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)`
Modernise noun chunks tests and don't depend on models 2017-01-13 04:01:00 +03:00			`chunks = list(doc.noun_chunks)`
			`assert len(chunks) == 1`
			`assert chunks[0].text_with_ws == "A base phrase "`


Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`def test_en_parser_noun_chunks_coordinated(en_vocab):`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`# fmt: off`
Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`words = ["A", "base", "phrase", "and", "a", "good", "phrase", "are", "often", "the", "same", "."]`
			`heads = [2, 2, 7, 2, 6, 6, 2, 7, 7, 10, 7, 7]`
Remove corpus-specific tag maps Remove corpus-specific tag maps from the language data for languages without custom tokenizers. For languages with custom word segmenters that also provide tags (Japanese and Korean), the tag maps for the custom tokenizers are kept as the default. The default tag maps for languages without custom tokenizers are now the default tag map from `lang/tag_map/py`, UPOS -> UPOS. 2020-07-15 15:13:58 +03:00			`pos = ["DET", "NOUN", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "VERB", "ADV", "DET", "ADJ", "PUNCT"]`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]`
			`# fmt: on`
Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)`
Modernise noun chunks tests and don't depend on models 2017-01-13 04:01:00 +03:00			`chunks = list(doc.noun_chunks)`
			`assert len(chunks) == 2`
			`assert chunks[0].text_with_ws == "A base phrase "`
			`assert chunks[1].text_with_ws == "a good phrase "`


Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`def test_en_parser_noun_chunks_pp_chunks(en_vocab):`
			`words = ["A", "phrase", "with", "another", "phrase", "occurs", "."]`
			`heads = [1, 5, 1, 4, 2, 5, 5]`
Remove corpus-specific tag maps Remove corpus-specific tag maps from the language data for languages without custom tokenizers. For languages with custom word segmenters that also provide tags (Japanese and Korean), the tag maps for the custom tokenizers are kept as the default. The default tag maps for languages without custom tokenizers are now the default tag map from `lang/tag_map/py`, UPOS -> UPOS. 2020-07-15 15:13:58 +03:00			`pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB", "PUNCT"]`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"]`
Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)`
Modernise noun chunks tests and don't depend on models 2017-01-13 04:01:00 +03:00			`chunks = list(doc.noun_chunks)`
			`assert len(chunks) == 2`
			`assert chunks[0].text_with_ws == "A phrase "`
			`assert chunks[1].text_with_ws == "another phrase "`
Port over changes from #1287 2017-10-14 14:16:21 +03:00

Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`def test_en_parser_noun_chunks_appositional_modifiers(en_vocab):`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`# fmt: off`
Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`words = ["Sam", ",", "my", "brother", ",", "arrived", "to", "the", "house", "."]`
			`heads = [5, 0, 3, 0, 0, 5, 5, 8, 6, 5]`
Remove corpus-specific tag maps Remove corpus-specific tag maps from the language data for languages without custom tokenizers. For languages with custom word segmenters that also provide tags (Japanese and Korean), the tag maps for the custom tokenizers are kept as the default. The default tag maps for languages without custom tokenizers are now the default tag map from `lang/tag_map/py`, UPOS -> UPOS. 2020-07-15 15:13:58 +03:00			`pos = ["PROPN", "PUNCT", "DET", "NOUN", "PUNCT", "VERB", "ADP", "DET", "NOUN", "PUNCT"]`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"]`
			`# fmt: on`
Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)`
Port over changes from #1287 2017-10-14 14:16:21 +03:00			`chunks = list(doc.noun_chunks)`
			`assert len(chunks) == 3`
			`assert chunks[0].text_with_ws == "Sam "`
			`assert chunks[1].text_with_ws == "my brother "`
			`assert chunks[2].text_with_ws == "the house "`


Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`def test_en_parser_noun_chunks_dative(en_vocab):`
			`words = ["She", "gave", "Bob", "a", "raise", "."]`
			`heads = [1, 1, 1, 4, 1, 1]`
Remove corpus-specific tag maps Remove corpus-specific tag maps from the language data for languages without custom tokenizers. For languages with custom word segmenters that also provide tags (Japanese and Korean), the tag maps for the custom tokenizers are kept as the default. The default tag maps for languages without custom tokenizers are now the default tag map from `lang/tag_map/py`, UPOS -> UPOS. 2020-07-15 15:13:58 +03:00			`pos = ["PRON", "VERB", "PROPN", "DET", "NOUN", "PUNCT"]`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]`
Tidy up tests and docs 2020-09-21 21:43:54 +03:00			`doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)`
Port over changes from #1287 2017-10-14 14:16:21 +03:00			`chunks = list(doc.noun_chunks)`
			`assert len(chunks) == 3`
			`assert chunks[0].text_with_ws == "She "`
			`assert chunks[1].text_with_ws == "Bob "`
			`assert chunks[2].text_with_ws == "a raise "`