spaCy/spacy/tests/lang/fr/test_noun_chunks.py

import pytest

from spacy.tokens import Doc


# fmt: off
@pytest.mark.parametrize(
    "words,heads,deps,pos,chunk_offsets",
    [
        # Each case lists, in order: the tokens, the index of every token's
        # head, the dependency labels, the coarse POS tags, and the expected
        # (start, end) token offsets of the noun chunks.
        #
        # DET + NOUN
        # "un nom" -> [un nom]
        (
            ["un", "nom"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # Elided DET + NOUN beginning with a vowel sound
        # "l'heure" -> [l'heure]
        (
            ["l'", "heure"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # DET + plural NOUN
        # "les romans" -> [les romans]
        (
            ["les", "romans"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # DET + ADJ + NOUN (adjective precedes the noun)
        # "Les vieux Londres" -> [Les vieux Londres]
        (
            ["Les", "vieux", "Londres"],
            [2, 2, 2],
            ["det", "amod", "ROOT"],
            ["DET", "ADJ", "NOUN"],
            [(0, 3)],
        ),
        # DET + NOUN + ADJ (adjective follows the noun)
        # "le nom propre" ("a proper noun") -> [le nom propre]
        (
            ["le", "nom", "propre"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # DET + NOUN + ADJ, plural
        # "Les chiens bruns" -> [Les chiens bruns]
        (
            ["Les", "chiens", "bruns"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # Two adjectives, one on each side of the noun
        # "un nouveau film intéressant" -> [un nouveau film intéressant]
        (
            ["un", "nouveau", "film", "intéressant"],
            [2, 2, 2, 2],
            ["det", "amod", "ROOT", "amod"],
            ["DET", "ADJ", "NOUN", "ADJ"],
            [(0, 4)],
        ),
        # Two coordinated adjectives, both after the noun
        # "une personne intelligente et drôle"
        #     -> [une personne intelligente et drôle]
        (
            ["une", "personne", "intelligente", "et", "drôle"],
            [1, 1, 1, 4, 2],
            ["det", "ROOT", "amod", "cc", "conj"],
            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
            [(0, 5)],
        ),
        # Relative pronoun as subject of a relative clause
        # "un bus qui va au ville" -> [un bus], [qui], [ville]
        (
            ["un", "bus", "qui", "va", "au", "ville"],
            [1, 1, 3, 1, 5, 3],
            ["det", "ROOT", "nsubj", "acl:relcl", "case", "obl:arg"],
            ["DET", "NOUN", "PRON", "VERB", "ADP", "NOUN"],
            [(0, 2), (2, 3), (5, 6)],
        ),
        # Relative subclause introduced by "que"
        # "Voilà la maison que nous voulons acheter"
        # ("That's the house that we want to buy.") -> [la maison], [nous]
        (
            ["Voilà", "la", "maison", "que", "nous", "voulons", "acheter"],
            [0, 2, 0, 5, 5, 2, 5],
            ["ROOT", "det", "obj", "mark", "nsubj", "acl:relcl", "xcomp"],
            ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"],
            [(1, 3), (4, 5)],
        ),
        # Person name plus regnal number joined by flat:name
        # "Louis XIV" -> [Louis XIV]
        (
            ["Louis", "XIV"],
            [0, 0],
            ["ROOT", "flat:name"],
            ["PROPN", "PROPN"],
            [(0, 2)],
        ),
        # Organization name joined by flat:name
        # "Nations Unies" -> [Nations Unies]
        (
            ["Nations", "Unies"],
            [0, 0],
            ["ROOT", "flat:name"],
            ["PROPN", "PROPN"],
            [(0, 2)],
        ),
        # Person name built from two flat:name dependents
        # "Louise de Bratagne" -> [Louise de Bratagne]
        (
            ["Louise", "de", "Bratagne"],
            [0, 0, 0],
            ["ROOT", "flat:name", "flat:name"],
            ["PROPN", "PROPN", "PROPN"],
            [(0, 3)],
        ),
        # Person name built from two flat:name dependents
        # "Louis François Joseph" -> [Louis François Joseph]
        (
            ["Louis", "François", "Joseph"],
            [0, 0, 0],
            ["ROOT", "flat:name", "flat:name"],
            ["PROPN", "PROPN", "PROPN"],
            [(0, 3)],
        ),
        # DET + NOUN + adjective that is itself modified by an adverb
        # "quelques agriculteurs très riches"
        #     -> [quelques agriculteurs très riches]
        (
            ["quelques", "agriculteurs", "très", "riches"],
            [1, 1, 3, 1],
            ["det", "ROOT", "advmod", "amod"],
            ["DET", "NOUN", "ADV", "ADJ"],
            [(0, 4)],
        ),
        # Two coordinated noun phrases
        # "Il a un chien et un chat" -> [Il], [un chien], [un chat]
        (
            ["Il", "a", "un", "chien", "et", "un", "chat"],
            [1, 1, 3, 1, 6, 6, 3],
            ["nsubj", "ROOT", "det", "obj", "cc", "det", "conj"],
            ["PRON", "VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"],
            [(0, 1), (2, 4), (5, 7)],
        ),
        # Two adjacent noun phrases linked by appos
        # "l'écrivain brésilien Aníbal Machado"
        #     -> [l'écrivain brésilien], [Aníbal Machado]
        (
            ["l'", "écrivain", "brésilien", "Aníbal", "Machado"],
            [1, 1, 1, 1, 3],
            ["det", "ROOT", "amod", "appos", "flat:name"],
            ["DET", "NOUN", "ADJ", "PROPN", "PROPN"],
            [(0, 3), (3, 5)],
        ),
        # nmod relation between two noun phrases
        # "la destruction de la ville" -> [la destruction], [la ville]
        (
            ["la", "destruction", "de", "la", "ville"],
            [1, 1, 4, 4, 1],
            ["det", "ROOT", "case", "det", "nmod"],
            ["DET", "NOUN", "ADP", "DET", "NOUN"],
            [(0, 2), (3, 5)],
        ),
        # nmod relation between two noun phrases, elided preposition
        # "Archiduchesse d’Autriche" -> [Archiduchesse], [Autriche]
        (
            ["Archiduchesse", "d’", "Autriche"],
            [0, 2, 0],
            ["ROOT", "case", "nmod"],
            ["NOUN", "ADP", "PROPN"],
            [(0, 1), (2, 3)],
        ),
        # Several noun phrases attached to the same head through nmod
        # "la première usine de drogue du gouvernement"
        #     -> [la première usine], [drogue], [gouvernement]
        (
            ["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
            [2, 2, 2, 4, 2, 6, 2],
            ["det", "amod", "ROOT", "case", "nmod", "case", "nmod"],
            ["DET", "ADJ", "NOUN", "ADP", "NOUN", "ADP", "NOUN"],
            [(0, 3), (4, 5), (6, 7)],
        ),
        # Chain of nmod-linked noun phrases
        # "Traduction du raport de Susana" -> [Traduction], [raport], [Susana]
        (
            ["Traduction", "du", "raport", "de", "Susana"],
            [0, 2, 0, 4, 2],
            ["ROOT", "case", "nmod", "case", "nmod"],
            ["NOUN", "ADP", "NOUN", "ADP", "PROPN"],
            [(0, 1), (2, 3), (4, 5)],
        ),
        # nmod plus coordination
        # "Le gros chat de Susana et son amie"
        #     -> [Le gros chat], [Susana], [son amie]
        (
            ["Le", "gros", "chat", "de", "Susana", "et", "son", "amie"],
            [2, 2, 2, 4, 2, 7, 7, 2],
            ["det", "amod", "ROOT", "case", "nmod", "cc", "det", "conj"],
            ["DET", "ADJ", "NOUN", "ADP", "PROPN", "CCONJ", "DET", "NOUN"],
            [(0, 3), (4, 5), (6, 8)],
        ),
        # Passive subject with an agent phrase
        # "Les nouvelles dépenses sont alimentées par le grand compte
        # bancaire de Clinton"
        #     -> [Les nouvelles dépenses], [le grand compte bancaire], [Clinton]
        (
            [
                "Les", "nouvelles", "dépenses", "sont", "alimentées", "par",
                "le", "grand", "compte", "bancaire", "de", "Clinton",
            ],
            [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
            [
                "det", "amod", "nsubj:pass", "aux:pass", "ROOT", "case",
                "det", "amod", "obl:agent", "amod", "case", "nmod",
            ],
            [
                "DET", "ADJ", "NOUN", "AUX", "VERB", "ADP",
                "DET", "ADJ", "NOUN", "ADJ", "ADP", "PROPN",
            ],
            [(0, 3), (6, 10), (11, 12)],
        ),
    ],
)
# fmt: on
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
    """Compare the noun chunk spans of a hand-annotated Doc with the expected offsets.

    A Doc is built directly from the given words, heads, dependency labels
    and POS tags, and the (start, end) token offsets of its noun chunks must
    equal ``chunk_offsets`` exactly, in order.
    """
    annotated = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
    found_spans = [(chunk.start, chunk.end) for chunk in annotated.noun_chunks]
    assert found_spans == chunk_offsets


def test_noun_chunks_is_parsed_fr(fr_tokenizer):
    """Check that iterating noun_chunks on a tokenizer-only French Doc raises ValueError."""
    tokenized_only = fr_tokenizer("Je suis allé à l'école")
    # The chunks are consumed with list() inside the context manager so that
    # an error raised during iteration is caught by pytest.raises.
    with pytest.raises(ValueError):
        list(tokenized_only.noun_chunks)
-												Limiting noun_chunks for specific languages (#5396)

* Limiting noun_chunks for specific languages

* Limiting noun_chunks for specific languages

Contributor Agreement

* Addressing review comments

* Removed unused fixtures and imports

* Add fa_tokenizer in test suite

* Use fa_tokenizer in test

* Undo extraneous reformatting

Co-authored-by: adrianeboyd <adrianeboyd@gmail.com>
											
										
										
											2020-05-14 13:58:06 +03:00
+								import pytest
-												isort all the things

											
										
										
											2023-06-26 12:41:03 +03:00
+								from spacy.tokens import Doc
-												Limiting noun_chunks for specific languages (#5396)

* Limiting noun_chunks for specific languages

* Limiting noun_chunks for specific languages

Contributor Agreement

* Addressing review comments

* Removed unused fixtures and imports

* Add fa_tokenizer in test suite

* Use fa_tokenizer in test

* Undo extraneous reformatting

Co-authored-by: adrianeboyd <adrianeboyd@gmail.com>
											
										
										
											2020-05-14 13:58:06 +03:00
-												French NP review (#9667)

* adapted from pt

* added basic tests

* added fr vocab

* fixed noun chunks

* more examples

* typo fix

* changed naming

* changed the naming

* typo fix
											
										
										
											2021-11-30 14:19:07 +03:00
+								# fmt: off
 								@pytest.mark.parametrize(
 								    "words,heads,deps,pos,chunk_offsets",
 								    [
 								        # determiner + noun
 								        # un nom -> un nom
 								        (
 								            ["un", "nom"],
 								            [1, 1],
 								            ["det", "ROOT"],
 								            ["DET", "NOUN"],
 								            [(0, 2)],
 								        ),
 								        # determiner + noun starting with vowel
 								        # l'heure -> l'heure
 								        (
 								            ["l'", "heure"],
 								            [1, 1],
 								            ["det", "ROOT"],
 								            ["DET", "NOUN"],
 								            [(0, 2)],
 								        ),
 								        # determiner + plural noun
 								        # les romans -> les romans
 								        (
 								            ["les", "romans"],
 								            [1, 1],
 								            ["det", "ROOT"],
 								            ["DET", "NOUN"],
 								            [(0, 2)],
 								        ),
 								        # det + adj + noun
 								        # Le vieux Londres  -> Le vieux Londres
 								        (
 								            ['Les', 'vieux', 'Londres'],
 								            [2, 2, 2],
 								            ["det", "amod", "ROOT"],
 								            ["DET", "ADJ", "NOUN"],
 								            [(0,3)]
 								        ),
 								        # det + noun + adj
 								        # le nom propre  -> le nom propre   a proper noun
 								        (
 								            ["le", "nom", "propre"],
 								            [1, 1, 1],
 								            ["det", "ROOT", "amod"],
 								            ["DET", "NOUN", "ADJ"],
 								            [(0, 3)],
 								        ),
 								        # det + noun + adj plural
 								        # Les chiens bruns  -> les chiens bruns
 								        (
 								            ["Les", "chiens", "bruns"],
 								            [1, 1, 1],
 								            ["det", "ROOT", "amod"],
 								            ["DET", "NOUN", "ADJ"],
 								            [(0, 3)],
 								        ),
 								        # multiple adjectives: one adj before the noun, one adj after the noun
 								        # un nouveau film intéressant -> un nouveau film intéressant
 								        (
 								            ["un", "nouveau", "film", "intéressant"],
 								            [2, 2, 2, 2],
 								            ["det", "amod", "ROOT", "amod"],
 								            ["DET", "ADJ", "NOUN", "ADJ"],
 								            [(0,4)]
 								        ),
 								        # multiple adjectives, both adjs after the noun
 								        # une personne intelligente et drôle -> une personne intelligente et drôle
 								        (
 								            ["une", "personne", "intelligente", "et", "drôle"],
 								            [1, 1, 1, 4, 2],
 								            ["det", "ROOT", "amod", "cc", "conj"],
 								            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
 								            [(0,5)]
 								        ),
 								        # relative pronoun
 								        # un bus qui va au ville -> un bus, qui, ville
 								        (
 								            ['un', 'bus', 'qui', 'va', 'au', 'ville'],
 								            [1, 1, 3, 1, 5, 3],
 								            ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
 								            ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
 								            [(0,2), (2,3), (5,6)]
 								        ),
 								        # relative subclause
 								        # Voilà la maison que nous voulons acheter -> la maison, nous         That's the house that we want to buy.
 								        (
 								            ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
 								            [0, 2, 0, 5, 5, 2, 5],
 								            ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
 								            ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
 								            [(1,3), (4,5)]
 								        ),
 								        # Person name and title by flat
 								        # Louis XIV -> Louis XIV
 								        (
 								            ["Louis", "XIV"],
 								            [0, 0],
 								            ["ROOT", "flat:name"],
 								            ["PROPN", "PROPN"],
 								            [(0,2)]
 								        ),
 								        # Organization name by flat
 								        # Nations Unies -> Nations Unies
 								        (
 								            ["Nations", "Unies"],
 								            [0, 0],
 								            ["ROOT", "flat:name"],
 								            ["PROPN", "PROPN"],
 								            [(0,2)]
 								        ),
 								        # Noun compound, person name created by two flats
 								        # Louise de Bratagne -> Louise de Bratagne
 								        (
 								            ["Louise", "de", "Bratagne"],
 								            [0, 0, 0],
 								            ["ROOT", "flat:name", "flat:name"],
 								            ["PROPN", "PROPN", "PROPN"],
 								            [(0,3)]
 								        ),
 								        # Noun compound, person name created by two flats
 								        # Louis François Joseph -> Louis François Joseph
 								        (
 								            ["Louis", "François", "Joseph"],
 								            [0, 0, 0],
 								            ["ROOT", "flat:name", "flat:name"],
 								            ["PROPN", "PROPN", "PROPN"],
 								            [(0,3)]
 								        ),
 								        # one determiner + one noun + one adjective qualified by an adverb
 								        # quelques agriculteurs très riches -> quelques agriculteurs très riches
 								        (
 								            ["quelques", "agriculteurs", "très", "riches"],
 								            [1, 1, 3, 1],
 								            ['det', 'ROOT', 'advmod', 'amod'],
 								            ['DET', 'NOUN', 'ADV', 'ADJ'],
 								            [(0,4)]
 								        ),
 								        # Two NPs conjuncted
 								        # Il a un chien et un chat -> Il, un chien, un chat
 								        (
 								            ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
 								            [1, 1, 3, 1, 6, 6, 3],
 								            ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
 								            ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
 								            [(0,1), (2,4), (5,7)]
 								        ),
 								        # Two NPs together
 								        # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
 								        (
 								            ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
 								            [1, 1, 1, 1, 3],
 								            ['det', 'ROOT', 'amod', 'appos', 'flat:name'],
 								            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
 								            [(0, 3), (3, 5)]
 								        ),
 								        # nmod relation between NPs
 								        # la destruction de la ville -> la destruction, la ville
 								        (
 								            ['la', 'destruction', 'de', 'la', 'ville'],
 								            [1, 1, 4, 4, 1],
 								            ['det', 'ROOT', 'case', 'det', 'nmod'],
 								            ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
 								            [(0,2), (3,5)]
 								        ),
 								        # nmod relation between NPs
 								        # Archiduchesse d’Autriche -> Archiduchesse, Autriche
 								        (
 								            ['Archiduchesse', 'd’', 'Autriche'],
 								            [0, 2, 0],
 								            ['ROOT', 'case', 'nmod'],
 								            ['NOUN', 'ADP', 'PROPN'],
 								            [(0,1), (2,3)]
 								        ),
 								        # Compounding by nmod, several NPs chained together
 								        # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
 								        (
 								            ["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
 								            [2, 2, 2, 4, 2, 6, 2],
 								            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
 								            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
 								            [(0, 3), (4, 5), (6, 7)]
 								        ),
 								        # several NPs
 								        # Traduction du rapport de Susana -> Traduction, rapport, Susana
 								        (
 								            ['Traduction', 'du', 'raport', 'de', 'Susana'],
 								            [0, 2, 0, 4, 2],
 								            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
 								            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
 								            [(0,1), (2,3), (4,5)]
 								        ),
 								        # Several NPs
 								        # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
 								        (
 								            ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
 								            [2, 2, 2, 4, 2, 7, 7, 2],
 								            ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
 								            ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
 								            [(0,3), (4,5), (6,8)]
 								        ),
 								        # Passive subject
 								        # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
 								        (
 								            ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
 								            [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
 								            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
 								            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
 								            [(0, 3), (6, 10), (11, 12)]
 								        )
 								    ],
 								)
 								# fmt: on
 								def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
 								    doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
 								    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
-												Limiting noun_chunks for specific languages (#5396)

* Limiting noun_chunks for specific languages

* Limiting noun_chunks for specific languages

Contributor Agreement

* Addressing review comments

* Removed unused fixtures and imports

* Add fa_tokenizer in test suite

* Use fa_tokenizer in test

* Undo extraneous reformatting

Co-authored-by: adrianeboyd <adrianeboyd@gmail.com>
											
										
										
											2020-05-14 13:58:06 +03:00
+								def test_noun_chunks_is_parsed_fr(fr_tokenizer):
-												Tidy up and auto-format

											
										
										
											2020-09-29 22:39:28 +03:00
+								    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
-												French NP review (#9667)

* adapted from pt

* added basic tests

* added fr vocab

* fixed noun chunks

* more examples

* typo fix

* changed naming

* changed the naming

* typo fix
											
										
										
											2021-11-30 14:19:07 +03:00
+								    doc = fr_tokenizer("Je suis allé à l'école")
-												Limiting noun_chunks for specific languages (#5396)

* Limiting noun_chunks for specific languages

* Limiting noun_chunks for specific languages

Contributor Agreement

* Addressing review comments

* Removed unused fixtures and imports

* Add fa_tokenizer in test suite

* Use fa_tokenizer in test

* Undo extraneous reformatting

Co-authored-by: adrianeboyd <adrianeboyd@gmail.com>
											
										
										
											2020-05-14 13:58:06 +03:00
+								    with pytest.raises(ValueError):
 								        list(doc.noun_chunks)