mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	French NP review (#9667)
* adapted from pt * added basic tests * added fr vocab * fixed noun chunks * more examples * typo fix * changed naming * changed the naming * typo fix
This commit is contained in:
		
							parent
							
								
									25bd9f9d48
								
							
						
					
					
						commit
						29f28d1f3e
					
				| 
						 | 
					@ -6,16 +6,35 @@ from ...tokens import Doc, Span
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
 | 
					def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
 | 
				
			||||||
    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
 | 
					    """
 | 
				
			||||||
    # fmt: off
 | 
					    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
 | 
				
			||||||
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
 | 
					    """
 | 
				
			||||||
    # fmt: on
 | 
					    labels = [
 | 
				
			||||||
 | 
					        "nsubj",
 | 
				
			||||||
 | 
					        "nsubj:pass",
 | 
				
			||||||
 | 
					        "obj",
 | 
				
			||||||
 | 
					        "obl",
 | 
				
			||||||
 | 
					        "obl:agent",
 | 
				
			||||||
 | 
					        "obl:arg",
 | 
				
			||||||
 | 
					        "obl:mod",
 | 
				
			||||||
 | 
					        "nmod",
 | 
				
			||||||
 | 
					        "pcomp",
 | 
				
			||||||
 | 
					        "appos",
 | 
				
			||||||
 | 
					        "ROOT",
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					    post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
 | 
				
			||||||
    doc = doclike.doc  # Ensure works on both Doc and Span.
 | 
					    doc = doclike.doc  # Ensure works on both Doc and Span.
 | 
				
			||||||
    if not doc.has_annotation("DEP"):
 | 
					    if not doc.has_annotation("DEP"):
 | 
				
			||||||
        raise ValueError(Errors.E029)
 | 
					        raise ValueError(Errors.E029)
 | 
				
			||||||
    np_deps = [doc.vocab.strings[label] for label in labels]
 | 
					    np_deps = {doc.vocab.strings.add(label) for label in labels}
 | 
				
			||||||
    conj = doc.vocab.strings.add("conj")
 | 
					    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
 | 
				
			||||||
    np_label = doc.vocab.strings.add("NP")
 | 
					    np_label = doc.vocab.strings.add("NP")
 | 
				
			||||||
 | 
					    adj_label = doc.vocab.strings.add("amod")
 | 
				
			||||||
 | 
					    det_label = doc.vocab.strings.add("det")
 | 
				
			||||||
 | 
					    det_pos = doc.vocab.strings.add("DET")
 | 
				
			||||||
 | 
					    adp_pos = doc.vocab.strings.add("ADP")
 | 
				
			||||||
 | 
					    conj_label = doc.vocab.strings.add("conj")
 | 
				
			||||||
 | 
					    conj_pos = doc.vocab.strings.add("CCONJ")
 | 
				
			||||||
    prev_end = -1
 | 
					    prev_end = -1
 | 
				
			||||||
    for i, word in enumerate(doclike):
 | 
					    for i, word in enumerate(doclike):
 | 
				
			||||||
        if word.pos not in (NOUN, PROPN, PRON):
 | 
					        if word.pos not in (NOUN, PROPN, PRON):
 | 
				
			||||||
| 
						 | 
					@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
 | 
				
			||||||
        if word.left_edge.i <= prev_end:
 | 
					        if word.left_edge.i <= prev_end:
 | 
				
			||||||
            continue
 | 
					            continue
 | 
				
			||||||
        if word.dep in np_deps:
 | 
					        if word.dep in np_deps:
 | 
				
			||||||
            prev_end = word.right_edge.i
 | 
					            right_childs = list(word.rights)
 | 
				
			||||||
            yield word.left_edge.i, word.right_edge.i + 1, np_label
 | 
					            right_child = right_childs[0] if right_childs else None
 | 
				
			||||||
        elif word.dep == conj:
 | 
					
 | 
				
			||||||
 | 
					            if right_child:
 | 
				
			||||||
 | 
					                if (
 | 
				
			||||||
 | 
					                    right_child.dep == adj_label
 | 
				
			||||||
 | 
					                ):  # allow chain of adjectives by expanding to right
 | 
				
			||||||
 | 
					                    right_end = right_child.right_edge
 | 
				
			||||||
 | 
					                elif (
 | 
				
			||||||
 | 
					                    right_child.dep == det_label and right_child.pos == det_pos
 | 
				
			||||||
 | 
					                ):  # cut relative pronouns here
 | 
				
			||||||
 | 
					                    right_end = right_child
 | 
				
			||||||
 | 
					                elif right_child.dep in np_modifs:  # Check if we can expand to right
 | 
				
			||||||
 | 
					                    right_end = word.right_edge
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    right_end = word
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                right_end = word
 | 
				
			||||||
 | 
					            prev_end = right_end.i
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            left_index = word.left_edge.i
 | 
				
			||||||
 | 
					            left_index = (
 | 
				
			||||||
 | 
					                left_index + 1 if word.left_edge.pos == adp_pos else left_index
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            yield left_index, right_end.i + 1, np_label
 | 
				
			||||||
 | 
					        elif word.dep == conj_label:
 | 
				
			||||||
            head = word.head
 | 
					            head = word.head
 | 
				
			||||||
            while head.dep == conj and head.head.i < head.i:
 | 
					            while head.dep == conj_label and head.head.i < head.i:
 | 
				
			||||||
                head = head.head
 | 
					                head = head.head
 | 
				
			||||||
            # If the head is an NP, and we're coordinated to it, we're an NP
 | 
					            # If the head is an NP, and we're coordinated to it, we're an NP
 | 
				
			||||||
            if head.dep in np_deps:
 | 
					            if head.dep in np_deps:
 | 
				
			||||||
                prev_end = word.right_edge.i
 | 
					                prev_end = word.i
 | 
				
			||||||
                yield word.left_edge.i, word.right_edge.i + 1, np_label
 | 
					
 | 
				
			||||||
 | 
					                left_index = word.left_edge.i  # eliminate left attached conjunction
 | 
				
			||||||
 | 
					                left_index = (
 | 
				
			||||||
 | 
					                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					                yield left_index, word.i + 1, np_label
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
 | 
					SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -145,6 +145,11 @@ def fr_tokenizer():
 | 
				
			||||||
    return get_lang_class("fr")().tokenizer
 | 
					    return get_lang_class("fr")().tokenizer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.fixture(scope="session")
 | 
				
			||||||
 | 
					def fr_vocab():
 | 
				
			||||||
 | 
					    return get_lang_class("fr")().vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.fixture(scope="session")
 | 
					@pytest.fixture(scope="session")
 | 
				
			||||||
def ga_tokenizer():
 | 
					def ga_tokenizer():
 | 
				
			||||||
    return get_lang_class("ga")().tokenizer
 | 
					    return get_lang_class("ga")().tokenizer
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,8 +1,230 @@
 | 
				
			||||||
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# fmt: off
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "words,heads,deps,pos,chunk_offsets",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        # determiner + noun
 | 
				
			||||||
 | 
					        # un nom -> un nom
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["un", "nom"],
 | 
				
			||||||
 | 
					            [1, 1],
 | 
				
			||||||
 | 
					            ["det", "ROOT"],
 | 
				
			||||||
 | 
					            ["DET", "NOUN"],
 | 
				
			||||||
 | 
					            [(0, 2)],
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # determiner + noun starting with vowel
 | 
				
			||||||
 | 
					        # l'heure -> l'heure
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["l'", "heure"],
 | 
				
			||||||
 | 
					            [1, 1],
 | 
				
			||||||
 | 
					            ["det", "ROOT"],
 | 
				
			||||||
 | 
					            ["DET", "NOUN"],
 | 
				
			||||||
 | 
					            [(0, 2)],
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # determiner + plural noun
 | 
				
			||||||
 | 
					        # les romans -> les romans
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["les", "romans"],
 | 
				
			||||||
 | 
					            [1, 1],
 | 
				
			||||||
 | 
					            ["det", "ROOT"],
 | 
				
			||||||
 | 
					            ["DET", "NOUN"],
 | 
				
			||||||
 | 
					            [(0, 2)],
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # det + adj + noun
 | 
				
			||||||
 | 
					        # Le vieux Londres  -> Le vieux Londres 
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ['Les', 'vieux', 'Londres'],
 | 
				
			||||||
 | 
					            [2, 2, 2],
 | 
				
			||||||
 | 
					            ["det", "amod", "ROOT"],
 | 
				
			||||||
 | 
					            ["DET", "ADJ", "NOUN"],
 | 
				
			||||||
 | 
					            [(0,3)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # det + noun + adj
 | 
				
			||||||
 | 
					        # le nom propre -> le nom propre (English: "the proper noun")
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["le", "nom", "propre"],
 | 
				
			||||||
 | 
					            [1, 1, 1],
 | 
				
			||||||
 | 
					            ["det", "ROOT", "amod"],
 | 
				
			||||||
 | 
					            ["DET", "NOUN", "ADJ"],
 | 
				
			||||||
 | 
					            [(0, 3)],
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # det + noun + adj plural
 | 
				
			||||||
 | 
					        # Les chiens bruns  -> les chiens bruns
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["Les", "chiens", "bruns"],
 | 
				
			||||||
 | 
					            [1, 1, 1],
 | 
				
			||||||
 | 
					            ["det", "ROOT", "amod"],
 | 
				
			||||||
 | 
					            ["DET", "NOUN", "ADJ"],
 | 
				
			||||||
 | 
					            [(0, 3)],
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # multiple adjectives: one adj before the noun, one adj after the noun
 | 
				
			||||||
 | 
					        # un nouveau film intéressant -> un nouveau film intéressant
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["un", "nouveau", "film", "intéressant"],
 | 
				
			||||||
 | 
					            [2, 2, 2, 2],
 | 
				
			||||||
 | 
					            ["det", "amod", "ROOT", "amod"],
 | 
				
			||||||
 | 
					            ["DET", "ADJ", "NOUN", "ADJ"],
 | 
				
			||||||
 | 
					            [(0,4)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # multiple adjectives, both adjs after the noun
 | 
				
			||||||
 | 
					        # une personne intelligente et drôle -> une personne intelligente et drôle
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["une", "personne", "intelligente", "et", "drôle"],
 | 
				
			||||||
 | 
					            [1, 1, 1, 4, 2],
 | 
				
			||||||
 | 
					            ["det", "ROOT", "amod", "cc", "conj"],
 | 
				
			||||||
 | 
					            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
 | 
				
			||||||
 | 
					            [(0,5)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # relative pronoun
 | 
				
			||||||
 | 
					        # un bus qui va au ville -> un bus, qui, ville
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ['un', 'bus', 'qui', 'va', 'au', 'ville'],
 | 
				
			||||||
 | 
					            [1, 1, 3, 1, 5, 3],
 | 
				
			||||||
 | 
					            ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
 | 
				
			||||||
 | 
					            ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
 | 
				
			||||||
 | 
					            [(0,2), (2,3), (5,6)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # relative subclause
 | 
				
			||||||
 | 
					        # Voilà la maison que nous voulons acheter -> la maison, nous (English: "That's the house that we want to buy.")
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
 | 
				
			||||||
 | 
					            [0, 2, 0, 5, 5, 2, 5],
 | 
				
			||||||
 | 
					            ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
 | 
				
			||||||
 | 
					            ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
 | 
				
			||||||
 | 
					            [(1,3), (4,5)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # Person name and title by flat
 | 
				
			||||||
 | 
					        # Louis XIV -> Louis XIV
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["Louis", "XIV"],
 | 
				
			||||||
 | 
					            [0, 0],
 | 
				
			||||||
 | 
					            ["ROOT", "flat:name"],
 | 
				
			||||||
 | 
					            ["PROPN", "PROPN"],
 | 
				
			||||||
 | 
					            [(0,2)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # Organization name by flat
 | 
				
			||||||
 | 
					        # Nations Unies -> Nations Unies
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["Nations", "Unies"],
 | 
				
			||||||
 | 
					            [0, 0],
 | 
				
			||||||
 | 
					            ["ROOT", "flat:name"],
 | 
				
			||||||
 | 
					            ["PROPN", "PROPN"],
 | 
				
			||||||
 | 
					            [(0,2)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # Noun compound, person name created by two flats
 | 
				
			||||||
 | 
					        # Louise de Bratagne -> Louise de Bratagne
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["Louise", "de", "Bratagne"],
 | 
				
			||||||
 | 
					            [0, 0, 0],
 | 
				
			||||||
 | 
					            ["ROOT", "flat:name", "flat:name"],
 | 
				
			||||||
 | 
					            ["PROPN", "PROPN", "PROPN"],
 | 
				
			||||||
 | 
					            [(0,3)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # Noun compound, person name created by two flats
 | 
				
			||||||
 | 
					        # Louis François Joseph -> Louis François Joseph
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["Louis", "François", "Joseph"],
 | 
				
			||||||
 | 
					            [0, 0, 0],
 | 
				
			||||||
 | 
					            ["ROOT", "flat:name", "flat:name"],
 | 
				
			||||||
 | 
					            ["PROPN", "PROPN", "PROPN"],
 | 
				
			||||||
 | 
					            [(0,3)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # one determiner + one noun + one adjective qualified by an adverb
 | 
				
			||||||
 | 
					        # quelques agriculteurs très riches -> quelques agriculteurs très riches
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["quelques", "agriculteurs", "très", "riches"],
 | 
				
			||||||
 | 
					            [1, 1, 3, 1],
 | 
				
			||||||
 | 
					            ['det', 'ROOT', 'advmod', 'amod'],
 | 
				
			||||||
 | 
					            ['DET', 'NOUN', 'ADV', 'ADJ'],
 | 
				
			||||||
 | 
					            [(0,4)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # Two coordinated NPs
 | 
				
			||||||
 | 
					        # Il a un chien et un chat -> Il, un chien, un chat
 | 
				
			||||||
 | 
					        ( 
 | 
				
			||||||
 | 
					            ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
 | 
				
			||||||
 | 
					            [1, 1, 3, 1, 6, 6, 3],
 | 
				
			||||||
 | 
					            ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
 | 
				
			||||||
 | 
					            ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
 | 
				
			||||||
 | 
					            [(0,1), (2,4), (5,7)]
 | 
				
			||||||
 | 
					         
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # Two NPs together
 | 
				
			||||||
 | 
					        # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
 | 
				
			||||||
 | 
					            [1, 1, 1, 1, 3],
 | 
				
			||||||
 | 
					            ['det', 'ROOT', 'amod', 'appos', 'flat:name'],
 | 
				
			||||||
 | 
					            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
 | 
				
			||||||
 | 
					            [(0, 3), (3, 5)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # nmod relation between NPs
 | 
				
			||||||
 | 
					        # la destruction de la ville -> la destruction, la ville
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ['la', 'destruction', 'de', 'la', 'ville'],
 | 
				
			||||||
 | 
					            [1, 1, 4, 4, 1],
 | 
				
			||||||
 | 
					            ['det', 'ROOT', 'case', 'det', 'nmod'],
 | 
				
			||||||
 | 
					            ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
 | 
				
			||||||
 | 
					            [(0,2), (3,5)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # nmod relation between NPs
 | 
				
			||||||
 | 
					        # Archiduchesse d’Autriche -> Archiduchesse, Autriche
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ['Archiduchesse', 'd’', 'Autriche'],
 | 
				
			||||||
 | 
					            [0, 2, 0],
 | 
				
			||||||
 | 
					            ['ROOT', 'case', 'nmod'],
 | 
				
			||||||
 | 
					            ['NOUN', 'ADP', 'PROPN'],
 | 
				
			||||||
 | 
					            [(0,1), (2,3)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # Compounding by nmod, several NPs chained together
 | 
				
			||||||
 | 
					        # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
 | 
				
			||||||
 | 
					            [2, 2, 2, 4, 2, 6, 2],
 | 
				
			||||||
 | 
					            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
 | 
				
			||||||
 | 
					            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
 | 
				
			||||||
 | 
					            [(0, 3), (4, 5), (6, 7)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # several NPs
 | 
				
			||||||
 | 
					        # Traduction du rapport de Susana -> Traduction, rapport, Susana
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ['Traduction', 'du', 'raport', 'de', 'Susana'],
 | 
				
			||||||
 | 
					            [0, 2, 0, 4, 2],
 | 
				
			||||||
 | 
					            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
 | 
				
			||||||
 | 
					            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
 | 
				
			||||||
 | 
					            [(0,1), (2,3), (4,5)]  
 | 
				
			||||||
 | 
					       
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # Several NPs
 | 
				
			||||||
 | 
					        # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
 | 
				
			||||||
 | 
					        (  
 | 
				
			||||||
 | 
					            ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
 | 
				
			||||||
 | 
					            [2, 2, 2, 4, 2, 7, 7, 2],
 | 
				
			||||||
 | 
					            ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
 | 
				
			||||||
 | 
					            ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
 | 
				
			||||||
 | 
					            [(0,3), (4,5), (6,8)]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        # Passive subject
 | 
				
			||||||
 | 
					        # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
 | 
				
			||||||
 | 
					        (
 | 
				
			||||||
 | 
					            ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
 | 
				
			||||||
 | 
					            [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
 | 
				
			||||||
 | 
					            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
 | 
				
			||||||
 | 
					            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
 | 
				
			||||||
 | 
					            [(0, 3), (6, 10), (11, 12)]
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					# fmt: on
 | 
				
			||||||
 | 
					def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
 | 
				
			||||||
 | 
					    doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
 | 
				
			||||||
 | 
					    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
 | 
					def test_noun_chunks_is_parsed_fr(fr_tokenizer):
 | 
				
			||||||
    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
 | 
					    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
 | 
				
			||||||
    doc = fr_tokenizer("trouver des travaux antérieurs")
 | 
					    doc = fr_tokenizer("Je suis allé à l'école")
 | 
				
			||||||
    with pytest.raises(ValueError):
 | 
					    with pytest.raises(ValueError):
 | 
				
			||||||
        list(doc.noun_chunks)
 | 
					        list(doc.noun_chunks)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user