mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-09 16:10:33 +03:00
French NP review (#9667)
* adapted from pt * added basic tests * added fr vocab * fixed noun chunks * more examples * typo fix * changed naming * changed the naming * typo fix
This commit is contained in:
parent
25bd9f9d48
commit
29f28d1f3e
|
@ -6,16 +6,35 @@ from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""
|
||||||
# fmt: off
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
"""
|
||||||
# fmt: on
|
labels = [
|
||||||
|
"nsubj",
|
||||||
|
"nsubj:pass",
|
||||||
|
"obj",
|
||||||
|
"obl",
|
||||||
|
"obl:agent",
|
||||||
|
"obl:arg",
|
||||||
|
"obl:mod",
|
||||||
|
"nmod",
|
||||||
|
"pcomp",
|
||||||
|
"appos",
|
||||||
|
"ROOT",
|
||||||
|
]
|
||||||
|
post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
|
||||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
if not doc.has_annotation("DEP"):
|
if not doc.has_annotation("DEP"):
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
np_deps = {doc.vocab.strings.add(label) for label in labels}
|
||||||
conj = doc.vocab.strings.add("conj")
|
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
|
adj_label = doc.vocab.strings.add("amod")
|
||||||
|
det_label = doc.vocab.strings.add("det")
|
||||||
|
det_pos = doc.vocab.strings.add("DET")
|
||||||
|
adp_pos = doc.vocab.strings.add("ADP")
|
||||||
|
conj_label = doc.vocab.strings.add("conj")
|
||||||
|
conj_pos = doc.vocab.strings.add("CCONJ")
|
||||||
prev_end = -1
|
prev_end = -1
|
||||||
for i, word in enumerate(doclike):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
if word.left_edge.i <= prev_end:
|
if word.left_edge.i <= prev_end:
|
||||||
continue
|
continue
|
||||||
if word.dep in np_deps:
|
if word.dep in np_deps:
|
||||||
prev_end = word.right_edge.i
|
right_childs = list(word.rights)
|
||||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
right_child = right_childs[0] if right_childs else None
|
||||||
elif word.dep == conj:
|
|
||||||
|
if right_child:
|
||||||
|
if (
|
||||||
|
right_child.dep == adj_label
|
||||||
|
): # allow chain of adjectives by expanding to right
|
||||||
|
right_end = right_child.right_edge
|
||||||
|
elif (
|
||||||
|
right_child.dep == det_label and right_child.pos == det_pos
|
||||||
|
): # cut relative pronouns here
|
||||||
|
right_end = right_child
|
||||||
|
elif right_child.dep in np_modifs: # Check if we can expand to right
|
||||||
|
right_end = word.right_edge
|
||||||
|
else:
|
||||||
|
right_end = word
|
||||||
|
else:
|
||||||
|
right_end = word
|
||||||
|
prev_end = right_end.i
|
||||||
|
|
||||||
|
left_index = word.left_edge.i
|
||||||
|
left_index = (
|
||||||
|
left_index + 1 if word.left_edge.pos == adp_pos else left_index
|
||||||
|
)
|
||||||
|
|
||||||
|
yield left_index, right_end.i + 1, np_label
|
||||||
|
elif word.dep == conj_label:
|
||||||
head = word.head
|
head = word.head
|
||||||
while head.dep == conj and head.head.i < head.i:
|
while head.dep == conj_label and head.head.i < head.i:
|
||||||
head = head.head
|
head = head.head
|
||||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
if head.dep in np_deps:
|
if head.dep in np_deps:
|
||||||
prev_end = word.right_edge.i
|
prev_end = word.i
|
||||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
|
||||||
|
left_index = word.left_edge.i # eliminate left attached conjunction
|
||||||
|
left_index = (
|
||||||
|
left_index + 1 if word.left_edge.pos == conj_pos else left_index
|
||||||
|
)
|
||||||
|
yield left_index, word.i + 1, np_label
|
||||||
|
|
||||||
|
|
||||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||||
|
|
|
@ -145,6 +145,11 @@ def fr_tokenizer():
|
||||||
return get_lang_class("fr")().tokenizer
|
return get_lang_class("fr")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def fr_vocab():
|
||||||
|
return get_lang_class("fr")().vocab
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def ga_tokenizer():
|
def ga_tokenizer():
|
||||||
return get_lang_class("ga")().tokenizer
|
return get_lang_class("ga")().tokenizer
|
||||||
|
|
|
@ -1,8 +1,230 @@
|
||||||
|
from spacy.tokens import Doc
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"words,heads,deps,pos,chunk_offsets",
|
||||||
|
[
|
||||||
|
# determiner + noun
|
||||||
|
# un nom -> un nom
|
||||||
|
(
|
||||||
|
["un", "nom"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0, 2)],
|
||||||
|
),
|
||||||
|
# determiner + noun starting with vowel
|
||||||
|
# l'heure -> l'heure
|
||||||
|
(
|
||||||
|
["l'", "heure"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0, 2)],
|
||||||
|
),
|
||||||
|
# determiner + plural noun
|
||||||
|
# les romans -> les romans
|
||||||
|
(
|
||||||
|
["les", "romans"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0, 2)],
|
||||||
|
),
|
||||||
|
# det + adj + noun
|
||||||
|
# Le vieux Londres -> Le vieux Londres
|
||||||
|
(
|
||||||
|
['Les', 'vieux', 'Londres'],
|
||||||
|
[2, 2, 2],
|
||||||
|
["det", "amod", "ROOT"],
|
||||||
|
["DET", "ADJ", "NOUN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# det + noun + adj
|
||||||
|
# le nom propre -> le nom propre (English: "the proper noun")
|
||||||
|
(
|
||||||
|
["le", "nom", "propre"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "amod"],
|
||||||
|
["DET", "NOUN", "ADJ"],
|
||||||
|
[(0, 3)],
|
||||||
|
),
|
||||||
|
# det + noun + adj plural
|
||||||
|
# Les chiens bruns -> les chiens bruns
|
||||||
|
(
|
||||||
|
["Les", "chiens", "bruns"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "amod"],
|
||||||
|
["DET", "NOUN", "ADJ"],
|
||||||
|
[(0, 3)],
|
||||||
|
),
|
||||||
|
# multiple adjectives: one adj before the noun, one adj after the noun
|
||||||
|
# un nouveau film intéressant -> un nouveau film intéressant
|
||||||
|
(
|
||||||
|
["un", "nouveau", "film", "intéressant"],
|
||||||
|
[2, 2, 2, 2],
|
||||||
|
["det", "amod", "ROOT", "amod"],
|
||||||
|
["DET", "ADJ", "NOUN", "ADJ"],
|
||||||
|
[(0,4)]
|
||||||
|
),
|
||||||
|
# multiple adjectives, both adjs after the noun
|
||||||
|
# une personne intelligente et drôle -> une personne intelligente et drôle
|
||||||
|
(
|
||||||
|
["une", "personne", "intelligente", "et", "drôle"],
|
||||||
|
[1, 1, 1, 4, 2],
|
||||||
|
["det", "ROOT", "amod", "cc", "conj"],
|
||||||
|
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
|
||||||
|
[(0,5)]
|
||||||
|
),
|
||||||
|
# relative pronoun
|
||||||
|
# un bus qui va au ville -> un bus, qui, ville
|
||||||
|
(
|
||||||
|
['un', 'bus', 'qui', 'va', 'au', 'ville'],
|
||||||
|
[1, 1, 3, 1, 5, 3],
|
||||||
|
['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
|
||||||
|
['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
|
||||||
|
[(0,2), (2,3), (5,6)]
|
||||||
|
),
|
||||||
|
# relative subclause
|
||||||
|
# Voilà la maison que nous voulons acheter -> la maison, nous (English: "That's the house that we want to buy.")
|
||||||
|
(
|
||||||
|
['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
|
||||||
|
[0, 2, 0, 5, 5, 2, 5],
|
||||||
|
['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
|
||||||
|
['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
|
||||||
|
[(1,3), (4,5)]
|
||||||
|
),
|
||||||
|
# Person name and title by flat
|
||||||
|
# Louis XIV -> Louis XIV
|
||||||
|
(
|
||||||
|
["Louis", "XIV"],
|
||||||
|
[0, 0],
|
||||||
|
["ROOT", "flat:name"],
|
||||||
|
["PROPN", "PROPN"],
|
||||||
|
[(0,2)]
|
||||||
|
),
|
||||||
|
# Organization name by flat
|
||||||
|
# Nations Unies -> Nations Unies
|
||||||
|
(
|
||||||
|
["Nations", "Unies"],
|
||||||
|
[0, 0],
|
||||||
|
["ROOT", "flat:name"],
|
||||||
|
["PROPN", "PROPN"],
|
||||||
|
[(0,2)]
|
||||||
|
),
|
||||||
|
# Noun compound, person name created by two flats
|
||||||
|
# Louise de Bratagne -> Louise de Bratagne
|
||||||
|
(
|
||||||
|
["Louise", "de", "Bratagne"],
|
||||||
|
[0, 0, 0],
|
||||||
|
["ROOT", "flat:name", "flat:name"],
|
||||||
|
["PROPN", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# Noun compound, person name created by two flats
|
||||||
|
# Louis François Joseph -> Louis François Joseph
|
||||||
|
(
|
||||||
|
["Louis", "François", "Joseph"],
|
||||||
|
[0, 0, 0],
|
||||||
|
["ROOT", "flat:name", "flat:name"],
|
||||||
|
["PROPN", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# one determiner + one noun + one adjective qualified by an adverb
|
||||||
|
# quelques agriculteurs très riches -> quelques agriculteurs très riches
|
||||||
|
(
|
||||||
|
["quelques", "agriculteurs", "très", "riches"],
|
||||||
|
[1, 1, 3, 1],
|
||||||
|
['det', 'ROOT', 'advmod', 'amod'],
|
||||||
|
['DET', 'NOUN', 'ADV', 'ADJ'],
|
||||||
|
[(0,4)]
|
||||||
|
),
|
||||||
|
# Two coordinated NPs
|
||||||
|
# Il a un chien et un chat -> Il, un chien, un chat
|
||||||
|
(
|
||||||
|
['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
|
||||||
|
[1, 1, 3, 1, 6, 6, 3],
|
||||||
|
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
|
||||||
|
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
|
||||||
|
[(0,1), (2,4), (5,7)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Two NPs together
|
||||||
|
# l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
|
||||||
|
(
|
||||||
|
["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
|
||||||
|
[1, 1, 1, 1, 3],
|
||||||
|
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
|
||||||
|
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
|
||||||
|
[(0, 3), (3, 5)]
|
||||||
|
),
|
||||||
|
# nmod relation between NPs
|
||||||
|
# la destruction de la ville -> la destruction, la ville
|
||||||
|
(
|
||||||
|
['la', 'destruction', 'de', 'la', 'ville'],
|
||||||
|
[1, 1, 4, 4, 1],
|
||||||
|
['det', 'ROOT', 'case', 'det', 'nmod'],
|
||||||
|
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
|
||||||
|
[(0,2), (3,5)]
|
||||||
|
),
|
||||||
|
# nmod relation between NPs
|
||||||
|
# Archiduchesse d’Autriche -> Archiduchesse, Autriche
|
||||||
|
(
|
||||||
|
['Archiduchesse', 'd’', 'Autriche'],
|
||||||
|
[0, 2, 0],
|
||||||
|
['ROOT', 'case', 'nmod'],
|
||||||
|
['NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0,1), (2,3)]
|
||||||
|
),
|
||||||
|
# Compounding by nmod, several NPs chained together
|
||||||
|
# la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
|
||||||
|
(
|
||||||
|
["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
|
||||||
|
[2, 2, 2, 4, 2, 6, 2],
|
||||||
|
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||||
|
[(0, 3), (4, 5), (6, 7)]
|
||||||
|
),
|
||||||
|
# several NPs
|
||||||
|
# Traduction du rapport de Susana -> Traduction, rapport, Susana
|
||||||
|
(
|
||||||
|
['Traduction', 'du', 'raport', 'de', 'Susana'],
|
||||||
|
[0, 2, 0, 4, 2],
|
||||||
|
['ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0,1), (2,3), (4,5)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Several NPs
|
||||||
|
# Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
|
||||||
|
(
|
||||||
|
['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
|
||||||
|
[2, 2, 2, 4, 2, 7, 7, 2],
|
||||||
|
['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
|
||||||
|
[(0,3), (4,5), (6,8)]
|
||||||
|
),
|
||||||
|
# Passive subject
|
||||||
|
# Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
|
||||||
|
(
|
||||||
|
['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
|
||||||
|
[2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
|
||||||
|
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
|
||||||
|
[(0, 3), (6, 10), (11, 12)]
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
# fmt: on
|
||||||
|
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
|
||||||
|
doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
|
||||||
|
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
||||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
doc = fr_tokenizer("Je suis allé à l'école")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user