mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
58bdd8607b
* Edited Slovenian stop words list (#9707) * Noun chunks for Italian (#9662) * added it vocab * copied Portuguese * added possessive determiner * added conjed NPs * added nmoded NPs * test misc * more examples * fixed typo * fixed parenth * fixed comma * comma fix * added syntax iters * fix some index problems * fixed index * corrected heads for test case * fixed test case * fixed determiner gender * cleaned left over * added example with apostrophe * French NP review (#9667) * adapted from pt * added basic tests * added fr vocab * fixed noun chunks * more examples * typo fix * changed naming * changed the naming * typo fix * Add Japanese kana characters to default exceptions (fix #9693) (#9742) This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included, but because their actual use is rare I omitted them for now, but maybe they should be added. The omitted blocks are: - Kana Supplement - Kana Extended (A and B) - Small Kana Extension * Remove NER words from stop words in Norwegian (#9820) Default stop words in Norwegian bokmål (nb) in spaCy contain important entities, e.g. France, Germany, Russia, Sweden and USA, police district, important units of time, e.g. months and days of the week, and organisations. Nobody expects their presence among the default stop words. There is a danger of users complying with the general recommendation of filtering out stop words, while being unaware of filtering out important entities from their data. See explanation in https://github.com/explosion/spaCy/issues/3052#issuecomment-986756711 and comment https://github.com/explosion/spaCy/issues/3052#issuecomment-986951831 * Bump sudachipy version * Update sudachipy versions * Bump versions Bumping to the most recent dictionary just to keep things current. Bumping sudachipy to 5.2 because older versions don't support recent dictionaries.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Richard Hudson <richard@explosion.ai> Co-authored-by: Duygu Altinok <duygu@explosion.ai> Co-authored-by: Haakon Meland Eriksen <haakon.eriksen@far.no>
231 lines
8.2 KiB
Python
231 lines
8.2 KiB
Python
from spacy.tokens import Doc
|
||
import pytest
|
||
|
||
|
||
# fmt: off
|
||
@pytest.mark.parametrize(
    "words,heads,deps,pos,chunk_offsets",
    [
        # determiner + noun
        # un nom -> un nom
        (
            ["un", "nom"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # determiner + noun starting with vowel
        # l'heure -> l'heure
        (
            ["l'", "heure"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # determiner + plural noun
        # les romans -> les romans
        (
            ["les", "romans"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # det + adj + noun
        # Le vieux Londres -> Le vieux Londres
        (
            ["Le", "vieux", "Londres"],
            [2, 2, 2],
            ["det", "amod", "ROOT"],
            ["DET", "ADJ", "NOUN"],
            [(0, 3)],
        ),
        # det + noun + adj
        # le nom propre -> le nom propre (a proper noun)
        (
            ["le", "nom", "propre"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # det + noun + adj plural
        # Les chiens bruns -> les chiens bruns
        (
            ["Les", "chiens", "bruns"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # multiple adjectives: one adj before the noun, one adj after the noun
        # un nouveau film intéressant -> un nouveau film intéressant
        (
            ["un", "nouveau", "film", "intéressant"],
            [2, 2, 2, 2],
            ["det", "amod", "ROOT", "amod"],
            ["DET", "ADJ", "NOUN", "ADJ"],
            [(0, 4)],
        ),
        # multiple adjectives, both adjs after the noun
        # une personne intelligente et drôle -> une personne intelligente et drôle
        (
            ["une", "personne", "intelligente", "et", "drôle"],
            [1, 1, 1, 4, 2],
            ["det", "ROOT", "amod", "cc", "conj"],
            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
            [(0, 5)],
        ),
        # relative pronoun
        # un bus qui va au ville -> un bus, qui, ville
        (
            ["un", "bus", "qui", "va", "au", "ville"],
            [1, 1, 3, 1, 5, 3],
            ["det", "ROOT", "nsubj", "acl:relcl", "case", "obl:arg"],
            ["DET", "NOUN", "PRON", "VERB", "ADP", "NOUN"],
            [(0, 2), (2, 3), (5, 6)],
        ),
        # relative subclause
        # Voilà la maison que nous voulons acheter -> la maison, nous
        # (That's the house that we want to buy.)
        (
            ["Voilà", "la", "maison", "que", "nous", "voulons", "acheter"],
            [0, 2, 0, 5, 5, 2, 5],
            ["ROOT", "det", "obj", "mark", "nsubj", "acl:relcl", "xcomp"],
            ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"],
            [(1, 3), (4, 5)],
        ),
        # Person name and title by flat
        # Louis XIV -> Louis XIV
        (
            ["Louis", "XIV"],
            [0, 0],
            ["ROOT", "flat:name"],
            ["PROPN", "PROPN"],
            [(0, 2)],
        ),
        # Organization name by flat
        # Nations Unies -> Nations Unies
        (
            ["Nations", "Unies"],
            [0, 0],
            ["ROOT", "flat:name"],
            ["PROPN", "PROPN"],
            [(0, 2)],
        ),
        # Noun compound, person name created by two flats
        # Louise de Bretagne -> Louise de Bretagne
        (
            ["Louise", "de", "Bretagne"],
            [0, 0, 0],
            ["ROOT", "flat:name", "flat:name"],
            ["PROPN", "PROPN", "PROPN"],
            [(0, 3)],
        ),
        # Noun compound, person name created by two flats
        # Louis François Joseph -> Louis François Joseph
        (
            ["Louis", "François", "Joseph"],
            [0, 0, 0],
            ["ROOT", "flat:name", "flat:name"],
            ["PROPN", "PROPN", "PROPN"],
            [(0, 3)],
        ),
        # one determiner + one noun + one adjective qualified by an adverb
        # quelques agriculteurs très riches -> quelques agriculteurs très riches
        (
            ["quelques", "agriculteurs", "très", "riches"],
            [1, 1, 3, 1],
            ["det", "ROOT", "advmod", "amod"],
            ["DET", "NOUN", "ADV", "ADJ"],
            [(0, 4)],
        ),
        # Two NPs conjuncted
        # Il a un chien et un chat -> Il, un chien, un chat
        (
            ["Il", "a", "un", "chien", "et", "un", "chat"],
            [1, 1, 3, 1, 6, 6, 3],
            ["nsubj", "ROOT", "det", "obj", "cc", "det", "conj"],
            ["PRON", "VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"],
            [(0, 1), (2, 4), (5, 7)],
        ),
        # Two NPs together
        # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
        (
            ["l'", "écrivain", "brésilien", "Aníbal", "Machado"],
            [1, 1, 1, 1, 3],
            ["det", "ROOT", "amod", "appos", "flat:name"],
            ["DET", "NOUN", "ADJ", "PROPN", "PROPN"],
            [(0, 3), (3, 5)],
        ),
        # nmod relation between NPs
        # la destruction de la ville -> la destruction, la ville
        (
            ["la", "destruction", "de", "la", "ville"],
            [1, 1, 4, 4, 1],
            ["det", "ROOT", "case", "det", "nmod"],
            ["DET", "NOUN", "ADP", "DET", "NOUN"],
            [(0, 2), (3, 5)],
        ),
        # nmod relation between NPs
        # Archiduchesse d’Autriche -> Archiduchesse, Autriche
        (
            ["Archiduchesse", "d’", "Autriche"],
            [0, 2, 0],
            ["ROOT", "case", "nmod"],
            ["NOUN", "ADP", "PROPN"],
            [(0, 1), (2, 3)],
        ),
        # Compounding by nmod, several NPs chained together
        # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
        (
            ["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
            [2, 2, 2, 4, 2, 6, 2],
            ["det", "amod", "ROOT", "case", "nmod", "case", "nmod"],
            ["DET", "ADJ", "NOUN", "ADP", "NOUN", "ADP", "NOUN"],
            [(0, 3), (4, 5), (6, 7)],
        ),
        # several NPs
        # Traduction du rapport de Susana -> Traduction, rapport, Susana
        (
            ["Traduction", "du", "rapport", "de", "Susana"],
            [0, 2, 0, 4, 2],
            ["ROOT", "case", "nmod", "case", "nmod"],
            ["NOUN", "ADP", "NOUN", "ADP", "PROPN"],
            [(0, 1), (2, 3), (4, 5)],
        ),
        # Several NPs
        # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
        (
            ["Le", "gros", "chat", "de", "Susana", "et", "son", "amie"],
            [2, 2, 2, 4, 2, 7, 7, 2],
            ["det", "amod", "ROOT", "case", "nmod", "cc", "det", "conj"],
            ["DET", "ADJ", "NOUN", "ADP", "PROPN", "CCONJ", "DET", "NOUN"],
            [(0, 3), (4, 5), (6, 8)],
        ),
        # Passive subject
        # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton
        #   -> Les nouvelles dépenses, le grand compte bancaire, Clinton
        (
            ["Les", "nouvelles", "dépenses", "sont", "alimentées", "par", "le", "grand", "compte", "bancaire", "de", "Clinton"],
            [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
            ["det", "amod", "nsubj:pass", "aux:pass", "ROOT", "case", "det", "amod", "obl:agent", "amod", "case", "nmod"],
            ["DET", "ADJ", "NOUN", "AUX", "VERB", "ADP", "DET", "ADJ", "NOUN", "ADJ", "ADP", "PROPN"],
            [(0, 3), (6, 10), (11, 12)],
        ),
    ],
)
# fmt: on
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
    """Check French noun chunk boundaries on hand-annotated parses.

    Each parametrized case supplies the token texts (``words``), the index of
    each token's syntactic head (``heads``; root tokens point to themselves),
    the dependency labels (``deps``) and the coarse part-of-speech tags
    (``pos``). ``chunk_offsets`` lists the expected ``(start, end)`` token
    offsets of every noun chunk, in document order.
    """
    doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
    found_offsets = [(chunk.start, chunk.end) for chunk in doc.noun_chunks]
    assert found_offsets == chunk_offsets
|
||
|
||
|
||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
    """Iterating `noun_chunks` on a French Doc that was only tokenized must raise ValueError."""
    unparsed_doc = fr_tokenizer("Je suis allé à l'école")
    with pytest.raises(ValueError):
        # Consume the iterator fully, in case the error is only raised lazily.
        for _chunk in unparsed_doc.noun_chunks:
            pass
|