mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-20 21:40:35 +03:00
* Edited Slovenian stop words list (#9707) * Noun chunks for Italian (#9662) * added it vocab * copied portuguese * added possessive determiner * added conjed Nps * added nmoded Nps * test misc * more examples * fixed typo * fixed parenth * fixed comma * comma fix * added syntax iters * fix some index problems * fixed index * corrected heads for test case * fixed tets case * fixed determiner gender * cleaned left over * added example with apostophe * French NP review (#9667) * adapted from pt * added basic tests * added fr vocab * fixed noun chunks * more examples * typo fix * changed naming * changed the naming * typo fix * Add Japanese kana characters to default exceptions (fix #9693) (#9742) This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included, but because their actual use is rare I omitted them for now, but maybe they should be added. The omitted blocks are: - Kana Supplement - Kana Extended (A and B) - Small Kana Extension * Remove NER words from stop words in Norwegian (#9820) Default stop words in Norwegian bokmål (nb) in Spacy contain important entities, e.g. France, Germany, Russia, Sweden and USA, police district, important units of time, e.g. months and days of the week, and organisations. Nobody expects their presence among the default stop words. There is a danger of users complying with the general recommendation of filtering out stop words, while being unaware of filtering out important entities from their data. See explanation in https://github.com/explosion/spaCy/issues/3052#issuecomment-986756711 and comment https://github.com/explosion/spaCy/issues/3052#issuecomment-986951831 * Bump sudachipy version * Update sudachipy versions * Bump versions Bumping to the most recent dictionary just to keep thing current. Bumping sudachipy to 5.2 because older versions don't support recent dictionaries. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Richard Hudson <richard@explosion.ai> Co-authored-by: Duygu Altinok <duygu@explosion.ai> Co-authored-by: Haakon Meland Eriksen <haakon.eriksen@far.no>
222 lines
8.4 KiB
Python
222 lines
8.4 KiB
Python
from spacy.tokens import Doc
|
|
import pytest
|
|
|
|
|
|
# fmt: off
|
|
@pytest.mark.parametrize(
|
|
"words,heads,deps,pos,chunk_offsets",
|
|
[
|
|
# determiner + noun
|
|
# un pollo -> un pollo
|
|
(
|
|
["un", "pollo"],
|
|
[1, 1],
|
|
["det", "ROOT"],
|
|
["DET", "NOUN"],
|
|
[(0,2)],
|
|
),
|
|
# two determiners + noun
|
|
# il mio cane -> il mio cane
|
|
(
|
|
["il", "mio", "cane"],
|
|
[2, 2, 2],
|
|
["det", "det:poss", "ROOT"],
|
|
["DET", "DET", "NOUN"],
|
|
[(0,3)],
|
|
),
|
|
# two determiners, one is after noun. rare usage but still testing
|
|
# il cane mio-> il cane mio
|
|
(
|
|
["il", "cane", "mio"],
|
|
[1, 1, 1],
|
|
["det", "ROOT", "det:poss"],
|
|
["DET", "NOUN", "DET"],
|
|
[(0,3)],
|
|
),
|
|
# relative pronoun
|
|
# È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty.
|
|
(
|
|
["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
|
|
[2, 2, 2, 4, 2, 7, 7, 4],
|
|
['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
|
|
['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
|
|
[(3,5), (5,6)]
|
|
),
|
|
# relative subclause
|
|
# il computer che hai comprato -> il computer, che the computer that you bought
|
|
(
|
|
['il', 'computer', 'che', 'hai', 'comprato'],
|
|
[1, 1, 4, 4, 1],
|
|
['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
|
|
['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
|
|
[(0,2), (2,3)]
|
|
),
|
|
# det + noun + adj
|
|
# Una macchina grande -> Una macchina grande
|
|
(
|
|
["Una", "macchina", "grande"],
|
|
[1, 1, 1],
|
|
["det", "ROOT", "amod"],
|
|
["DET", "NOUN", "ADJ"],
|
|
[(0,3)],
|
|
),
|
|
# noun + adj plural
|
|
# mucche bianche
|
|
(
|
|
["mucche", "bianche"],
|
|
[0, 0],
|
|
["ROOT", "amod"],
|
|
["NOUN", "ADJ"],
|
|
[(0,2)],
|
|
),
|
|
# det + adj + noun
|
|
# Una grande macchina -> Una grande macchina
|
|
(
|
|
['Una', 'grande', 'macchina'],
|
|
[2, 2, 2],
|
|
["det", "amod", "ROOT"],
|
|
["DET", "ADJ", "NOUN"],
|
|
[(0,3)]
|
|
),
|
|
# det + adj + noun, det with apostrophe
|
|
# un'importante associazione -> un'importante associazione
|
|
(
|
|
["Un'", 'importante', 'associazione'],
|
|
[2, 2, 2],
|
|
["det", "amod", "ROOT"],
|
|
["DET", "ADJ", "NOUN"],
|
|
[(0,3)]
|
|
),
|
|
# multiple adjectives
|
|
# Un cane piccolo e marrone -> Un cane piccolo e marrone
|
|
(
|
|
["Un", "cane", "piccolo", "e", "marrone"],
|
|
[1, 1, 1, 4, 2],
|
|
["det", "ROOT", "amod", "cc", "conj"],
|
|
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
|
|
[(0,5)]
|
|
),
|
|
# determiner, adjective, compound created by flat
|
|
# le Nazioni Unite -> le Nazioni Unite
|
|
(
|
|
["le", "Nazioni", "Unite"],
|
|
[1, 1, 1],
|
|
["det", "ROOT", "flat:name"],
|
|
["DET", "PROPN", "PROPN"],
|
|
[(0,3)]
|
|
),
|
|
# one determiner + one noun + one adjective qualified by an adverb
|
|
# alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers
|
|
(
|
|
['alcuni', 'contadini', 'molto', 'ricchi'],
|
|
[1, 1, 3, 1],
|
|
['det', 'ROOT', 'advmod', 'amod'],
|
|
['DET', 'NOUN', 'ADV', 'ADJ'],
|
|
[(0,4)]
|
|
),
|
|
# Two NPs conjuncted
|
|
# Ho un cane e un gatto -> un cane, un gatto
|
|
(
|
|
['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
|
|
[0, 2, 0, 5, 5, 0],
|
|
['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
|
|
['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
|
|
[(1,3), (4,6)]
|
|
|
|
),
|
|
# Two NPs together
|
|
# lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
|
|
(
|
|
['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
|
|
[1, 1, 1, 1, 3],
|
|
['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
|
|
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
|
|
[(0, 3), (3, 5)]
|
|
),
|
|
# Noun compound, person name and titles
|
|
# Dom Pedro II -> Dom Pedro II
|
|
(
|
|
["Dom", "Pedro", "II"],
|
|
[0, 0, 0],
|
|
["ROOT", "flat:name", "flat:name"],
|
|
["PROPN", "PROPN", "PROPN"],
|
|
[(0,3)]
|
|
),
|
|
# Noun compound created by flat
|
|
# gli Stati Uniti
|
|
(
|
|
["gli", "Stati", "Uniti"],
|
|
[1, 1, 1],
|
|
["det", "ROOT", "flat:name"],
|
|
["DET", "PROPN", "PROPN"],
|
|
[(0,3)]
|
|
),
|
|
# nmod relation between NPs
|
|
# la distruzione della città -> la distruzione, città
|
|
(
|
|
['la', 'distruzione', 'della', 'città'],
|
|
[1, 1, 3, 1],
|
|
['det', 'ROOT', 'case', 'nmod'],
|
|
['DET', 'NOUN', 'ADP', 'NOUN'],
|
|
[(0,2), (3,4)]
|
|
),
|
|
# Compounding by nmod, several NPs chained together
|
|
# la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
|
|
(
|
|
["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
|
|
[2, 2, 2, 4, 2, 6, 2],
|
|
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
|
|
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
|
[(0, 3), (4, 5), (6, 7)]
|
|
),
|
|
# several NPs
|
|
# Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
|
|
(
|
|
['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
|
|
[0, 2, 0, 4, 2],
|
|
['ROOT', 'case', 'nmod', 'case', 'nmod'],
|
|
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
|
[(0,1), (2,3), (4,5)]
|
|
|
|
),
|
|
# Several NPs
|
|
# Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
|
|
(
|
|
['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
|
|
[1, 1, 1, 4, 1, 8, 8, 8, 1],
|
|
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
|
|
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
|
|
[(0,3), (4,5), (6,9)]
|
|
),
|
|
# Passive subject
|
|
# La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton
|
|
(
|
|
['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
|
|
[2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
|
|
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
|
|
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
|
[(0, 3), (6, 8), (9, 10), (11,12)]
|
|
),
|
|
# Misc
|
|
# Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti
|
|
(
|
|
['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
|
|
[15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
|
|
['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
|
|
['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
|
[(2,4), (9,12), (13,14), (17,18), (19,20)]
|
|
)
|
|
],
|
|
)
|
|
# fmt: on
|
|
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
|
|
doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
|
|
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
|
|
|
|
|
|
def test_noun_chunks_is_parsed_it(it_tokenizer):
|
|
"""Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed."""
|
|
doc = it_tokenizer("Sei andato a Oxford")
|
|
with pytest.raises(ValueError):
|
|
list(doc.noun_chunks)
|