mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	This reverts commit 58bdd8607b.
			
			
This commit is contained in:
		
							parent
							
								
									6a8619dd73
								
							
						
					
					
						commit
						add52935ff
					
				|  | @ -108,8 +108,8 @@ apple = | |||
|     thinc-apple-ops>=0.0.4,<1.0.0 | ||||
| # Language tokenizers with external dependencies | ||||
| ja = | ||||
|     sudachipy>=0.5.2,!=0.6.1 | ||||
|     sudachidict_core>=20211220 | ||||
|     sudachipy>=0.4.9 | ||||
|     sudachidict_core>=20200330 | ||||
| ko = | ||||
|     natto-py==0.9.0 | ||||
| th = | ||||
|  |  | |||
|  | @ -45,10 +45,6 @@ _hangul_syllables = r"\uAC00-\uD7AF" | |||
| _hangul_jamo = r"\u1100-\u11FF" | ||||
| _hangul = _hangul_syllables + _hangul_jamo | ||||
| 
 | ||||
| _hiragana = r"\u3040-\u309F" | ||||
| _katakana = r"\u30A0-\u30FFー" | ||||
| _kana = _hiragana + _katakana | ||||
| 
 | ||||
| # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh | ||||
| _latin_u_extendedA = ( | ||||
|     r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" | ||||
|  | @ -248,7 +244,6 @@ _uncased = ( | |||
|     + _tamil | ||||
|     + _telugu | ||||
|     + _hangul | ||||
|     + _kana | ||||
|     + _cjk | ||||
| ) | ||||
| 
 | ||||
|  |  | |||
|  | @ -6,35 +6,16 @@ from ...tokens import Doc, Span | |||
| 
 | ||||
| 
 | ||||
| def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | ||||
|     """ | ||||
|     Detect base noun phrases from a dependency parse. Works on both Doc and Span. | ||||
|     """ | ||||
|     labels = [ | ||||
|         "nsubj", | ||||
|         "nsubj:pass", | ||||
|         "obj", | ||||
|         "obl", | ||||
|         "obl:agent", | ||||
|         "obl:arg", | ||||
|         "obl:mod", | ||||
|         "nmod", | ||||
|         "pcomp", | ||||
|         "appos", | ||||
|         "ROOT", | ||||
|     ] | ||||
|     post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"] | ||||
|     """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" | ||||
|     # fmt: off | ||||
|     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] | ||||
|     # fmt: on | ||||
|     doc = doclike.doc  # Ensure works on both Doc and Span. | ||||
|     if not doc.has_annotation("DEP"): | ||||
|         raise ValueError(Errors.E029) | ||||
|     np_deps = {doc.vocab.strings.add(label) for label in labels} | ||||
|     np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} | ||||
|     np_deps = [doc.vocab.strings[label] for label in labels] | ||||
|     conj = doc.vocab.strings.add("conj") | ||||
|     np_label = doc.vocab.strings.add("NP") | ||||
|     adj_label = doc.vocab.strings.add("amod") | ||||
|     det_label = doc.vocab.strings.add("det") | ||||
|     det_pos = doc.vocab.strings.add("DET") | ||||
|     adp_pos = doc.vocab.strings.add("ADP") | ||||
|     conj_label = doc.vocab.strings.add("conj") | ||||
|     conj_pos = doc.vocab.strings.add("CCONJ") | ||||
|     prev_end = -1 | ||||
|     for i, word in enumerate(doclike): | ||||
|         if word.pos not in (NOUN, PROPN, PRON): | ||||
|  | @ -43,45 +24,16 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | |||
|         if word.left_edge.i <= prev_end: | ||||
|             continue | ||||
|         if word.dep in np_deps: | ||||
|             right_childs = list(word.rights) | ||||
|             right_child = right_childs[0] if right_childs else None | ||||
| 
 | ||||
|             if right_child: | ||||
|                 if ( | ||||
|                     right_child.dep == adj_label | ||||
|                 ):  # allow chain of adjectives by expanding to right | ||||
|                     right_end = right_child.right_edge | ||||
|                 elif ( | ||||
|                     right_child.dep == det_label and right_child.pos == det_pos | ||||
|                 ):  # cut relative pronouns here | ||||
|                     right_end = right_child | ||||
|                 elif right_child.dep in np_modifs:  # Check if we can expand to right | ||||
|                     right_end = word.right_edge | ||||
|                 else: | ||||
|                     right_end = word | ||||
|             else: | ||||
|                 right_end = word | ||||
|             prev_end = right_end.i | ||||
| 
 | ||||
|             left_index = word.left_edge.i | ||||
|             left_index = ( | ||||
|                 left_index + 1 if word.left_edge.pos == adp_pos else left_index | ||||
|             ) | ||||
| 
 | ||||
|             yield left_index, right_end.i + 1, np_label | ||||
|         elif word.dep == conj_label: | ||||
|             prev_end = word.right_edge.i | ||||
|             yield word.left_edge.i, word.right_edge.i + 1, np_label | ||||
|         elif word.dep == conj: | ||||
|             head = word.head | ||||
|             while head.dep == conj_label and head.head.i < head.i: | ||||
|             while head.dep == conj and head.head.i < head.i: | ||||
|                 head = head.head | ||||
|             # If the head is an NP, and we're coordinated to it, we're an NP | ||||
|             if head.dep in np_deps: | ||||
|                 prev_end = word.i | ||||
| 
 | ||||
|                 left_index = word.left_edge.i  # eliminate left attached conjunction | ||||
|                 left_index = ( | ||||
|                     left_index + 1 if word.left_edge.pos == conj_pos else left_index | ||||
|                 ) | ||||
|                 yield left_index, word.i + 1, np_label | ||||
|                 prev_end = word.right_edge.i | ||||
|                 yield word.left_edge.i, word.right_edge.i + 1, np_label | ||||
| 
 | ||||
| 
 | ||||
| SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} | ||||
|  |  | |||
|  | @ -6,15 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | |||
| from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES | ||||
| from ...language import Language, BaseDefaults | ||||
| from .lemmatizer import ItalianLemmatizer | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| 
 | ||||
| 
 | ||||
| class ItalianDefaults(BaseDefaults): | ||||
|     tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||
|     stop_words = STOP_WORDS | ||||
|     prefixes = TOKENIZER_PREFIXES | ||||
|     infixes = TOKENIZER_INFIXES | ||||
|     stop_words = STOP_WORDS | ||||
|     syntax_iterators = SYNTAX_ITERATORS | ||||
| 
 | ||||
| 
 | ||||
| class Italian(Language): | ||||
|  |  | |||
|  | @ -1,86 +0,0 @@ | |||
| from typing import Union, Iterator, Tuple | ||||
| 
 | ||||
| from ...symbols import NOUN, PROPN, PRON | ||||
| from ...errors import Errors | ||||
| from ...tokens import Doc, Span | ||||
| 
 | ||||
| 
 | ||||
| def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | ||||
|     """ | ||||
|     Detect base noun phrases from a dependency parse. Works on both Doc and Span. | ||||
|     """ | ||||
|     labels = [ | ||||
|         "nsubj", | ||||
|         "nsubj:pass", | ||||
|         "obj", | ||||
|         "obl", | ||||
|         "obl:agent", | ||||
|         "nmod", | ||||
|         "pcomp", | ||||
|         "appos", | ||||
|         "ROOT", | ||||
|     ] | ||||
|     post_modifiers = ["flat", "flat:name", "fixed", "compound"] | ||||
|     dets = ["det", "det:poss"] | ||||
|     doc = doclike.doc  # Ensure works on both Doc and Span. | ||||
|     if not doc.has_annotation("DEP"): | ||||
|         raise ValueError(Errors.E029) | ||||
|     np_deps = {doc.vocab.strings.add(label) for label in labels} | ||||
|     np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} | ||||
|     np_label = doc.vocab.strings.add("NP") | ||||
|     adj_label = doc.vocab.strings.add("amod") | ||||
|     det_labels = {doc.vocab.strings.add(det) for det in dets} | ||||
|     det_pos = doc.vocab.strings.add("DET") | ||||
|     adp_label = doc.vocab.strings.add("ADP") | ||||
|     conj = doc.vocab.strings.add("conj") | ||||
|     conj_pos = doc.vocab.strings.add("CCONJ") | ||||
|     prev_end = -1 | ||||
|     for i, word in enumerate(doclike): | ||||
|         if word.pos not in (NOUN, PROPN, PRON): | ||||
|             continue | ||||
|         # Prevent nested chunks from being produced | ||||
|         if word.left_edge.i <= prev_end: | ||||
|             continue | ||||
|         if word.dep in np_deps: | ||||
|             right_childs = list(word.rights) | ||||
|             right_child = right_childs[0] if right_childs else None | ||||
| 
 | ||||
|             if right_child: | ||||
|                 if ( | ||||
|                     right_child.dep == adj_label | ||||
|                 ):  # allow chain of adjectives by expanding to right | ||||
|                     right_end = right_child.right_edge | ||||
|                 elif ( | ||||
|                     right_child.dep in det_labels and right_child.pos == det_pos | ||||
|                 ):  # cut relative pronouns here | ||||
|                     right_end = right_child | ||||
|                 elif right_child.dep in np_modifs:  # Check if we can expand to right | ||||
|                     right_end = word.right_edge | ||||
|                 else: | ||||
|                     right_end = word | ||||
|             else: | ||||
|                 right_end = word | ||||
|             prev_end = right_end.i | ||||
| 
 | ||||
|             left_index = word.left_edge.i | ||||
|             left_index = ( | ||||
|                 left_index + 1 if word.left_edge.pos == adp_label else left_index | ||||
|             ) | ||||
| 
 | ||||
|             yield left_index, right_end.i + 1, np_label | ||||
|         elif word.dep == conj: | ||||
|             head = word.head | ||||
|             while head.dep == conj and head.head.i < head.i: | ||||
|                 head = head.head | ||||
|             # If the head is an NP, and we're coordinated to it, we're an NP | ||||
|             if head.dep in np_deps: | ||||
|                 prev_end = word.i | ||||
| 
 | ||||
|                 left_index = word.left_edge.i  # eliminate left attached conjunction | ||||
|                 left_index = ( | ||||
|                     left_index + 1 if word.left_edge.pos == conj_pos else left_index | ||||
|                 ) | ||||
|                 yield left_index, word.i + 1, np_label | ||||
| 
 | ||||
| 
 | ||||
| SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} | ||||
|  | @ -4,42 +4,46 @@ alle allerede alt and andre annen annet at av | |||
| 
 | ||||
| bak bare bedre beste blant ble bli blir blitt bris by både | ||||
| 
 | ||||
| da dag de del dem den denne der dermed det dette disse du | ||||
| da dag de del dem den denne der dermed det dette disse drept du | ||||
| 
 | ||||
| eller en enn er et ett etter | ||||
| 
 | ||||
| fem fikk fire fjor flere folk for fortsatt fra fram | ||||
| fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag | ||||
| funnet få får fått før først første | ||||
| 
 | ||||
| gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går | ||||
| 
 | ||||
| ha hadde ham han hans har hele helt henne hennes her hun | ||||
| ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan | ||||
| hvorfor | ||||
| 
 | ||||
| i ifølge igjen ikke ingen inn | ||||
| 
 | ||||
| ja jeg | ||||
| 
 | ||||
| kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld | ||||
| kvinner | ||||
| 
 | ||||
| la laget land landet langt leder ligger like litt løpet | ||||
| la laget land landet langt leder ligger like litt løpet lørdag | ||||
| 
 | ||||
| man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte | ||||
| man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer | ||||
| millioner minutter mot msci mye må mål måtte | ||||
| 
 | ||||
| ned neste noe noen nok ny nye nå når | ||||
| ned neste noe noen nok norge norsk norske ntb ny nye nå når | ||||
| 
 | ||||
| og også om opp opplyser oss over | ||||
| og også om onsdag opp opplyser oslo oss over | ||||
| 
 | ||||
| personer plass poeng på | ||||
| personer plass poeng politidistrikt politiet president prosent på | ||||
| 
 | ||||
| runde rundt | ||||
| regjeringen runde rundt russland | ||||
| 
 | ||||
| sa saken samme sammen samtidig satt se seg seks selv senere ser sett | ||||
| sa saken samme sammen samtidig satt se seg seks selv senere september ser sett | ||||
| siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor | ||||
| store står svært så | ||||
| store står sverige svært så søndag | ||||
| 
 | ||||
| ta tatt tid tidligere til tilbake tillegg tok tror | ||||
| ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror | ||||
| tyskland | ||||
| 
 | ||||
| under ut uten utenfor | ||||
| under usa ut uten utenfor | ||||
| 
 | ||||
| vant var ved veldig vi videre viktig vil ville viser vår være vært | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,10 +1,13 @@ | |||
| # Source: https://github.com/stopwords-iso/stopwords-sl | ||||
| # Removed various words that are not normally considered stop words, such as months. | ||||
| # TODO: probably needs to be tidied up – the list seems to have month names in | ||||
| # it, which shouldn't be considered stop words. | ||||
| 
 | ||||
| STOP_WORDS = set( | ||||
|     """ | ||||
| a | ||||
| ali | ||||
| april | ||||
| avgust | ||||
| b | ||||
| bi | ||||
| bil | ||||
|  | @ -16,6 +19,7 @@ biti | |||
| blizu | ||||
| bo | ||||
| bodo | ||||
| bojo | ||||
| bolj | ||||
| bom | ||||
| bomo | ||||
|  | @ -33,6 +37,16 @@ da | |||
| daleč | ||||
| dan | ||||
| danes | ||||
| datum | ||||
| december | ||||
| deset | ||||
| deseta | ||||
| deseti | ||||
| deseto | ||||
| devet | ||||
| deveta | ||||
| deveti | ||||
| deveto | ||||
| do | ||||
| dober | ||||
| dobra | ||||
|  | @ -40,7 +54,16 @@ dobri | |||
| dobro | ||||
| dokler | ||||
| dol | ||||
| dolg | ||||
| dolga | ||||
| dolgi | ||||
| dovolj | ||||
| drug | ||||
| druga | ||||
| drugi | ||||
| drugo | ||||
| dva | ||||
| dve | ||||
| e | ||||
| eden | ||||
| en | ||||
|  | @ -51,6 +74,7 @@ enkrat | |||
| eno | ||||
| etc. | ||||
| f | ||||
| februar | ||||
| g | ||||
| g. | ||||
| ga | ||||
|  | @ -69,12 +93,16 @@ iv | |||
| ix | ||||
| iz | ||||
| j | ||||
| januar | ||||
| jaz | ||||
| je | ||||
| ji | ||||
| jih | ||||
| jim | ||||
| jo | ||||
| julij | ||||
| junij | ||||
| jutri | ||||
| k | ||||
| kadarkoli | ||||
| kaj | ||||
|  | @ -95,23 +123,41 @@ kje | |||
| kjer | ||||
| kjerkoli | ||||
| ko | ||||
| koder | ||||
| koderkoli | ||||
| koga | ||||
| komu | ||||
| kot | ||||
| kratek | ||||
| kratka | ||||
| kratke | ||||
| kratki | ||||
| l | ||||
| lahka | ||||
| lahke | ||||
| lahki | ||||
| lahko | ||||
| le | ||||
| lep | ||||
| lepa | ||||
| lepe | ||||
| lepi | ||||
| lepo | ||||
| leto | ||||
| m | ||||
| maj | ||||
| majhen | ||||
| majhna | ||||
| majhni | ||||
| malce | ||||
| malo | ||||
| manj | ||||
| marec | ||||
| me | ||||
| med | ||||
| medtem | ||||
| mene | ||||
| mesec | ||||
| mi | ||||
| midva | ||||
| midve | ||||
|  | @ -137,6 +183,7 @@ najmanj | |||
| naju | ||||
| največ | ||||
| nam | ||||
| narobe | ||||
| nas | ||||
| nato | ||||
| nazaj | ||||
|  | @ -145,6 +192,7 @@ naša | |||
| naše | ||||
| ne | ||||
| nedavno | ||||
| nedelja | ||||
| nek | ||||
| neka | ||||
| nekaj | ||||
|  | @ -188,6 +236,7 @@ njuna | |||
| njuno | ||||
| no | ||||
| nocoj | ||||
| november | ||||
| npr. | ||||
| o | ||||
| ob | ||||
|  | @ -195,23 +244,51 @@ oba | |||
| obe | ||||
| oboje | ||||
| od | ||||
| odprt | ||||
| odprta | ||||
| odprti | ||||
| okoli | ||||
| oktober | ||||
| on | ||||
| onadva | ||||
| one | ||||
| oni | ||||
| onidve | ||||
| osem | ||||
| osma | ||||
| osmi | ||||
| osmo | ||||
| oz. | ||||
| p | ||||
| pa | ||||
| pet | ||||
| peta | ||||
| petek | ||||
| peti | ||||
| peto | ||||
| po | ||||
| pod | ||||
| pogosto | ||||
| poleg | ||||
| poln | ||||
| polna | ||||
| polni | ||||
| polno | ||||
| ponavadi | ||||
| ponedeljek | ||||
| ponovno | ||||
| potem | ||||
| povsod | ||||
| pozdravljen | ||||
| pozdravljeni | ||||
| prav | ||||
| prava | ||||
| prave | ||||
| pravi | ||||
| pravo | ||||
| prazen | ||||
| prazna | ||||
| prazno | ||||
| prbl. | ||||
| precej | ||||
| pred | ||||
|  | @ -220,10 +297,19 @@ preko | |||
| pri | ||||
| pribl. | ||||
| približno | ||||
| primer | ||||
| pripravljen | ||||
| pripravljena | ||||
| pripravljeni | ||||
| proti | ||||
| prva | ||||
| prvi | ||||
| prvo | ||||
| r | ||||
| ravno | ||||
| redko | ||||
| res | ||||
| reč | ||||
| s | ||||
| saj | ||||
| sam | ||||
|  | @ -235,17 +321,29 @@ se | |||
| sebe | ||||
| sebi | ||||
| sedaj | ||||
| sedem | ||||
| sedma | ||||
| sedmi | ||||
| sedmo | ||||
| sem | ||||
| september | ||||
| seveda | ||||
| si | ||||
| sicer | ||||
| skoraj | ||||
| skozi | ||||
| slab | ||||
| smo | ||||
| so | ||||
| sobota | ||||
| spet | ||||
| sreda | ||||
| srednja | ||||
| srednji | ||||
| sta | ||||
| ste | ||||
| stran | ||||
| stvar | ||||
| sva | ||||
| t | ||||
| ta | ||||
|  | @ -260,6 +358,10 @@ te | |||
| tebe | ||||
| tebi | ||||
| tega | ||||
| težak | ||||
| težka | ||||
| težki | ||||
| težko | ||||
| ti | ||||
| tista | ||||
| tiste | ||||
|  | @ -269,6 +371,11 @@ tj. | |||
| tja | ||||
| to | ||||
| toda | ||||
| torek | ||||
| tretja | ||||
| tretje | ||||
| tretji | ||||
| tri | ||||
| tu | ||||
| tudi | ||||
| tukaj | ||||
|  | @ -285,6 +392,10 @@ vaša | |||
| vaše | ||||
| ve | ||||
| vedno | ||||
| velik | ||||
| velika | ||||
| veliki | ||||
| veliko | ||||
| vendar | ||||
| ves | ||||
| več | ||||
|  | @ -292,6 +403,10 @@ vi | |||
| vidva | ||||
| vii | ||||
| viii | ||||
| visok | ||||
| visoka | ||||
| visoke | ||||
| visoki | ||||
| vsa | ||||
| vsaj | ||||
| vsak | ||||
|  | @ -305,21 +420,34 @@ vsega | |||
| vsi | ||||
| vso | ||||
| včasih | ||||
| včeraj | ||||
| x | ||||
| z | ||||
| za | ||||
| zadaj | ||||
| zadnji | ||||
| zakaj | ||||
| zaprta | ||||
| zaprti | ||||
| zaprto | ||||
| zdaj | ||||
| zelo | ||||
| zunaj | ||||
| č | ||||
| če | ||||
| često | ||||
| četrta | ||||
| četrtek | ||||
| četrti | ||||
| četrto | ||||
| čez | ||||
| čigav | ||||
| š | ||||
| šest | ||||
| šesta | ||||
| šesti | ||||
| šesto | ||||
| štiri | ||||
| ž | ||||
| že | ||||
| """.split() | ||||
|  |  | |||
|  | @ -155,11 +155,6 @@ def fr_tokenizer(): | |||
|     return get_lang_class("fr")().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def fr_vocab(): | ||||
|     return get_lang_class("fr")().vocab | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def ga_tokenizer(): | ||||
|     return get_lang_class("ga")().tokenizer | ||||
|  | @ -210,11 +205,6 @@ def it_tokenizer(): | |||
|     return get_lang_class("it")().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def it_vocab(): | ||||
|     return get_lang_class("it")().vocab | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def ja_tokenizer(): | ||||
|     pytest.importorskip("sudachipy") | ||||
|  |  | |||
|  | @ -1,230 +1,8 @@ | |||
| from spacy.tokens import Doc | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| # fmt: off | ||||
| @pytest.mark.parametrize( | ||||
|     "words,heads,deps,pos,chunk_offsets", | ||||
|     [ | ||||
|         # determiner + noun | ||||
|         # un nom -> un nom | ||||
|         ( | ||||
|             ["un", "nom"], | ||||
|             [1, 1], | ||||
|             ["det", "ROOT"], | ||||
|             ["DET", "NOUN"], | ||||
|             [(0, 2)], | ||||
|         ), | ||||
|         # determiner + noun starting with vowel | ||||
|         # l'heure -> l'heure | ||||
|         ( | ||||
|             ["l'", "heure"], | ||||
|             [1, 1], | ||||
|             ["det", "ROOT"], | ||||
|             ["DET", "NOUN"], | ||||
|             [(0, 2)], | ||||
|         ), | ||||
|         # determiner + plural noun | ||||
|         # les romans -> les romans | ||||
|         ( | ||||
|             ["les", "romans"], | ||||
|             [1, 1], | ||||
|             ["det", "ROOT"], | ||||
|             ["DET", "NOUN"], | ||||
|             [(0, 2)], | ||||
|         ), | ||||
|         # det + adj + noun | ||||
|         # Le vieux Londres  -> Le vieux Londres  | ||||
|         ( | ||||
|             ['Les', 'vieux', 'Londres'], | ||||
|             [2, 2, 2], | ||||
|             ["det", "amod", "ROOT"], | ||||
|             ["DET", "ADJ", "NOUN"], | ||||
|             [(0,3)] | ||||
|         ), | ||||
|         # det + noun + adj | ||||
|         # le nom propre  -> le nom propre   a proper noun | ||||
|         ( | ||||
|             ["le", "nom", "propre"], | ||||
|             [1, 1, 1], | ||||
|             ["det", "ROOT", "amod"], | ||||
|             ["DET", "NOUN", "ADJ"], | ||||
|             [(0, 3)], | ||||
|         ), | ||||
|         # det + noun + adj plural | ||||
|         # Les chiens bruns  -> les chiens bruns | ||||
|         ( | ||||
|             ["Les", "chiens", "bruns"], | ||||
|             [1, 1, 1], | ||||
|             ["det", "ROOT", "amod"], | ||||
|             ["DET", "NOUN", "ADJ"], | ||||
|             [(0, 3)], | ||||
|         ), | ||||
|         # multiple adjectives: one adj before the noun, one adj after the noun | ||||
|         # un nouveau film intéressant -> un nouveau film intéressant | ||||
|         ( | ||||
|             ["un", "nouveau", "film", "intéressant"], | ||||
|             [2, 2, 2, 2], | ||||
|             ["det", "amod", "ROOT", "amod"], | ||||
|             ["DET", "ADJ", "NOUN", "ADJ"], | ||||
|             [(0,4)] | ||||
|         ), | ||||
|         # multiple adjectives, both adjs after the noun | ||||
|         # une personne intelligente et drôle -> une personne intelligente et drôle | ||||
|         ( | ||||
|             ["une", "personne", "intelligente", "et", "drôle"], | ||||
|             [1, 1, 1, 4, 2], | ||||
|             ["det", "ROOT", "amod", "cc", "conj"], | ||||
|             ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], | ||||
|             [(0,5)] | ||||
|         ), | ||||
|         # relative pronoun | ||||
|         # un bus qui va au ville -> un bus, qui, ville | ||||
|         ( | ||||
|             ['un', 'bus', 'qui', 'va', 'au', 'ville'], | ||||
|             [1, 1, 3, 1, 5, 3], | ||||
|             ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'], | ||||
|             ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'], | ||||
|             [(0,2), (2,3), (5,6)] | ||||
|         ), | ||||
|         # relative subclause | ||||
|         # Voilà la maison que nous voulons acheter -> la maison, nous         That's the house that we want to buy. | ||||
|         ( | ||||
|             ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'], | ||||
|             [0, 2, 0, 5, 5, 2, 5], | ||||
|             ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'], | ||||
|             ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'], | ||||
|             [(1,3), (4,5)] | ||||
|         ), | ||||
|         # Person name and title by flat | ||||
|         # Louis XIV -> Louis XIV | ||||
|         ( | ||||
|             ["Louis", "XIV"], | ||||
|             [0, 0], | ||||
|             ["ROOT", "flat:name"], | ||||
|             ["PROPN", "PROPN"], | ||||
|             [(0,2)] | ||||
|         ), | ||||
|         # Organization name by flat | ||||
|         # Nations Unies -> Nations Unies | ||||
|         ( | ||||
|             ["Nations", "Unies"], | ||||
|             [0, 0], | ||||
|             ["ROOT", "flat:name"], | ||||
|             ["PROPN", "PROPN"], | ||||
|             [(0,2)] | ||||
|         ), | ||||
|         # Noun compound, person name created by two flats | ||||
|         # Louise de Bratagne -> Louise de Bratagne | ||||
|         ( | ||||
|             ["Louise", "de", "Bratagne"], | ||||
|             [0, 0, 0], | ||||
|             ["ROOT", "flat:name", "flat:name"], | ||||
|             ["PROPN", "PROPN", "PROPN"], | ||||
|             [(0,3)] | ||||
|         ), | ||||
|         # Noun compound, person name created by two flats | ||||
|         # Louis François Joseph -> Louis François Joseph | ||||
|         ( | ||||
|             ["Louis", "François", "Joseph"], | ||||
|             [0, 0, 0], | ||||
|             ["ROOT", "flat:name", "flat:name"], | ||||
|             ["PROPN", "PROPN", "PROPN"], | ||||
|             [(0,3)] | ||||
|         ), | ||||
|         # one determiner + one noun + one adjective qualified by an adverb | ||||
|         # quelques agriculteurs très riches -> quelques agriculteurs très riches | ||||
|         ( | ||||
|             ["quelques", "agriculteurs", "très", "riches"], | ||||
|             [1, 1, 3, 1], | ||||
|             ['det', 'ROOT', 'advmod', 'amod'], | ||||
|             ['DET', 'NOUN', 'ADV', 'ADJ'], | ||||
|             [(0,4)] | ||||
|         ), | ||||
|         # Two NPs conjuncted | ||||
|         # Il a un chien et un chat -> Il, un chien, un chat | ||||
|         (  | ||||
|             ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], | ||||
|             [1, 1, 3, 1, 6, 6, 3], | ||||
|             ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], | ||||
|             ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], | ||||
|             [(0,1), (2,4), (5,7)] | ||||
|           | ||||
|         ), | ||||
|         # Two NPs together | ||||
|         # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado | ||||
|         ( | ||||
|             ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'], | ||||
|             [1, 1, 1, 1, 3], | ||||
|             ['det', 'ROOT', 'amod', 'appos', 'flat:name'], | ||||
|             ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], | ||||
|             [(0, 3), (3, 5)] | ||||
|         ), | ||||
|         # nmod relation between NPs | ||||
|         # la destruction de la ville -> la destruction, la ville | ||||
|         ( | ||||
|             ['la', 'destruction', 'de', 'la', 'ville'], | ||||
|             [1, 1, 4, 4, 1], | ||||
|             ['det', 'ROOT', 'case', 'det', 'nmod'], | ||||
|             ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], | ||||
|             [(0,2), (3,5)] | ||||
|         ), | ||||
|         # nmod relation between NPs | ||||
|         # Archiduchesse d’Autriche -> Archiduchesse, Autriche | ||||
|         ( | ||||
|             ['Archiduchesse', 'd’', 'Autriche'], | ||||
|             [0, 2, 0], | ||||
|             ['ROOT', 'case', 'nmod'], | ||||
|             ['NOUN', 'ADP', 'PROPN'], | ||||
|             [(0,1), (2,3)] | ||||
|         ), | ||||
|         # Compounding by nmod, several NPs chained together | ||||
|         # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement | ||||
|         ( | ||||
|             ["la", "première", "usine", "de", "drogue", "du", "gouvernement"], | ||||
|             [2, 2, 2, 4, 2, 6, 2], | ||||
|             ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], | ||||
|             ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], | ||||
|             [(0, 3), (4, 5), (6, 7)] | ||||
|         ), | ||||
|         # several NPs | ||||
|         # Traduction du rapport de Susana -> Traduction, rapport, Susana | ||||
|         ( | ||||
|             ['Traduction', 'du', 'raport', 'de', 'Susana'], | ||||
|             [0, 2, 0, 4, 2], | ||||
|             ['ROOT', 'case', 'nmod', 'case', 'nmod'], | ||||
|             ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], | ||||
|             [(0,1), (2,3), (4,5)]   | ||||
|         | ||||
|         ), | ||||
|         # Several NPs | ||||
|         # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie | ||||
|         (   | ||||
|             ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], | ||||
|             [2, 2, 2, 4, 2, 7, 7, 2], | ||||
|             ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], | ||||
|             ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], | ||||
|             [(0,3), (4,5), (6,8)] | ||||
|         ), | ||||
|         # Passive subject | ||||
|         # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton | ||||
|         ( | ||||
|             ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'], | ||||
|             [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8], | ||||
|             ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], | ||||
|             ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], | ||||
|             [(0, 3), (6, 10), (11, 12)] | ||||
|         ) | ||||
|     ], | ||||
| ) | ||||
| # fmt: on | ||||
| def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets): | ||||
|     doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos) | ||||
|     assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets | ||||
| 
 | ||||
| 
 | ||||
| def test_noun_chunks_is_parsed_fr(fr_tokenizer): | ||||
|     """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" | ||||
|     doc = fr_tokenizer("Je suis allé à l'école") | ||||
|     doc = fr_tokenizer("trouver des travaux antérieurs") | ||||
|     with pytest.raises(ValueError): | ||||
|         list(doc.noun_chunks) | ||||
|  |  | |||
|  | @ -1,221 +0,0 @@ | |||
| from spacy.tokens import Doc | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| # fmt: off | ||||
| @pytest.mark.parametrize( | ||||
|     "words,heads,deps,pos,chunk_offsets", | ||||
|     [ | ||||
|         # determiner + noun | ||||
|         # un pollo -> un pollo | ||||
|         ( | ||||
|             ["un", "pollo"], | ||||
|             [1, 1], | ||||
|             ["det", "ROOT"], | ||||
|             ["DET", "NOUN"], | ||||
|             [(0,2)], | ||||
|         ), | ||||
|         # two determiners + noun | ||||
|         # il mio cane -> il mio cane | ||||
|         ( | ||||
|             ["il", "mio", "cane"], | ||||
|             [2, 2, 2], | ||||
|             ["det", "det:poss", "ROOT"], | ||||
|             ["DET", "DET", "NOUN"], | ||||
|             [(0,3)], | ||||
|         ), | ||||
|         # two determiners, one is after noun. rare usage but still testing | ||||
|         # il cane mio-> il cane mio | ||||
|         ( | ||||
|             ["il", "cane", "mio"], | ||||
|             [1, 1, 1], | ||||
|             ["det", "ROOT", "det:poss"], | ||||
|             ["DET", "NOUN", "DET"], | ||||
|             [(0,3)], | ||||
|         ), | ||||
|         # relative pronoun | ||||
|         # È molto bello il vestito che hai acquistat -> il vestito, che   the dress that you bought is very pretty. | ||||
|         ( | ||||
|             ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"], | ||||
|             [2, 2, 2, 4, 2, 7, 7, 4], | ||||
|             ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'], | ||||
|             ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'], | ||||
|             [(3,5), (5,6)] | ||||
|         ), | ||||
|         # relative subclause | ||||
|         # il computer che hai comprato -> il computer, che     the computer that you bought | ||||
|         ( | ||||
|             ['il', 'computer', 'che', 'hai', 'comprato'], | ||||
|             [1, 1, 4, 4, 1], | ||||
|             ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'], | ||||
|             ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'], | ||||
|             [(0,2), (2,3)] | ||||
|         ), | ||||
|         # det + noun + adj | ||||
|         # Una macchina grande  -> Una macchina grande | ||||
|         ( | ||||
|             ["Una", "macchina", "grande"], | ||||
|             [1, 1, 1], | ||||
|             ["det", "ROOT", "amod"], | ||||
|             ["DET", "NOUN", "ADJ"], | ||||
|             [(0,3)], | ||||
|         ), | ||||
|         # noun + adj plural | ||||
|         # mucche bianche  | ||||
|         ( | ||||
|             ["mucche", "bianche"], | ||||
|             [0, 0], | ||||
|             ["ROOT", "amod"], | ||||
|             ["NOUN", "ADJ"], | ||||
|             [(0,2)], | ||||
|         ), | ||||
|         # det + adj + noun | ||||
|         # Una grande macchina -> Una grande macchina | ||||
|         ( | ||||
|             ['Una', 'grande', 'macchina'], | ||||
|             [2, 2, 2], | ||||
|             ["det", "amod", "ROOT"], | ||||
|             ["DET", "ADJ", "NOUN"], | ||||
|             [(0,3)] | ||||
|         ), | ||||
|         # det + adj + noun, det with apostrophe | ||||
|         # un'importante associazione -> un'importante associazione | ||||
|         ( | ||||
|             ["Un'", 'importante', 'associazione'], | ||||
|             [2, 2, 2], | ||||
|             ["det", "amod", "ROOT"], | ||||
|             ["DET", "ADJ", "NOUN"], | ||||
|             [(0,3)] | ||||
|         ), | ||||
|         # multiple adjectives | ||||
|         # Un cane piccolo e marrone -> Un cane piccolo e marrone | ||||
|         ( | ||||
|             ["Un", "cane", "piccolo", "e", "marrone"], | ||||
|             [1, 1, 1, 4, 2], | ||||
|             ["det", "ROOT", "amod", "cc", "conj"], | ||||
|             ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], | ||||
|             [(0,5)] | ||||
|         ), | ||||
|         # determiner, adjective, compound created by flat | ||||
|         # le Nazioni Unite -> le Nazioni Unite | ||||
|         ( | ||||
|             ["le", "Nazioni", "Unite"], | ||||
|             [1, 1, 1], | ||||
|             ["det", "ROOT", "flat:name"], | ||||
|             ["DET", "PROPN", "PROPN"], | ||||
|             [(0,3)] | ||||
|         ), | ||||
|         # one determiner + one noun + one adjective qualified by an adverb | ||||
|         # alcuni contadini molto ricchi -> alcuni contadini molto ricchi     some very rich farmers | ||||
|         ( | ||||
|             ['alcuni', 'contadini', 'molto', 'ricchi'], | ||||
|             [1, 1, 3, 1], | ||||
|             ['det', 'ROOT', 'advmod', 'amod'], | ||||
|             ['DET', 'NOUN', 'ADV', 'ADJ'], | ||||
|             [(0,4)] | ||||
|         ), | ||||
|         # Two NPs conjuncted | ||||
|         # Ho un cane e un gatto -> un cane, un gatto | ||||
|         (  | ||||
|             ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], | ||||
|             [0, 2, 0, 5, 5, 0], | ||||
|             ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], | ||||
|             ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], | ||||
|             [(1,3), (4,6)] | ||||
|           | ||||
|         ), | ||||
|         # Two NPs together | ||||
|         # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado | ||||
|         ( | ||||
|             ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'], | ||||
|             [1, 1, 1, 1, 3], | ||||
|             ['det', 'ROOT', 'amod', 'nmod', 'flat:name'], | ||||
|             ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], | ||||
|             [(0, 3), (3, 5)] | ||||
|         ), | ||||
|         # Noun compound, person name and titles | ||||
|         # Dom Pedro II -> Dom Pedro II | ||||
|         ( | ||||
|             ["Dom", "Pedro", "II"], | ||||
|             [0, 0, 0], | ||||
|             ["ROOT", "flat:name", "flat:name"], | ||||
|             ["PROPN", "PROPN", "PROPN"], | ||||
|             [(0,3)] | ||||
|         ), | ||||
|         # Noun compound created by flat | ||||
|         # gli Stati Uniti | ||||
|         ( | ||||
|             ["gli", "Stati", "Uniti"], | ||||
|             [1, 1, 1], | ||||
|             ["det", "ROOT", "flat:name"], | ||||
|             ["DET", "PROPN", "PROPN"], | ||||
|             [(0,3)] | ||||
|         ), | ||||
|         # nmod relation between NPs | ||||
|         # la distruzione della città -> la distruzione, città | ||||
|         ( | ||||
|             ['la', 'distruzione', 'della', 'città'], | ||||
|             [1, 1, 3, 1], | ||||
|             ['det', 'ROOT', 'case', 'nmod'], | ||||
|             ['DET', 'NOUN', 'ADP', 'NOUN'], | ||||
|             [(0,2), (3,4)] | ||||
|         ), | ||||
|         # Compounding by nmod, several NPs chained together | ||||
|         # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo | ||||
|         ( | ||||
|             ["la", "prima", "fabbrica", "di", "droga", "del", "governo"], | ||||
|             [2, 2, 2, 4, 2, 6, 2], | ||||
|             ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], | ||||
|             ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], | ||||
|             [(0, 3), (4, 5), (6, 7)] | ||||
|         ), | ||||
|         # several NPs | ||||
|         # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana | ||||
|         ( | ||||
|             ['Traduzione', 'del', 'rapporto', 'di', 'Susana'], | ||||
|             [0, 2, 0, 4, 2], | ||||
|             ['ROOT', 'case', 'nmod', 'case', 'nmod'], | ||||
|             ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], | ||||
|             [(0,1), (2,3), (4,5)]   | ||||
|         | ||||
|         ), | ||||
|         # Several NPs | ||||
|         # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica | ||||
|         (   | ||||
|             ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], | ||||
|             [1, 1, 1, 4, 1, 8, 8, 8, 1], | ||||
|             ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], | ||||
|             ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'], | ||||
|             [(0,3), (4,5), (6,9)] | ||||
|         ), | ||||
|         # Passive subject | ||||
        # La nuova spesa è alimentata dal grande conto in banca di Clinton -> La nuova spesa, grande conto, banca, Clinton
|         ( | ||||
|             ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'], | ||||
|             [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9], | ||||
|             ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'], | ||||
|             ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], | ||||
|             [(0, 3), (6, 8), (9, 10), (11,12)] | ||||
|         ), | ||||
|         # Misc | ||||
        # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestito, un improvviso cambiamento, circostanze, problemi, debiti
|         ( | ||||
|             ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'], | ||||
|             [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17], | ||||
|             ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'], | ||||
|             ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'], | ||||
|             [(2,4), (9,12), (13,14), (17,18), (19,20)] | ||||
|         ) | ||||
|     ], | ||||
| ) | ||||
| # fmt: on | ||||
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
    """Check that the Italian noun chunker yields exactly the expected
    (start, end) token offsets for each parametrized dependency parse."""
    doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
    observed = [(chunk.start, chunk.end) for chunk in doc.noun_chunks]
    assert observed == chunk_offsets
| 
 | ||||
| 
 | ||||
def test_noun_chunks_is_parsed_it(it_tokenizer):
    """Ensure ``Doc.noun_chunks`` raises ValueError for Italian when the Doc is not parsed."""
    unparsed = it_tokenizer("Sei andato a Oxford")
    with pytest.raises(ValueError):
        # noun_chunks is lazy; consuming it triggers the missing-parse check.
        list(unparsed.noun_chunks)
		Loading…
	
		Reference in New Issue
	
	Block a user