mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	adds Croatian lemma_lookup.json, license file and corresponding tests (#4252)
This commit is contained in:
		
							parent
							
								
									aec755d3a3
								
							
						
					
					
						commit
						b01025dd06
					
				|  | @ -18,6 +18,7 @@ class CroatianDefaults(Language.Defaults): | ||||||
|     ) |     ) | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||||
|     stop_words = STOP_WORDS |     stop_words = STOP_WORDS | ||||||
|  |     resources = {"lemma_lookup": "lemma_lookup.json"} | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Croatian(Language): | class Croatian(Language): | ||||||
|  |  | ||||||
							
								
								
									
										1313609
									
								
								spacy/lang/hr/lemma_lookup.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1313609
									
								
								spacy/lang/hr/lemma_lookup.json
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										15
									
								
								spacy/lang/hr/lemma_lookup_license.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								spacy/lang/hr/lemma_lookup_license.txt
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,15 @@ | ||||||
|  | The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger). | ||||||
|  | Reldi-tagger is licensed under the Apache 2.0 licence. | ||||||
|  | 
 | ||||||
|  | @InProceedings{ljubesic16-new, | ||||||
|  |   author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec}, | ||||||
|  |   title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian}, | ||||||
|  |   booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, | ||||||
|  |   year = {2016}, | ||||||
|  |   date = {23-28}, | ||||||
|  |   location = {Portorož, Slovenia}, | ||||||
|  |   editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis}, | ||||||
|  |   publisher = {European Language Resources Association (ELRA)}, | ||||||
|  |   address = {Paris, France}, | ||||||
|  |   isbn = {978-2-9517408-9-1} | ||||||
|  |  } | ||||||
|  | @ -103,6 +103,11 @@ def he_tokenizer(): | ||||||
|     return get_lang_class("he").Defaults.create_tokenizer() |     return get_lang_class("he").Defaults.create_tokenizer() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.fixture(scope="session") | ||||||
|  | def hr_tokenizer(): | ||||||
|  |     return get_lang_class("hr").Defaults.create_tokenizer() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def hu_tokenizer(): | def hu_tokenizer(): | ||||||
|     return get_lang_class("hu").Defaults.create_tokenizer() |     return get_lang_class("hu").Defaults.create_tokenizer() | ||||||
|  |  | ||||||
							
								
								
									
										20
									
								
								spacy/tests/lang/hr/test_lemma.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								spacy/tests/lang/hr/test_lemma.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,20 @@ | ||||||
|  | # coding: utf-8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize( | ||||||
|  |     "string,lemma", | ||||||
|  |     [ | ||||||
|  |         ("trčao", "trčati"), | ||||||
|  |         ("adekvatnim", "adekvatan"), | ||||||
|  |         ("dekontaminacijama", "dekontaminacija"), | ||||||
|  |         ("filologovih", "filologov"), | ||||||
|  |         ("je", "biti"), | ||||||
|  |         ("se", "sebe"), | ||||||
|  |     ], | ||||||
|  | ) | ||||||
|  | def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma): | ||||||
|  |     tokens = hr_tokenizer(string) | ||||||
|  |     assert tokens[0].lemma_ == lemma | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user