Mirror of https://github.com/explosion/spaCy.git
	Lemmatizer ro (#2319)
* Add Romanian lemmatizer lookup table. Adapted from http://www.lexiconista.com/datasets/lemmatization/ by replacing cedillas with commas (ș and ț). The original dataset is licensed under the Open Database License.
* Fix one blatant issue in the Romanian lemmatizer
* Romanian examples file
* Add ro_tokenizer in conftest
* Add Romanian lemmatizer test
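The cedilla-to-comma substitution mentioned above (legacy ş/ţ cedilla forms normalised to the correct Romanian comma-below characters ș/ț) can be reproduced in a few lines of Python. This is a hypothetical sketch of that preprocessing step; the actual conversion script is not part of this commit:

# coding: utf8
# Hypothetical sketch: map legacy cedilla diacritics (U+015F, U+0163)
# to the correct comma-below forms (U+0219, U+021B).
CEDILLA_TO_COMMA = {
    "ş": "ș", "Ş": "Ș",  # s-cedilla -> s-comma-below
    "ţ": "ț", "Ţ": "Ț",  # t-cedilla -> t-comma-below
}

def normalize_diacritics(text):
    for old, new in CEDILLA_TO_COMMA.items():
        text = text.replace(old, new)
    return text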
This commit is contained in:
commit 0e08e49e87 (parent ae3719ece5)
spacy/lang/ro/__init__.py:

@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -17,6 +18,7 @@ class RomanianDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP


 class Romanian(Language):
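With lemma_lookup registered on RomanianDefaults, lookup lemmatization applies to Romanian tokens out of the box. A minimal sketch of the expected behaviour, mirroring the test added below (assumes the spaCy 2.x API this commit targets):

from spacy.lang.ro import Romanian

nlp = Romanian()
doc = nlp("câini")
# The lemma is resolved through the new LOOKUP table.
print(doc[0].lemma_)  # expected: 'câine'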
							
								
								
									
spacy/lang/ro/examples.py (new file, 23 lines)

@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ro import Romanian
+>>> from spacy.lang.ro.examples import sentences
+>>> nlp = Romanian()
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari",
+    "Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar",
+    "Londra este un oraș mare în Regatul Unit",
+    "Unde ești?",
+    "Cine este președintele Franței?",
+    "Care este capitala Statelor Unite?",
+    "Când s-a născut Barack Obama?"
+]
							
								
								
									
spacy/lang/ro/lemmatizer.py (new file, 314816 lines)

(File diff suppressed because it is too large)
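The suppressed lemmatizer.py presumably consists of a single large LOOKUP dict mapping inflected Romanian forms to their lemmas, in the style of spaCy's other lookup lemmatizer tables. A hypothetical excerpt, showing only the entries exercised by the test added below:

# coding: utf8
# Hypothetical excerpt of spacy/lang/ro/lemmatizer.py; the real table holds
# several hundred thousand entries derived from the Lexiconista dataset.
from __future__ import unicode_literals

LOOKUP = {
    "câini": "câine",
    "expedițiilor": "expediție",
    "pensete": "pensetă",
    "erau": "fi",
    # ...
}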
											
										
									
								
spacy/tests/conftest.py:

@@ -15,7 +15,7 @@ from .. import util
 # here if it's using spaCy's tokenizer (not a different library)
 # TODO: re-implement generic tokenizer tests
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'tr', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'tr', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],
@@ -100,6 +100,11 @@ def fi_tokenizer():
     return util.get_lang_class('fi').Defaults.create_tokenizer()


+@pytest.fixture
+def ro_tokenizer():
+    return util.get_lang_class('ro').Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def id_tokenizer():
     return util.get_lang_class('id').Defaults.create_tokenizer()
							
								
								
									
spacy/tests/lang/ro/__init__.py (new empty file)

spacy/tests/lang/ro/test_lemmatizer.py (new file, 13 lines)
@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('string,lemma', [('câini', 'câine'),
+                                          ('expedițiilor', 'expediție'),
+                                          ('pensete', 'pensetă'),
+                                          ('erau', 'fi')])
+def test_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
+    tokens = ro_tokenizer(string)
+    assert tokens[0].lemma_ == lemma