Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)

Lemmatizer ro (#2319)

* Add a Romanian lemmatizer lookup table, adapted from http://www.lexiconista.com/datasets/lemmatization/ by replacing cedillas with commas (ș and ț); a sketch of that normalisation follows below. The original dataset is licensed under the Open Database License.
* Fix one obvious issue in the Romanian lemmatizer
* Add a Romanian examples file
* Add a ro_tokenizer fixture in conftest
* Add a Romanian lemmatizer test
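
The adaptation mentioned in the first bullet swaps the legacy s/t-with-cedilla code points used by the source dataset for the s/t-with-comma-below characters of modern Romanian orthography. A minimal sketch of that normalisation in Python, assuming plain string replacement (the mapping and helper below are illustrative and not part of this commit):

# Illustrative helper, not part of this commit: map the legacy cedilla
# code points to the comma-below characters used in the adapted table.
CEDILLA_TO_COMMA = {
    "\u015E": "\u0218",  # Ş -> Ș
    "\u015F": "\u0219",  # ş -> ș
    "\u0162": "\u021A",  # Ţ -> Ț
    "\u0163": "\u021B",  # ţ -> ț
}


def replace_cedillas(text):
    """Return text with s/t-cedilla replaced by s/t-comma-below."""
    for cedilla, comma in CEDILLA_TO_COMMA.items():
        text = text.replace(cedilla, comma)
    return text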

parent ae3719ece5
commit 0e08e49e87

spacy/lang/ro/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -17,6 +18,7 @@ class RomanianDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP
 
 
 class Romanian(Language):
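
Setting lemma_lookup = LOOKUP wires the new table into the language defaults: with only the tokenizer in the pipeline, a token's lemma is resolved by a plain dictionary lookup from inflected form to lemma. A rough sketch of that behaviour, using the four entries exercised by the new test further down (lookup_lemma is an illustrative helper, not spaCy API):

# LOOKUP in spacy/lang/ro/lemmatizer.py is a large dict mapping inflected
# forms to lemmas; the four entries below are the ones the new test asserts.
LOOKUP = {
    "câini": "câine",
    "expedițiilor": "expediție",
    "pensete": "pensetă",
    "erau": "fi",
}


def lookup_lemma(word):
    # Fall back to the surface form when the word is not in the table.
    return LOOKUP.get(word, word)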
							
								
								
									
spacy/lang/ro/examples.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ro import Romanian
+>>> from spacy.lang.ro.examples import sentences
+>>> nlp = Romanian()
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari",
+    "Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar",
+    "Londra este un oraș mare în Regatul Unit",
+    "Unde ești?",
+    "Cine este președintele Franței?",
+    "Care este capitala Statelor Unite?",
+    "Când s-a născut Barack Obama?"
+]
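
A short usage sketch combining the docstring above with the new lookup table (assumes a spaCy checkout that includes this commit; the printed lemmas depend on the lookup table's coverage):

# Run the blank Romanian pipeline over the example sentences and print
# the lemma assigned to each token.
from spacy.lang.ro import Romanian
from spacy.lang.ro.examples import sentences

nlp = Romanian()
for doc in nlp.pipe(sentences):
    print([(token.text, token.lemma_) for token in doc])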
							
								
								
									
spacy/lang/ro/lemmatizer.py (new file, 314816 lines)
File diff suppressed because it is too large.
spacy/tests/conftest.py
@@ -15,7 +15,7 @@ from .. import util
 # here if it's using spaCy's tokenizer (not a different library)
 # TODO: re-implement generic tokenizer tests
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'tr', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'tr', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],
@@ -100,6 +100,11 @@ def fi_tokenizer():
     return util.get_lang_class('fi').Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def ro_tokenizer():
+    return util.get_lang_class('ro').Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def id_tokenizer():
     return util.get_lang_class('id').Defaults.create_tokenizer()
spacy/tests/lang/ro/__init__.py (new file, empty)

spacy/tests/lang/ro/test_lemmatizer.py (new file, 13 lines)
@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('string,lemma', [('câini', 'câine'),
+                                          ('expedițiilor', 'expediție'),
+                                          ('pensete', 'pensetă'),
+                                          ('erau', 'fi')])
+def test_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
+    tokens = ro_tokenizer(string)
+    assert tokens[0].lemma_ == lemma
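
The test relies only on the ro_tokenizer fixture added to conftest.py above. One way to run just this module from a source checkout (a hedged example, not part of the commit):

# Invoke pytest programmatically on the new test module; equivalent to
# running "pytest spacy/tests/lang/ro/test_lemmatizer.py -v" from the
# repository root.
import pytest
pytest.main(["spacy/tests/lang/ro/test_lemmatizer.py", "-v"])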