mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			59 lines
		
	
	
		
			1.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			59 lines
		
	
	
		
			1.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import List
 | |
| from collections import OrderedDict
 | |
| 
 | |
| from ...pipeline import Lemmatizer
 | |
| from ...tokens import Token
 | |
| 
 | |
| 
 | |
| class MacedonianLemmatizer(Lemmatizer):
 | |
|     def rule_lemmatize(self, token: Token) -> List[str]:
 | |
|         string = token.text
 | |
|         univ_pos = token.pos_.lower()
 | |
| 
 | |
|         if univ_pos in ("", "eol", "space"):
 | |
|             return [string.lower()]
 | |
| 
 | |
|         if string[-3:] == "јќи":
 | |
|             string = string[:-3]
 | |
|             univ_pos = "verb"
 | |
| 
 | |
|         index_table = self.lookups.get_table("lemma_index", {})
 | |
|         exc_table = self.lookups.get_table("lemma_exc", {})
 | |
|         rules_table = self.lookups.get_table("lemma_rules", {})
 | |
|         if not any(
 | |
|             (
 | |
|                 index_table.get(univ_pos),
 | |
|                 exc_table.get(univ_pos),
 | |
|                 rules_table.get(univ_pos),
 | |
|             )
 | |
|         ):
 | |
|             if univ_pos == "propn":
 | |
|                 return [string]
 | |
|             else:
 | |
|                 return [string.lower()]
 | |
| 
 | |
|         index = index_table.get(univ_pos, {})
 | |
|         exceptions = exc_table.get(univ_pos, {})
 | |
|         rules = rules_table.get(univ_pos, [])
 | |
| 
 | |
|         orig = string
 | |
|         string = string.lower()
 | |
|         forms = []
 | |
| 
 | |
|         for old, new in rules:
 | |
|             if string.endswith(old):
 | |
|                 form = string[: len(string) - len(old)] + new
 | |
|                 if not form:
 | |
|                     continue
 | |
|                 if form in index or not form.isalpha():
 | |
|                     forms.append(form)
 | |
| 
 | |
|         forms = list(OrderedDict.fromkeys(forms))
 | |
|         for form in exceptions.get(string, []):
 | |
|             if form not in forms:
 | |
|                 forms.insert(0, form)
 | |
|         if not forms:
 | |
|             forms.append(orig)
 | |
| 
 | |
|         return forms
 |