mirror of https://github.com/explosion/spaCy.git

Generalize lemmatizer

parent 8083a07c3e
commit 82217c6ec6
@@ -3,39 +3,6 @@ from os import path
 import codecs
 
 
-NOUN_RULES = (
-    ('s', ''),
-    ('ses', 's'),
-    ('ves', 'f'),
-    ('xes', 'x'),
-    ('zes', 'z'),
-    ('ches', 'ch'),
-    ('shes', 'sh'),
-    ('men', 'man'),
-    ('ies', 'y')
-)
-
-
-VERB_RULES = (
-    ("s", ""),
-    ("ies", "y"),
-    ("es", "e"),
-    ("es", ""),
-    ("ed", "e"),
-    ("ed", ""),
-    ("ing", "e"),
-    ("ing", "")
-)
-
-
-ADJ_RULES = (
-    ("er", ""),
-    ("est", ""),
-    ("er", "e"),
-    ("est", "e")
-)
-
-
 class Lemmatizer(object):
     def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
         self.noun_id = noun_id
@@ -48,6 +15,8 @@ class Lemmatizer(object):
             self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
 
     def __call__(self, string, pos):
+
+        return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
         if pos == self.noun_id:
             return self.noun(string)
         elif pos == self.verb_id:
@@ -58,13 +27,13 @@ class Lemmatizer(object):
             raise Exception("Cannot lemmatize with unknown pos: %s" % pos)
 
     def noun(self, string):
-        return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)
+        return self(string, 'noun')
 
     def verb(self, string):
-        return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)
+        return self(string, 'verb')
 
     def adj(self, string):
-        return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)
+        return self(string, 'adj')
 
 
 def lemmatize(string, index, exceptions, rules):
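The visible hunks replace the three module-level rule tables with a single per-POS lookup, self.rules[pos], but they do not show where self.rules is populated. Below is a minimal Python sketch of how the generalized pieces could fit together; the RULES mapping and the body of lemmatize are illustrative assumptions that only match the lemmatize(string, index, exceptions, rules) signature and the rule pairs visible in the diff, not necessarily the exact code in the repository.

# Sketch only: a per-POS rules mapping plus a WordNet-style suffix-rewrite
# lookup consistent with the lemmatize(string, index, exceptions, rules)
# signature in the diff. Rule pairs are taken from the deleted tables;
# the function body is an assumption for illustration.

RULES = {
    'noun': (('s', ''), ('ses', 's'), ('ies', 'y')),
    'verb': (('s', ''), ('ies', 'y'), ('ing', 'e'), ('ing', '')),
    'adj':  (('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')),
}


def lemmatize(string, index, exceptions, rules):
    # Check the exception table first, then try each suffix rewrite and
    # keep only candidates attested in the WordNet index for that POS.
    string = string.lower()
    forms = set()
    forms.update(exceptions.get(string, []))
    for old, new in rules:
        if string.endswith(old):
            candidate = string[:len(string) - len(old)] + new
            if candidate in index:
                forms.add(candidate)
    if string in index:
        forms.add(string)
    return forms or {string}

With the tables keyed by POS like this, __call__ only needs the POS key ('noun', 'verb', or 'adj') to select the matching index, exception table, and rule set, which is what lets noun(), verb(), and adj() collapse into thin wrappers around self(string, pos).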