mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Add support for punctuation lemmatization, so that Unicode punctuation characters (curly quotes and the en dash) are mapped to their ASCII equivalents. This should help address Issue #130
This commit is contained in:
		
							parent
							
								
									b71ba2eed5
								
							
						
					
					
						commit
						5332c0b697
					
				|  | @ -27,5 +27,11 @@ | |||
|         ["est", ""], | ||||
|         ["er", "e"], | ||||
|         ["est", "e"] | ||||
|     ], | ||||
| 
 | ||||
|     "punct": [ | ||||
|         ["“", "``"], | ||||
|         ["”", "''"], | ||||
|         ["–", "--"] | ||||
|     ] | ||||
| } | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| from __future__ import unicode_literals | ||||
| from __future__ import unicode_literals, print_function | ||||
| from os import path | ||||
| import codecs | ||||
| 
 | ||||
|  | @ -7,7 +7,7 @@ try: | |||
| except ImportError: | ||||
|     import json | ||||
| 
 | ||||
| from .parts_of_speech import NOUN, VERB, ADJ | ||||
| from .parts_of_speech import NOUN, VERB, ADJ, PUNCT | ||||
| 
 | ||||
| 
 | ||||
| class Lemmatizer(object): | ||||
|  | @ -36,6 +36,8 @@ class Lemmatizer(object): | |||
|             pos = 'verb' | ||||
|         elif pos == ADJ: | ||||
|             pos = 'adj' | ||||
|         elif pos == PUNCT: | ||||
|             pos = 'punct' | ||||
|         lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, [])) | ||||
|         return lemmas | ||||
| 
 | ||||
|  | @ -48,6 +50,9 @@ class Lemmatizer(object): | |||
|     def adj(self, string): | ||||
|         return self(string, 'adj') | ||||
| 
 | ||||
|     def punct(self, string): | ||||
|         return self(string, 'punct') | ||||
| 
 | ||||
| 
 | ||||
| def lemmatize(string, index, exceptions, rules): | ||||
|     string = string.lower() | ||||
|  | @ -58,7 +63,7 @@ def lemmatize(string, index, exceptions, rules): | |||
|     for old, new in rules: | ||||
|         if string.endswith(old): | ||||
|             form = string[:len(string) - len(old)] + new | ||||
|             if form in index: | ||||
|             if form in index or not form.isalpha(): | ||||
|                 forms.append(form) | ||||
|     if not forms: | ||||
|         forms.append(string) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user