Mirror of https://github.com/explosion/spaCy.git
Add WordNet lemmatizer

parent c20dd79748
commit 7b68f911cf

spacy/lemmatizer.py (new file, 87 lines)
@@ -0,0 +1,87 @@
from os import path


NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y')
)


VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", "")
)


ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e")
)


class Lemmatizer(object):
    def __init__(self, wn_dict_dir):
        self.index = {}
        self.exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))

    def noun(self, string):
        return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)

    def verb(self, string):
        return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)

    def adj(self, string):
        return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)


def lemmatize(string, index, exceptions, rules):
    forms = []
    if string in index:
        forms.append(string)
    forms.extend(exceptions.get(string, []))
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    return set(forms)


def read_index(loc):
    index = set()
    for line in open(loc):
        if line.startswith(' '):
            continue
        pieces = line.split()
        word = pieces[0]
        if word.count('_') == 0:
            index.add(word)
    return index


def read_exc(loc):
    exceptions = {}
    for line in open(loc):
        if line.startswith(' '):
            continue
        pieces = line.split()
        exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
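
As a quick illustration of how lemmatize combines the exception table with the suffix rules, here is a minimal sketch that feeds it a toy in-memory index and exception dict instead of the real WordNet files (the toy entries below are made up for illustration):

    from spacy.lemmatizer import lemmatize, NOUN_RULES

    # Toy stand-ins for the WordNet noun index and noun.exc table.
    index = set(['aardwolf', 'axis', 'axe', 'ax'])
    exceptions = {'aardwolves': ('aardwolf',), 'axes': ('axis',)}

    # 'aardwolves' is found both through the exception table and through
    # the ('ves', 'f') rule; building a set() collapses the duplicate.
    assert lemmatize('aardwolves', index, exceptions, NOUN_RULES) == set(['aardwolf'])

    # 'axes' picks up 'axis' from the exceptions, plus 'axe' and 'ax'
    # from the ('s', '') and ('xes', 'x') rules.
    assert lemmatize('axes', index, exceptions, NOUN_RULES) == set(['axis', 'axe', 'ax'])

Note that read_index and read_exc both skip lines starting with a space, which is how the license preamble at the top of each WordNet data file is formatted, and read_index additionally drops underscore-joined multiword entries.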

tests/test_lemmatizer.py (new file, 34 lines)
@@ -0,0 +1,34 @@
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import DATA_DIR
from os import path

import pytest


def test_read_index():
    wn = path.join(DATA_DIR, 'wordnet')
    index = read_index(path.join(wn, 'index.noun'))
    assert 'man' in index
    assert 'plantes' not in index
    assert 'plant' in index


def test_read_exc():
    wn = path.join(DATA_DIR, 'wordnet')
    exc = read_exc(path.join(wn, 'verb.exc'))
    assert exc['was'] == ('be',)


@pytest.fixture
def lemmatizer():
    return Lemmatizer(path.join(DATA_DIR, 'wordnet'))


def test_noun_lemmas(lemmatizer):
    do = lemmatizer.noun

    assert do('aardwolves') == set(['aardwolf'])
    assert do('aardwolf') == set(['aardwolf'])
    assert do('planets') == set(['planet'])
    assert do('ring') == set(['ring'])
    assert do('axes') == set(['axis', 'axe', 'ax'])
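
Assuming the WordNet dict files (index.noun, noun.exc, and so on) have been unpacked under DATA_DIR/wordnet, as the tests above expect, the lemmatizer can then be used end to end; the expected outputs in the comments follow from the rules and the standard WordNet exception tables:

    from os import path

    from spacy.lemmatizer import Lemmatizer
    from spacy.util import DATA_DIR

    lemmatizer = Lemmatizer(path.join(DATA_DIR, 'wordnet'))

    print(lemmatizer.noun('planets'))   # expected: set(['planet'])
    print(lemmatizer.verb('ran'))       # via verb.exc: set(['run'])
    print(lemmatizer.adj('faster'))     # via the ('er', '') rule: set(['fast'])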