mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Change morphology and lemmatizer API
Take morphology features as object instead of keyword arguments
This commit is contained in:
		
							parent
							
								
									52e7d634df
								
							
						
					
					
						commit
						8350d65695
					
				| 
						 | 
					@ -34,7 +34,7 @@ class Lemmatizer(object):
 | 
				
			||||||
        self.exc = exceptions
 | 
					        self.exc = exceptions
 | 
				
			||||||
        self.rules = rules
 | 
					        self.rules = rules
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, string, univ_pos, **morphology):
 | 
					    def __call__(self, string, univ_pos, morphology=None):
 | 
				
			||||||
        if univ_pos == NOUN:
 | 
					        if univ_pos == NOUN:
 | 
				
			||||||
            univ_pos = 'noun'
 | 
					            univ_pos = 'noun'
 | 
				
			||||||
        elif univ_pos == VERB:
 | 
					        elif univ_pos == VERB:
 | 
				
			||||||
| 
						 | 
					@ -44,16 +44,17 @@ class Lemmatizer(object):
 | 
				
			||||||
        elif univ_pos == PUNCT:
 | 
					        elif univ_pos == PUNCT:
 | 
				
			||||||
            univ_pos = 'punct'
 | 
					            univ_pos = 'punct'
 | 
				
			||||||
        # See Issue #435 for an example of where this logic is required.
 | 
					        # See Issue #435 for an example of where this logic is required.
 | 
				
			||||||
        if self.is_base_form(univ_pos, **morphology):
 | 
					        if self.is_base_form(univ_pos, morphology):
 | 
				
			||||||
            return set([string.lower()])
 | 
					            return set([string.lower()])
 | 
				
			||||||
        lemmas = lemmatize(string, self.index.get(univ_pos, {}),
 | 
					        lemmas = lemmatize(string, self.index.get(univ_pos, {}),
 | 
				
			||||||
                           self.exc.get(univ_pos, {}),
 | 
					                           self.exc.get(univ_pos, {}),
 | 
				
			||||||
                           self.rules.get(univ_pos, []))
 | 
					                           self.rules.get(univ_pos, []))
 | 
				
			||||||
        return lemmas
 | 
					        return lemmas
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def is_base_form(self, univ_pos, **morphology):
 | 
					    def is_base_form(self, univ_pos, morphology=None):
 | 
				
			||||||
        '''Check whether we're dealing with an uninflected paradigm, so we can
 | 
					        '''Check whether we're dealing with an uninflected paradigm, so we can
 | 
				
			||||||
        avoid lemmatization entirely.'''
 | 
					        avoid lemmatization entirely.'''
 | 
				
			||||||
 | 
					        morphology = {} if morphology is None else morphology
 | 
				
			||||||
        others = [key for key in morphology if key not in ('number', 'pos', 'verbform')]
 | 
					        others = [key for key in morphology if key not in ('number', 'pos', 'verbform')]
 | 
				
			||||||
        if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
 | 
					        if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
 | 
				
			||||||
            return True
 | 
					            return True
 | 
				
			||||||
| 
						 | 
					@ -62,17 +63,17 @@ class Lemmatizer(object):
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            return False
 | 
					            return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def noun(self, string, **morphology):
 | 
					    def noun(self, string, morphology=None):
 | 
				
			||||||
        return self(string, 'noun', **morphology)
 | 
					        return self(string, 'noun', morphology)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def verb(self, string, **morphology):
 | 
					    def verb(self, string, morphology=None):
 | 
				
			||||||
        return self(string, 'verb', **morphology)
 | 
					        return self(string, 'verb', morphology)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def adj(self, string, **morphology):
 | 
					    def adj(self, string, morphology=None):
 | 
				
			||||||
        return self(string, 'adj', **morphology)
 | 
					        return self(string, 'adj', morphology)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def punct(self, string, **morphology):
 | 
					    def punct(self, string, morphology=None):
 | 
				
			||||||
        return self(string, 'punct', **morphology)
 | 
					        return self(string, 'punct', morphology)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def lemmatize(string, index, exceptions, rules):
 | 
					def lemmatize(string, index, exceptions, rules):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -8,10 +8,27 @@ except ImportError:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .parts_of_speech import IDS as POS_IDS
 | 
					from .parts_of_speech import IDS as POS_IDS
 | 
				
			||||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
 | 
					from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
 | 
				
			||||||
from .attrs cimport IS_SPACE
 | 
					from .attrs cimport POS, IS_SPACE
 | 
				
			||||||
from .lexeme cimport Lexeme
 | 
					from .lexeme cimport Lexeme
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _normalize_props(props):
 | 
				
			||||||
 | 
					    '''Transform deprecated string keys to correct names.'''
 | 
				
			||||||
 | 
					    out = {}
 | 
				
			||||||
 | 
					    for key, value in props.items():
 | 
				
			||||||
 | 
					        if key == POS:
 | 
				
			||||||
 | 
					            if hasattr(value, 'upper'):
 | 
				
			||||||
 | 
					                value = value.upper()
 | 
				
			||||||
 | 
					            if value in POS_IDS:
 | 
				
			||||||
 | 
					                value = POS_IDS[value]
 | 
				
			||||||
 | 
					            out[key] = value
 | 
				
			||||||
 | 
					        elif key.lower() == 'pos':
 | 
				
			||||||
 | 
					            out[POS] = POS_IDS[value.upper()]
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            out[key] = value
 | 
				
			||||||
 | 
					    return out
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class Morphology:
 | 
					cdef class Morphology:
 | 
				
			||||||
    def __init__(self, StringStore string_store, tag_map, lemmatizer):
 | 
					    def __init__(self, StringStore string_store, tag_map, lemmatizer):
 | 
				
			||||||
        self.mem = Pool()
 | 
					        self.mem = Pool()
 | 
				
			||||||
| 
						 | 
					@ -24,10 +41,11 @@ cdef class Morphology:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
 | 
					        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
 | 
				
			||||||
        for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
 | 
					        for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
 | 
				
			||||||
 | 
					            props = _normalize_props(props)
 | 
				
			||||||
            self.rich_tags[i].id = i
 | 
					            self.rich_tags[i].id = i
 | 
				
			||||||
            self.rich_tags[i].name = self.strings[tag_str]
 | 
					            self.rich_tags[i].name = self.strings[tag_str]
 | 
				
			||||||
            self.rich_tags[i].morph = 0
 | 
					            self.rich_tags[i].morph = 0
 | 
				
			||||||
            self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
 | 
					            self.rich_tags[i].pos = props[POS]
 | 
				
			||||||
            self.reverse_index[self.rich_tags[i].name] = i
 | 
					            self.reverse_index[self.rich_tags[i].name] = i
 | 
				
			||||||
        self._cache = PreshMapArray(self.n_tags)
 | 
					        self._cache = PreshMapArray(self.n_tags)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -57,7 +75,7 @@ cdef class Morphology:
 | 
				
			||||||
            analysis.tag = self.rich_tags[tag_id]
 | 
					            analysis.tag = self.rich_tags[tag_id]
 | 
				
			||||||
            tag_str = self.strings[self.rich_tags[tag_id].name]
 | 
					            tag_str = self.strings[self.rich_tags[tag_id].name]
 | 
				
			||||||
            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
 | 
					            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
 | 
				
			||||||
                                            **self.tag_map.get(tag_str, {}))
 | 
					                                            self.tag_map.get(tag_str, {}))
 | 
				
			||||||
            self._cache.set(tag_id, token.lex.orth, analysis)
 | 
					            self._cache.set(tag_id, token.lex.orth, analysis)
 | 
				
			||||||
        token.lemma = analysis.lemma
 | 
					        token.lemma = analysis.lemma
 | 
				
			||||||
        token.pos = analysis.tag.pos
 | 
					        token.pos = analysis.tag.pos
 | 
				
			||||||
| 
						 | 
					@ -97,7 +115,7 @@ cdef class Morphology:
 | 
				
			||||||
                                                  self.tag_map.get(tag_str, {}))
 | 
					                                                  self.tag_map.get(tag_str, {}))
 | 
				
			||||||
                self._cache.set(tag_id, orth, <void*>cached)
 | 
					                self._cache.set(tag_id, orth, <void*>cached)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, **morphology):
 | 
					    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
 | 
				
			||||||
        cdef unicode py_string = self.strings[orth]
 | 
					        cdef unicode py_string = self.strings[orth]
 | 
				
			||||||
        if self.lemmatizer is None:
 | 
					        if self.lemmatizer is None:
 | 
				
			||||||
            return self.strings[py_string.lower()]
 | 
					            return self.strings[py_string.lower()]
 | 
				
			||||||
| 
						 | 
					@ -105,7 +123,7 @@ cdef class Morphology:
 | 
				
			||||||
            return self.strings[py_string.lower()]
 | 
					            return self.strings[py_string.lower()]
 | 
				
			||||||
        cdef set lemma_strings
 | 
					        cdef set lemma_strings
 | 
				
			||||||
        cdef unicode lemma_string
 | 
					        cdef unicode lemma_string
 | 
				
			||||||
        lemma_strings = self.lemmatizer(py_string, univ_pos, **morphology)
 | 
					        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
 | 
				
			||||||
        lemma_string = sorted(lemma_strings)[0]
 | 
					        lemma_string = sorted(lemma_strings)[0]
 | 
				
			||||||
        lemma = self.strings[lemma_string]
 | 
					        lemma = self.strings[lemma_string]
 | 
				
			||||||
        return lemma
 | 
					        return lemma
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -60,8 +60,8 @@ def test_base_form_dive(lemmatizer):
 | 
				
			||||||
        return None
 | 
					        return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    do = lemmatizer.noun
 | 
					    do = lemmatizer.noun
 | 
				
			||||||
    assert do('dive', number='sing') == set(['dive'])
 | 
					    assert do('dive', {'number': 'sing'}) == set(['dive'])
 | 
				
			||||||
    assert do('dive', number='plur') == set(['diva'])
 | 
					    assert do('dive', {'number': 'plur'}) == set(['diva'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_base_form_saw(lemmatizer):
 | 
					def test_base_form_saw(lemmatizer):
 | 
				
			||||||
| 
						 | 
					@ -69,7 +69,7 @@ def test_base_form_saw(lemmatizer):
 | 
				
			||||||
        return None
 | 
					        return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    do = lemmatizer.verb
 | 
					    do = lemmatizer.verb
 | 
				
			||||||
    assert do('saw', verbform='past') == set(['see'])
 | 
					    assert do('saw', {'verbform': 'past'}) == set(['see'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_smart_quotes(lemmatizer):
 | 
					def test_smart_quotes(lemmatizer):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user