mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-29 17:33:10 +03:00
Change morphology and lemmatizer API
Take morphology features as object instead of keyword arguments
This commit is contained in:
parent
52e7d634df
commit
8350d65695
|
@ -34,7 +34,7 @@ class Lemmatizer(object):
|
||||||
self.exc = exceptions
|
self.exc = exceptions
|
||||||
self.rules = rules
|
self.rules = rules
|
||||||
|
|
||||||
def __call__(self, string, univ_pos, **morphology):
|
def __call__(self, string, univ_pos, morphology=None):
|
||||||
if univ_pos == NOUN:
|
if univ_pos == NOUN:
|
||||||
univ_pos = 'noun'
|
univ_pos = 'noun'
|
||||||
elif univ_pos == VERB:
|
elif univ_pos == VERB:
|
||||||
|
@ -44,16 +44,17 @@ class Lemmatizer(object):
|
||||||
elif univ_pos == PUNCT:
|
elif univ_pos == PUNCT:
|
||||||
univ_pos = 'punct'
|
univ_pos = 'punct'
|
||||||
# See Issue #435 for example of where this logic is required.
|
# See Issue #435 for example of where this logic is required.
|
||||||
if self.is_base_form(univ_pos, **morphology):
|
if self.is_base_form(univ_pos, morphology):
|
||||||
return set([string.lower()])
|
return set([string.lower()])
|
||||||
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
||||||
self.exc.get(univ_pos, {}),
|
self.exc.get(univ_pos, {}),
|
||||||
self.rules.get(univ_pos, []))
|
self.rules.get(univ_pos, []))
|
||||||
return lemmas
|
return lemmas
|
||||||
|
|
||||||
def is_base_form(self, univ_pos, **morphology):
|
def is_base_form(self, univ_pos, morphology=None):
|
||||||
'''Check whether we're dealing with an uninflected paradigm, so we can
|
'''Check whether we're dealing with an uninflected paradigm, so we can
|
||||||
avoid lemmatization entirely.'''
|
avoid lemmatization entirely.'''
|
||||||
|
morphology = {} if morphology is None else morphology
|
||||||
others = [key for key in morphology if key not in ('number', 'pos', 'verbform')]
|
others = [key for key in morphology if key not in ('number', 'pos', 'verbform')]
|
||||||
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
|
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
|
||||||
return True
|
return True
|
||||||
|
@ -62,17 +63,17 @@ class Lemmatizer(object):
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def noun(self, string, **morphology):
|
def noun(self, string, morphology=None):
|
||||||
return self(string, 'noun', **morphology)
|
return self(string, 'noun', morphology)
|
||||||
|
|
||||||
def verb(self, string, **morphology):
|
def verb(self, string, morphology=None):
|
||||||
return self(string, 'verb', **morphology)
|
return self(string, 'verb', morphology)
|
||||||
|
|
||||||
def adj(self, string, **morphology):
|
def adj(self, string, morphology=None):
|
||||||
return self(string, 'adj', **morphology)
|
return self(string, 'adj', morphology)
|
||||||
|
|
||||||
def punct(self, string, **morphology):
|
def punct(self, string, morphology=None):
|
||||||
return self(string, 'punct', **morphology)
|
return self(string, 'punct', morphology)
|
||||||
|
|
||||||
|
|
||||||
def lemmatize(string, index, exceptions, rules):
|
def lemmatize(string, index, exceptions, rules):
|
||||||
|
|
|
@ -8,10 +8,27 @@ except ImportError:
|
||||||
|
|
||||||
from .parts_of_speech import IDS as POS_IDS
|
from .parts_of_speech import IDS as POS_IDS
|
||||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
|
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
|
||||||
from .attrs cimport IS_SPACE
|
from .attrs cimport POS, IS_SPACE
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_props(props):
|
||||||
|
'''Transform deprecated string keys to correct names.'''
|
||||||
|
out = {}
|
||||||
|
for key, value in props.items():
|
||||||
|
if key == POS:
|
||||||
|
if hasattr(value, 'upper'):
|
||||||
|
value = value.upper()
|
||||||
|
if value in POS_IDS:
|
||||||
|
value = POS_IDS[value]
|
||||||
|
out[key] = value
|
||||||
|
elif key.lower() == 'pos':
|
||||||
|
out[POS] = POS_IDS[value.upper()]
|
||||||
|
else:
|
||||||
|
out[key] = value
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
def __init__(self, StringStore string_store, tag_map, lemmatizer):
|
def __init__(self, StringStore string_store, tag_map, lemmatizer):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
@ -24,10 +41,11 @@ cdef class Morphology:
|
||||||
|
|
||||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
||||||
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
|
||||||
|
props = _normalize_props(props)
|
||||||
self.rich_tags[i].id = i
|
self.rich_tags[i].id = i
|
||||||
self.rich_tags[i].name = self.strings[tag_str]
|
self.rich_tags[i].name = self.strings[tag_str]
|
||||||
self.rich_tags[i].morph = 0
|
self.rich_tags[i].morph = 0
|
||||||
self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
|
self.rich_tags[i].pos = props[POS]
|
||||||
self.reverse_index[self.rich_tags[i].name] = i
|
self.reverse_index[self.rich_tags[i].name] = i
|
||||||
self._cache = PreshMapArray(self.n_tags)
|
self._cache = PreshMapArray(self.n_tags)
|
||||||
|
|
||||||
|
@ -57,7 +75,7 @@ cdef class Morphology:
|
||||||
analysis.tag = self.rich_tags[tag_id]
|
analysis.tag = self.rich_tags[tag_id]
|
||||||
tag_str = self.strings[self.rich_tags[tag_id].name]
|
tag_str = self.strings[self.rich_tags[tag_id].name]
|
||||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
||||||
**self.tag_map.get(tag_str, {}))
|
self.tag_map.get(tag_str, {}))
|
||||||
self._cache.set(tag_id, token.lex.orth, analysis)
|
self._cache.set(tag_id, token.lex.orth, analysis)
|
||||||
token.lemma = analysis.lemma
|
token.lemma = analysis.lemma
|
||||||
token.pos = analysis.tag.pos
|
token.pos = analysis.tag.pos
|
||||||
|
@ -97,7 +115,7 @@ cdef class Morphology:
|
||||||
self.tag_map.get(tag_str, {}))
|
self.tag_map.get(tag_str, {}))
|
||||||
self._cache.set(tag_id, orth, <void*>cached)
|
self._cache.set(tag_id, orth, <void*>cached)
|
||||||
|
|
||||||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, **morphology):
|
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||||
cdef unicode py_string = self.strings[orth]
|
cdef unicode py_string = self.strings[orth]
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
return self.strings[py_string.lower()]
|
return self.strings[py_string.lower()]
|
||||||
|
@ -105,7 +123,7 @@ cdef class Morphology:
|
||||||
return self.strings[py_string.lower()]
|
return self.strings[py_string.lower()]
|
||||||
cdef set lemma_strings
|
cdef set lemma_strings
|
||||||
cdef unicode lemma_string
|
cdef unicode lemma_string
|
||||||
lemma_strings = self.lemmatizer(py_string, univ_pos, **morphology)
|
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||||
lemma_string = sorted(lemma_strings)[0]
|
lemma_string = sorted(lemma_strings)[0]
|
||||||
lemma = self.strings[lemma_string]
|
lemma = self.strings[lemma_string]
|
||||||
return lemma
|
return lemma
|
||||||
|
|
|
@ -60,8 +60,8 @@ def test_base_form_dive(lemmatizer):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
do = lemmatizer.noun
|
do = lemmatizer.noun
|
||||||
assert do('dive', number='sing') == set(['dive'])
|
assert do('dive', {'number': 'sing'}) == set(['dive'])
|
||||||
assert do('dive', number='plur') == set(['diva'])
|
assert do('dive', {'number': 'plur'}) == set(['diva'])
|
||||||
|
|
||||||
|
|
||||||
def test_base_form_saw(lemmatizer):
|
def test_base_form_saw(lemmatizer):
|
||||||
|
@ -69,7 +69,7 @@ def test_base_form_saw(lemmatizer):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
do = lemmatizer.verb
|
do = lemmatizer.verb
|
||||||
assert do('saw', verbform='past') == set(['see'])
|
assert do('saw', {'verbform': 'past'}) == set(['see'])
|
||||||
|
|
||||||
|
|
||||||
def test_smart_quotes(lemmatizer):
|
def test_smart_quotes(lemmatizer):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user