From 8350d65695e1919fa34ff1993e1598e21337cf74 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 7 Dec 2016 21:12:49 +0100
Subject: [PATCH] Change morphology and lemmatizer API

Take morphology features as object instead of keyword arguments
---
 spacy/lemmatizer.py                   | 23 ++++++++++---------
 spacy/morphology.pyx                  | 32 +++++++++++++++++++++------
 spacy/tests/tagger/test_lemmatizer.py |  6 ++---
 3 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 17cb28233..8aee14717 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -34,7 +34,7 @@ class Lemmatizer(object):
         self.exc = exceptions
         self.rules = rules
 
-    def __call__(self, string, univ_pos, **morphology):
+    def __call__(self, string, univ_pos, morphology=None):
         if univ_pos == NOUN:
             univ_pos = 'noun'
         elif univ_pos == VERB:
@@ -44,16 +44,17 @@ class Lemmatizer(object):
         elif univ_pos == PUNCT:
             univ_pos = 'punct'
         # See Issue #435 for example of where this logic is requied.
-        if self.is_base_form(univ_pos, **morphology):
+        if self.is_base_form(univ_pos, morphology):
             return set([string.lower()])
         lemmas = lemmatize(string, self.index.get(univ_pos, {}),
                            self.exc.get(univ_pos, {}),
                            self.rules.get(univ_pos, []))
         return lemmas
 
-    def is_base_form(self, univ_pos, **morphology):
+    def is_base_form(self, univ_pos, morphology=None):
         '''Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.'''
+        morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in ('number', 'pos', 'verbform')]
         if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
             return True
@@ -62,17 +63,17 @@ class Lemmatizer(object):
         else:
             return False
 
-    def noun(self, string, **morphology):
-        return self(string, 'noun', **morphology)
+    def noun(self, string, morphology=None):
+        return self(string, 'noun', morphology)
 
-    def verb(self, string, **morphology):
-        return self(string, 'verb', **morphology)
+    def verb(self, string, morphology=None):
+        return self(string, 'verb', morphology)
 
-    def adj(self, string, **morphology):
-        return self(string, 'adj', **morphology)
+    def adj(self, string, morphology=None):
+        return self(string, 'adj', morphology)
 
-    def punct(self, string, **morphology):
-        return self(string, 'punct', **morphology)
+    def punct(self, string, morphology=None):
+        return self(string, 'punct', morphology)
 
 
 def lemmatize(string, index, exceptions, rules):
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index e5e5e013f..a9c785d3a 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -8,10 +8,27 @@ except ImportError:
 
 from .parts_of_speech import IDS as POS_IDS
 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
-from .attrs cimport IS_SPACE
+from .attrs cimport POS, IS_SPACE
 from .lexeme cimport Lexeme
 
 
+def _normalize_props(props):
+    '''Transform deprecated string keys to correct names.'''
+    out = {}
+    for key, value in props.items():
+        if key == POS:
+            if hasattr(value, 'upper'):
+                value = value.upper()
+            if value in POS_IDS:
+                value = POS_IDS[value]
+            out[key] = value
+        elif key.lower() == 'pos':
+            out[POS] = POS_IDS[value.upper()]
+        else:
+            out[key] = value
+    return out
+
+
 cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer):
         self.mem = Pool()
@@ -21,13 +38,14 @@ cdef class Morphology:
         self.n_tags = len(tag_map) + 1
         self.tag_names = tuple(sorted(tag_map.keys()))
         self.reverse_index = {}
-        
+
         self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC))
         for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
+            props = _normalize_props(props)
             self.rich_tags[i].id = i
             self.rich_tags[i].name = self.strings[tag_str]
             self.rich_tags[i].morph = 0
-            self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
+            self.rich_tags[i].pos = props[POS]
             self.reverse_index[self.rich_tags[i].name] = i
         self._cache = PreshMapArray(self.n_tags)
 
@@ -57,7 +75,7 @@ cdef class Morphology:
             analysis.tag = self.rich_tags[tag_id]
             tag_str = self.strings[self.rich_tags[tag_id].name]
             analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
-                **self.tag_map.get(tag_str, {}))
+                self.tag_map.get(tag_str, {}))
             self._cache.set(tag_id, token.lex.orth, analysis)
         token.lemma = analysis.lemma
         token.pos = analysis.tag.pos
@@ -81,7 +99,7 @@ cdef class Morphology:
         cdef RichTagC rich_tag
         for tag_str, entries in exc.items():
             tag = self.strings[tag_str]
-            tag_id = self.reverse_index[tag] 
+            tag_id = self.reverse_index[tag]
             rich_tag = self.rich_tags[tag_id]
             for form_str, props in entries.items():
                 cached = self.mem.alloc(1, sizeof(MorphAnalysisC))
@@ -97,7 +115,7 @@ cdef class Morphology:
                     self.tag_map.get(tag_str, {}))
             self._cache.set(tag_id, orth, cached)
 
-    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, **morphology):
+    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
         cdef unicode py_string = self.strings[orth]
         if self.lemmatizer is None:
             return self.strings[py_string.lower()]
@@ -105,7 +123,7 @@ cdef class Morphology:
             return self.strings[py_string.lower()]
         cdef set lemma_strings
         cdef unicode lemma_string
-        lemma_strings = self.lemmatizer(py_string, univ_pos, **morphology)
+        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings[lemma_string]
         return lemma
diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py
index 8ab4422a3..31acc72e3 100644
--- a/spacy/tests/tagger/test_lemmatizer.py
+++ b/spacy/tests/tagger/test_lemmatizer.py
@@ -60,8 +60,8 @@ def test_base_form_dive(lemmatizer):
         return None
 
     do = lemmatizer.noun
-    assert do('dive', number='sing') == set(['dive'])
-    assert do('dive', number='plur') == set(['diva'])
+    assert do('dive', {'number': 'sing'}) == set(['dive'])
+    assert do('dive', {'number': 'plur'}) == set(['diva'])
 
 
 def test_base_form_saw(lemmatizer):
@@ -69,7 +69,7 @@ def test_base_form_saw(lemmatizer):
         return None
 
     do = lemmatizer.verb
-    assert do('saw', verbform='past') == set(['see'])
+    assert do('saw', {'verbform': 'past'}) == set(['see'])
 
 
 def test_smart_quotes(lemmatizer):