Fix issue #3551: Upper-case lemmas

If the Morphology class tries to lemmatize a word that's not in the
string store, it's forced to return the word as-is. While loading
exceptions, the class could hit a case where these strings weren't in
the string store yet. The resulting lemmas could then be cached, leading
to some words receiving upper-case lemmas. Closes #3551.
This commit is contained in:
Matthew Honnibal 2019-04-16 12:27:15 +02:00
parent cc1516ec26
commit d59b2e8a0c

View File

@@ -109,6 +109,7 @@ cdef class Morphology:
analysis.tag = rich_tag analysis.tag = rich_tag
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
self.tag_map.get(tag_str, {})) self.tag_map.get(tag_str, {}))
self._cache.set(tag_id, token.lex.orth, analysis) self._cache.set(tag_id, token.lex.orth, analysis)
if token.lemma == 0: if token.lemma == 0:
token.lemma = analysis.lemma token.lemma = analysis.lemma
@@ -140,7 +141,7 @@ cdef class Morphology:
if tag not in self.reverse_index: if tag not in self.reverse_index:
return return
tag_id = self.reverse_index[tag] tag_id = self.reverse_index[tag]
orth = self.strings[orth_str] orth = self.strings.add(orth_str)
cdef RichTagC rich_tag = self.rich_tags[tag_id] cdef RichTagC rich_tag = self.rich_tags[tag_id]
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth) cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)