mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Fix issue #3551: Upper case lemmas
If the Morphology class tries to lemmatize a word that's not in the string store, it's forced to just return it as-is. While loading exceptions, the class could hit a case where these strings weren't in the string store yet. The resulting lemmas could then be cached, leading to some words receiving upper-case lemmas. Closes #3551.
This commit is contained in:
parent
cc1516ec26
commit
d59b2e8a0c
|
@@ -109,6 +109,7 @@ cdef class Morphology:
|
|||
analysis.tag = rich_tag
|
||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
||||
self.tag_map.get(tag_str, {}))
|
||||
|
||||
self._cache.set(tag_id, token.lex.orth, analysis)
|
||||
if token.lemma == 0:
|
||||
token.lemma = analysis.lemma
|
||||
|
@@ -140,7 +141,7 @@ cdef class Morphology:
|
|||
if tag not in self.reverse_index:
|
||||
return
|
||||
tag_id = self.reverse_index[tag]
|
||||
- orth = self.strings[orth_str]
|
||||
+ orth = self.strings.add(orth_str)
|
||||
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
|
||||
|
|
Loading…
Reference in New Issue
Block a user