mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Fix issue #3551: Upper case lemmas
If the Morphology class tries to lemmatize a word that is not in the string store, it is forced to return the word as-is. While loading exceptions, the class could hit a case where these strings had not yet been added to the string store. The resulting lemmas could then be cached, leading to some words receiving upper-case lemmas. Closes #3551.
This commit is contained in:
parent
cc1516ec26
commit
d59b2e8a0c
|
@@ -109,6 +109,7 @@ cdef class Morphology:
|
||||||
analysis.tag = rich_tag
|
analysis.tag = rich_tag
|
||||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
||||||
self.tag_map.get(tag_str, {}))
|
self.tag_map.get(tag_str, {}))
|
||||||
|
|
||||||
self._cache.set(tag_id, token.lex.orth, analysis)
|
self._cache.set(tag_id, token.lex.orth, analysis)
|
||||||
if token.lemma == 0:
|
if token.lemma == 0:
|
||||||
token.lemma = analysis.lemma
|
token.lemma = analysis.lemma
|
||||||
|
@@ -140,7 +141,7 @@ cdef class Morphology:
|
||||||
if tag not in self.reverse_index:
|
if tag not in self.reverse_index:
|
||||||
return
|
return
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
orth = self.strings[orth_str]
|
orth = self.strings.add(orth_str)
|
||||||
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
|
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user