fix: match hyphenated words to lemmas in index_table (e.g. "co-authored" -> "co-author") (#13816)

This commit is contained in:
d0ngw 2025-05-27 07:20:26 +08:00 committed by GitHub
parent b205ff65e6
commit 46613e27cf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -218,7 +218,10 @@ class Lemmatizer(Pipe):
                 if not form:
                     pass
                 elif form in index or not form.isalpha():
-                    forms.append(form)
+                    if form in index:
+                        forms.insert(0, form)
+                    else:
+                        forms.append(form)
                 else:
                     oov_forms.append(form)
         # Remove duplicates but preserve the ordering of applied "rules"