Fix lemmatizer ordering: return lemmas in a stable order with exceptions first (re: Issue #1387)

This commit is contained in:
Matthew Honnibal 2018-07-05 13:49:29 +02:00
parent a0db2e0077
commit 4eb3405df7

View File

@ -93,7 +93,6 @@ def lemmatize(string, index, exceptions, rules):
orig = string orig = string
string = string.lower() string = string.lower()
forms = [] forms = []
forms.extend(exceptions.get(string, []))
oov_forms = [] oov_forms = []
for old, new in rules: for old, new in rules:
if string.endswith(old): if string.endswith(old):
@ -104,8 +103,18 @@ def lemmatize(string, index, exceptions, rules):
forms.append(form) forms.append(form)
else: else:
oov_forms.append(form) oov_forms.append(form)
# Remove duplicates, and sort forms generated by rules alphabetically.
forms = list(set(forms))
forms.sort()
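# Put exceptions at the front of the list so that they take priority.
# This is a crude heuristic, but it is the best we can do until we have
# frequency data. In the meantime, we can at least prune problematic
# exceptions that shadow more frequent analyses.
# NOTE: an exception that the rules already produced keeps its sorted
# position, and insert(0, ...) leaves multiple exceptions in reverse order.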
for form in exceptions.get(string, []):
if form not in forms:
forms.insert(0, form)
if not forms: if not forms:
forms.extend(oov_forms) forms.extend(oov_forms)
if not forms: if not forms:
forms.append(orig) forms.append(orig)
return list(set(forms)) return forms