mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Fix lemmatizer ordering, re Issue #1387
This commit is contained in:
parent
a0db2e0077
commit
4eb3405df7
|
@ -93,7 +93,6 @@ def lemmatize(string, index, exceptions, rules):
|
|||
orig = string
|
||||
string = string.lower()
|
||||
forms = []
|
||||
forms.extend(exceptions.get(string, []))
|
||||
oov_forms = []
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
|
@ -104,8 +103,18 @@ def lemmatize(string, index, exceptions, rules):
|
|||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
# Remove duplicates, and sort forms generated by rules alphabetically.
|
||||
forms = list(set(forms))
|
||||
forms.sort()
|
||||
# Put exceptions at the front of the list, so they get priority.
|
||||
# This is a dodgy heuristic -- but it's the best we can do until we get
|
||||
# frequencies on this. We can at least prune out problematic exceptions,
|
||||
# if they shadow more frequent analyses.
|
||||
for form in exceptions.get(string, []):
|
||||
if form not in forms:
|
||||
forms.insert(0, form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms:
|
||||
forms.append(orig)
|
||||
return list(set(forms))
|
||||
return forms
|
||||
|
|
Loading…
Reference in New Issue
Block a user