mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 13:47:13 +03:00
951825532c
* Improved Dutch language resources and Dutch lemmatization * Fix conftest * Update punctuation.py * Auto-format * Format and fix tests * Remove unused test file * Re-add deleted test * removed redundant infix regex pattern for ','; note: brackets + simple hyphen remains * Cleaner lemmatization files
80 lines
1.4 KiB
Python
80 lines
1.4 KiB
Python
# coding: utf8
from __future__ import unicode_literals
# Suffix rules for adjectives. Every entry is a two-item list whose first
# item is the word ending and whose second item is the text paired with it
# (presumably the replacement; an empty string would then just drop the
# ending -- the code that applies these rules is not in this module).
# The list is re-sorted in place, longest ending first, further down.
ADJECTIVE_SUFFIX_RULES = [
    ["sten", ""],
    ["ste", ""],
    ["st", ""],
    ["er", ""],
    ["en", ""],
    ["e", ""],
    ["ende", "end"],
]
# Suffix rules for verbs. Every entry is a two-item list: the word ending
# first, then the text paired with it (presumably what the ending is
# rewritten to -- the code that applies these rules is not in this module).
# The list is re-sorted in place, longest ending first, further down.
VERB_SUFFIX_RULES = [
    ["dt", "den"],
    ["de", "en"],
    ["te", "en"],
    ["dde", "den"],
    ["tte", "ten"],
    ["dden", "den"],
    ["tten", "ten"],
    ["end", "en"],
]
# Suffix rules for nouns. Every entry is a two-item list: the word ending
# first, then the text paired with it (presumably the replacement; an empty
# string would then just drop the ending -- the code that applies these
# rules is not in this module).
# The list is re-sorted in place, longest ending first, further down.
NOUN_SUFFIX_RULES = [
    # Endings paired with the empty string.
    ["en", ""],
    ["ën", ""],
    ["'er", ""],
    ["s", ""],
    ["tje", ""],
    ["kje", ""],
    ["'s", ""],
    # Endings paired with a non-empty string.
    ["ici", "icus"],
    ["heden", "heid"],
    ["elen", "eel"],
    ["ezen", "ees"],
    ["even", "eef"],
    ["ssen", "s"],
    ["rren", "r"],
    ["kken", "k"],
    ["bben", "b"],
]
# Suffix rules for numerals. Every entry is a two-item list: the word ending
# first, then the text paired with it. All second items are empty here
# (presumably meaning the ending is dropped -- the code that applies these
# rules is not in this module).
# The list is re-sorted in place, longest ending first, further down.
NUM_SUFFIX_RULES = [
    ["ste", ""],
    ["sten", ""],
    ["ën", ""],
    ["en", ""],
    ["de", ""],
    ["er", ""],
    ["ër", ""],
    ["tjes", ""],
]
# Rules for punctuation: each typographic ("curly") quote is paired with its
# plain ASCII counterpart. Every entry is a two-item list. Unlike the other
# rule lists in this module, this one is not passed through the length sort
# further down; all of its first items are a single character.
PUNCT_SUFFIX_RULES = [
    ["“", "\""],  # left double quotation mark
    ["”", "\""],  # right double quotation mark
    ["\u2018", "'"],  # left single quotation mark
    ["\u2019", "'"],  # right single quotation mark
]
# In-place sort guaranteeing that longer -- more specific -- rules are
# applied first. The key is the length of each rule's first item (the
# ending). list.sort() is stable, also with reverse=True, so rules whose
# endings have equal length keep the relative order they were written in.
# PUNCT_SUFFIX_RULES is deliberately not part of this tuple.
for rule_set in (
    ADJECTIVE_SUFFIX_RULES,
    NOUN_SUFFIX_RULES,
    NUM_SUFFIX_RULES,
    VERB_SUFFIX_RULES,
):
    rule_set.sort(key=lambda r: len(r[0]), reverse=True)
# Lookup table from a lower-case word-class key to the rule list for that
# class. The values are the list objects defined above in this module, not
# copies, so the in-place sort applied to them is visible through RULES.
RULES = {
    "adj": ADJECTIVE_SUFFIX_RULES,
    "noun": NOUN_SUFFIX_RULES,
    "verb": VERB_SUFFIX_RULES,
    "num": NUM_SUFFIX_RULES,
    "punct": PUNCT_SUFFIX_RULES,
}