Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 09:26:27 +03:00)

Move lookup tables out of the core library (#4346)

* Add default to util.get_entry_point
* Tidy up entry points
* Read lookups from entry points
* Remove lookup tables and related tests
* Add lookups install option
* Remove lemmatizer tests
* Remove logic to process language data files
* Update setup.cfg

This commit is contained in:
parent ed620daa5c
commit e0cf4796a5
@@ -46,6 +46,8 @@ install_requires =
     pathlib==1.0.1; python_version < "3.4"

 [options.extras_require]
+lookups =
+    spacy_lookups_data>=0.0.4,<0.2.0
 cuda =
     thinc_gpu_ops>=0.0.1,<0.1.0
     cupy>=5.0.0b4
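The lemmatization tables themselves now live in the optional spacy_lookups_data package that this new extra pulls in. A minimal sketch of how downstream code might check for it (the import name spacy_lookups_data is assumed from the requirement above; installing it would normally go through the new extra, e.g. pip install spacy[lookups]):

# Sketch: detect whether the optional lookup-table package is importable.
import importlib

try:
    importlib.import_module("spacy_lookups_data")  # distribution added by the "lookups" extra above
    print("external lookup tables available")
except ImportError:
    print("lookup tables not installed; the core library no longer bundles them")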
setup.py (18 changed lines)

@@ -115,23 +115,6 @@ def generate_cython(root, source):
         raise RuntimeError("Running cythonize failed")


-def gzip_language_data(root, source):
-    print("Compressing language data")
-    import srsly
-    from pathlib import Path
-
-    base = Path(root) / source
-    for jsonfile in base.glob("**/*.json"):
-        outfile = jsonfile.with_suffix(jsonfile.suffix + ".gz")
-        if outfile.is_file() and outfile.stat().st_mtime > jsonfile.stat().st_mtime:
-            # If the gz is newer it doesn't need updating
-            print("Skipping {}, already compressed".format(jsonfile))
-            continue
-        data = srsly.read_json(jsonfile)
-        srsly.write_gzip_json(outfile, data)
-        print("Compressed {}".format(jsonfile))
-
-
 def is_source_release(path):
     return os.path.exists(os.path.join(path, "PKG-INFO"))


@@ -203,7 +186,6 @@ def setup_package():

         if not is_source_release(root):
             generate_cython(root, "spacy")
-            gzip_language_data(root, "spacy/lang")

         setup(
             name="spacy",
@@ -5,7 +5,7 @@ import uuid

 from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
 from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
-from ..util import minify_html, escape_html, get_entry_points
+from ..util import minify_html, escape_html, get_entry_points, ENTRY_POINTS
 from ..errors import Errors


@@ -242,7 +242,7 @@ class EntityRenderer(object):
             "CARDINAL": "#e4e7d2",
             "PERCENT": "#e4e7d2",
         }
-        user_colors = get_entry_points("spacy_displacy_colors")
+        user_colors = get_entry_points(ENTRY_POINTS.displacy_colors)
         for user_color in user_colors.values():
             colors.update(user_color)
         colors.update(options.get("colors", {}))
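The renderer now resolves user-defined colours through the shared ENTRY_POINTS constants rather than a hard-coded string. For illustration, a sketch of how a third-party package could hook into this; the package and attribute names are hypothetical, only the entry point group spacy_displacy_colors comes from the hunk above:

# setup.py of a hypothetical plugin package "myplugin".
# myplugin/__init__.py is assumed to define a dict of entity label -> colour, e.g.
#     DISPLACY_COLORS = {"FRUIT": "#ff6f61"}
# EntityRenderer merges every registered dict via colors.update(user_color).
from setuptools import setup

setup(
    name="myplugin",
    packages=["myplugin"],
    entry_points={"spacy_displacy_colors": ["myplugin_colors = myplugin:DISPLACY_COLORS"]},
)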
@@ -21,8 +21,6 @@ class BengaliDefaults(Language.Defaults):
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
-    # Lemma rules: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ
-    resources = {"lemma_rules": "lemma_rules.json"}


 class Bengali(Language):

@@ -1,57 +0,0 @@ (file deleted)
{
    "noun": [
        ["টা", ""], ["টি", ""], ["খান", ""], ["খানা", ""], ["খানি", ""], ["গাছা", ""], ["গাছি", ""], ["ছড়া", ""], ["কে", ""], ["ে", ""], ["তে", ""], ["র", ""],
        ["রা", ""], ["রে", ""], ["ের", ""], ["েরা", ""], ["দের", ""], ["দেরকে", ""], ["গুলা", ""], ["গুলো", ""], ["গুলি", ""], ["কুল", ""], ["গণ", ""], ["দল", ""],
        ["পাল", ""], ["পুঞ্জ", ""], ["মণ্ডলী", ""], ["মালা", ""], ["রাজি", ""], ["বৃন্দ", ""], ["বর্গ", ""], ["শ্রেণী", ""], ["শ্রেনি", ""], ["রাশি", ""], ["সকল", ""], ["মহল", ""], ["াবলি", ""],
        ["০", "0"], ["১", "1"], ["২", "2"], ["৩", "3"], ["৪", "4"], ["৫", "5"], ["৬", "6"], ["৭", "7"], ["৮", "8"], ["৯", "9"]
    ],
    "punct": [["“", "\""], ["”", "\""], ["‘", "'"], ["’", "'"]]
}
@@ -24,7 +24,6 @@ class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Catalan(Language):

File diff suppressed because it is too large

@@ -29,7 +29,6 @@ class DanishDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Danish(Language):

File diff suppressed because it is too large

@@ -26,7 +26,6 @@ class GermanDefaults(Language.Defaults):
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
     single_orth_variants = [
         {"tags": ["$("], "variants": ["…", "..."]},
         {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},

File diff suppressed because it is too large
@@ -31,11 +31,6 @@ class GreekDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-    }

     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ....symbols import NOUN, VERB, ADJ, PUNCT
+from ...symbols import NOUN, VERB, ADJ, PUNCT


 class GreekLemmatizer(object):
@@ -1,236 +0,0 @@ (file deleted)
{
    "adj": {
        "χειρότερος": ["κακός"], "χειρότερη": ["κακός"], "χειρότερης": ["κακός"], "χειρότερο": ["κακός"], "χειρότεροι": ["κακός"], "χειρότερων": ["κακός"], "χειρότερου": ["κακός"],
        "βέλτιστος": ["καλός"], "βέλτιστη": ["καλός"], "βέλτιστης": ["καλός"], "βέλτιστο": ["καλός"], "βέλτιστοι": ["καλός"], "βέλτιστων": ["καλός"], "βέλτιστου": ["καλός"],
        "ελάχιστος": ["λίγος"], "ελάχιστα": ["λίγος"], "ελάχιστοι": ["λίγος"], "ελάχιστων": ["λίγος"], "ελάχιστη": ["λίγος"], "ελάχιστης": ["λίγος"], "ελάχιστο": ["λίγος"], "ελάχιστου": ["λίγος"],
        "πλείστος": ["πολύς"], "πλείστου": ["πολύς"], "πλείστων": ["πολύς"],
        "πολλή": ["πολύ"], "πολύς": ["πολύ"], "πολλύ": ["πολύ"], "πολλύς": ["πολύ"]
    },
    "noun": {
        "λευτεριά": ["ελευθερία"], "καφέδες": ["καφές"], "ποιήματα": ["ποίημα"]
    },
    "det": {
        "του": ["το"], "των": ["το"], "τους": ["το"], "τις": ["τη"], "τα": ["το"], "οι": ["ο", "η"]
    },
    "verb": {
        "είσαι": ["είμαι"], "είναι": ["είμαι"], "είμαστε": ["είμαι"], "είστε": ["είμαι"], "είσαστε": ["είμαι"], "ήμουν": ["είμαι"], "ήσουν": ["είμαι"], "ήταν": ["είμαι"], "ήμαστε": ["είμαι"], "ήμασταν": ["είμαι"],
        "είπα": ["λέω"], "είπες": ["λέω"], "είπε": ["λέω"], "είπαμε": ["λέω"], "είπατε": ["λέω"], "είπαν": ["λέω"], "είπανε": ["λέω"], "πει": ["λέω"], "πω": ["λέω"],
        "πάω": ["πηγαίνω"], "πάς": ["πηγαίνω"], "πας": ["πηγαίνω"], "πάει": ["πηγαίνω"], "πάμε": ["πηγαίνω"], "πάτε": ["πηγαίνω"], "πάνε": ["πηγαίνω"],
        "πήγα": ["πηγαίνω"], "πήγες": ["πηγαίνω"], "πήγε": ["πηγαίνω"], "πήγαμε": ["πηγαίνω"], "πήγατε": ["πηγαίνω"], "πήγαν": ["πηγαίνω"], "πήγανε": ["πηγαίνω"],
        "έπαιζα": ["παίζω"], "έπαιζες": ["παίζω"], "έπαιζε": ["παίζω"], "έπαιζαν": ["παίζω,"], "έπαιξα": ["παίζω"], "έπαιξες": ["παίζω"], "έπαιξε": ["παίζω"],
        "έτρωγα": ["τρώω"], "έτρωγε": ["τρώω"],
        "είχα": ["έχω"], "είχες": ["έχω"], "είχε": ["έχω"], "είχαμε": ["έχω"], "είχατε": ["έχω"], "είχαν": ["έχω"], "είχανε": ["έχω"],
        "έπαιρνα": ["παίρνω"], "έπαιρνες": ["παίρνω"], "έπαιρνε": ["παίρνω"], "έπαιρναν": ["παίρνω"],
        "εδίνα": ["δίνω"], "εδίνες": ["δίνω"], "εδίνε": ["δίνω"], "εδίναν": ["δίνω"],
        "έκανα": ["κάνω"], "έκανες": ["κάνω"], "έκανε": ["κάνω"], "έκαναν": ["κάνω"],
        "ήθελα": ["θέλω"], "ήθελες": ["θέλω"], "ήθελε": ["θέλω"], "ήθελαν": ["θέλω"],
        "έβλεπα": ["βλέπω"], "έβλεπες": ["βλέπω"], "έβλεπε": ["βλέπω"], "έβλεπαν": ["βλέπω"],
        "είδα": ["βλέπω"], "είδες": ["βλέπω"], "είδε": ["βλέπω"], "είδαμε": ["βλέπω"], "είδατε": ["βλέπω"], "είδαν": ["βλέπω"],
        "έφερνα": ["φέρνω"], "έφερνες": ["φέρνω"], "έφερνε": ["φέρνω"], "έφερναν": ["φέρνω"],
        "έφερα": ["φέρω"], "έφερες": ["φέρω"], "έφερε": ["φέρω"], "έφεραν": ["φέρω"],
        "έλαβα": ["λαμβάνω"], "έλαβες": ["λαμβάνω"], "έλαβε": ["λαμβάνω"], "έλαβαν": ["λαμβάνω"],
        "έβρισκα": ["βρίσκω"], "έβρισκες": ["βρίσκω"], "έβρισκε": ["βρίσκω"], "έβρισκαν": ["βρίσκω"],
        "ήξερα": ["ξέρω"], "ήξερες": ["ξέρω"], "ήξερε": ["ξέρω"], "ήξεραν": ["ξέρω"],
        "ανέφερα": ["αναφέρω"], "ανέφερες": ["αναφέρω"], "ανέφερε": ["αναφέρω"], "ανέφεραν": ["αναφέρω"],
        "έβαζα": ["βάζω"], "έβαζες": ["βάζω"], "έβαζε": ["βάζω"], "έβαζαν": ["βάζω"],
        "έμεινα": ["μένω"], "έμεινες": ["μένω"], "έμεινε": ["μένω"], "έμειναν": ["μένω"],
        "έβγαζα": ["βγάζω"], "έβγαζες": ["βγάζω"], "έβγαζε": ["βγάζω"], "έβγαζαν": ["βγάζω"],
        "έμπαινα": ["μπαίνω"], "έμπαινες": ["μπαίνω"], "έμπαινε": ["μπαίνω"], "έμπαιναν": ["μπαίνω"],
        "βγήκα": ["βγαίνω"], "βγήκες": ["βγαίνω"], "βγήκε": ["βγαίνω"], "βγήκαμε": ["βγαίνω"], "βγήκατε": ["βγαίνω"], "βγήκαν": ["βγαίνω"],
        "έπεφτα": ["πέφτω"], "έπεφτες": ["πέφτω"], "έπεφτε": ["πέφτω"], "έπεφταν": ["πέφτω"], "έπεσα": ["πέφτω"], "έπεσες": ["πέφτω"], "έπεσε": ["πέφτω"], "έπεσαν": ["πέφτω"],
        "έστειλα": ["στέλνω"], "έστειλες": ["στέλνω"], "έστειλε": ["στέλνω"], "έστειλαν": ["στέλνω"],
        "έφυγα": ["φεύγω"], "έφυγες": ["φεύγω"], "έφυγαν": ["φεύγω"],
        "έμαθα": ["μαθαίνω"], "έμαθες": ["μαθαίνω"], "έμαθε": ["μαθαίνω"], "έμαθαν": ["μαθαίνω"],
        "υπέβαλλα": ["υποβάλλω"], "υπέβαλλες": ["υποβάλλω"], "υπέβαλλε": ["υποβάλλω"], "υπέβαλλαν": ["υποβάλλω"],
        "έπινα": ["πίνω"], "έπινες": ["πίνω"], "έπινε": ["πίνω"], "έπιναν": ["πίνω"],
        "ήπια": ["πίνω"], "ήπιες": ["πίνω"], "ήπιε": ["πίνω"], "ήπιαμε": ["πίνω"], "ήπιατε": ["πίνω"], "ήπιαν": ["πίνω"],
        "ετύχα": ["τυχαίνω"], "ετύχες": ["τυχαίνω"], "ετύχε": ["τυχαίνω"], "ετύχαν": ["τυχαίνω"],
        "φάω": ["τρώω"], "φάς": ["τρώω"], "φάει": ["τρώω"], "φάμε": ["τρώω"], "φάτε": ["τρώω"], "φάνε": ["τρώω"], "φάν": ["τρώω"],
        "έτρωγες": ["τρώω"], "τρώγαμε": ["τρώω"], "τρώγατε": ["τρώω"], "τρώγανε": ["τρώω"], "τρώγαν": ["τρώω"],
        "πέρασα": ["περνώ"], "πέρασες": ["περνώ"], "πέρασε": ["περνώ"], "πέρασαμε": ["περνώ"], "πέρασατε": ["περνώ"], "πέρασαν": ["περνώ"],
        "έγδαρα": ["γδάρω"], "έγδαρες": ["γδάρω"], "έγδαρε": ["γδάρω"], "έγδαραν": ["γδάρω"],
        "έβγαλα": ["βγάλω"], "έβγαλες": ["βγάλω"], "έβγαλε": ["βγάλω"], "έβγαλαν": ["βγάλω"],
        "έφθασα": ["φτάνω"], "έφθασες": ["φτάνω"], "έφθασε": ["φτάνω"], "έφθασαν": ["φτάνω"]
    }
}

File diff suppressed because one or more lines are too long
@@ -1,139 +0,0 @@ (file deleted)
{
    "adj": [
        ["οί", "ός"], ["ών", "ός"], ["ού", "ός"], ["ή", "ός"], ["ής", "ός"], ["ές", "ός"], ["οι", "ος"], ["ων", "ος"], ["ου", "ος"], ["ο", "ος"], ["α", "ος"], ["ώδη", "ώδες"],
        ["ύτερη", "ός"], ["ύτερης", "ός"], ["ύτερων", "ός"], ["ύτερος", "ός"], ["ύτερου", "ός"]
    ],
    "noun": [
        ["ιού", "ί"], ["ιά", "ί"], ["ιών", "ί"], ["ηριού", "ήρι"], ["ια", "ι"], ["ηριών", "ήρι"], ["ας", "α"], ["ες", "α"], ["ων", "α"], ["άς", "ά"], ["ές", "ά"], ["ών", "ά"],
        ["ής", "ή"], ["ές", "ή"], ["ών", "ή"], ["ές", "ής"], ["ών", "ής"], ["ου", "ο"], ["α", "ο"], ["ων", "ο"], ["ητήματος", "ήτημα"], ["ητήματα", "ήτημα"], ["ητημάτων", "ήτημα"],
        ["τος", ""], ["τα", "α"], ["ομάτων", "όμα"], ["ού", "ός"], ["οί", "ός"], ["ών", "ός"], ["ς", ""], ["ες", "α"], ["ιών", "ία"], ["α", "ας"], ["δων", ""]
    ],
    "verb": [
        ["εις", "ω"], ["ει", "ω"], ["ουμε", "ω"], ["ετε", "ω"], ["ουνε", "ω"], ["ουν", "ω"], ["είς", "ώ"], ["εί", "ώ"], ["ούν", "ώ"],
        ["εσαι", "ομαι"], ["εται", "ομαι"], ["ανόμαστε", "άνομαι"], ["εστε", "ομαι"], ["ονται", "ομαι"],
        ["άς", "ώ"], ["άει", "ώ"], ["άμε", "ώ"], ["άτε", "ώ"], ["άνε", "ώ"], ["άν", "ώ"], ["άω", "ώ"], ["ώ", "άω"],
        ["ιζόμουν", "ίζομαι"], ["ιζόσουν", "ίζομαι"], ["ιζόταν", "ίζομαι"], ["ιζόμασταν", "ίζομαι"], ["ιζόσασταν", "ίζομαι"], ["ονταν", "ομαι"],
        ["όμουν", "άμαι"], ["όσουν", "άμαι"], ["όταν", "άμαι"], ["όμασταν", "άμαι"], ["όσασταν", "άμαι"], ["όντουσταν", "άμαι"],
        ["ούσα", "ώ"], ["ούσες", "ώ"], ["ούσε", "ώ"], ["ούσαμε", "ώ"], ["ούσατε", "ώ"], ["ούσαν", "ώ"], ["ούσανε", "ώ"],
        ["λαμε", "ζω"], ["λατε", "ζω"],
        ["ήρα", "άρω"], ["ήρες", "άρω"], ["ήρε", "άρω"], ["ήραμε", "άρω"], ["ήρατε", "άρω"], ["ήρα", "άρω"],
        ["ένησα", "ενώ"], ["ένησες", "ενώ"], ["ένησε", "ενώ"], ["ενήσαμε", "ενώ"], ["ένησατε", "ενώ"], ["ένησαν", "ενώ"],
        ["όνεσα", "ονώ"], ["όνεσες", "ονώ"], ["όνεσε", "ονώ"], ["έσαμε", "ώ"], ["έσατε", "ώ"],
        ["ισα", "ομαι"], ["ισες", "ομαι"], ["ισε", "ομαι"], ["αθίσαμε", "άθομαι"], ["αθίσατε", "άθομαι"], ["ισαν", "ομαι"],
        ["άπα", "απώ"], ["ά", "ώ"], ["οντας", "ω"], ["ξω", "ζω"], ["ξεις", "ζω"], ["ξουμε", "ζω"], ["ξετε", "ζω"], ["ξουν", "ζω"]
    ],
    "punct": [["“", "\""], ["”", "\""], ["‘", "'"], ["’", "'"]]
}
@@ -32,12 +32,6 @@ class EnglishDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     morph_rules = MORPH_RULES
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-    }
     single_orth_variants = [
         {"tags": ["NFP"], "variants": ["…", "..."]},
         {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
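With the per-language resources dicts gone, the same tables are reached through the Lookups container that Defaults.create_lookups() fills (see the language.py and util.py hunks later in this diff). A rough sketch, assuming the external lookups data is installed and the Lookups table API (has_table/get_table) behaves as in spaCy v2.2:

# Sketch: fetch the English lemma lookup table through the new code path.
from spacy.util import get_lang_class

lang_cls = get_lang_class("en")
lookups = lang_cls.Defaults.create_lookups()
if lookups.has_table("lemma_lookup"):    # table names mirror the removed `resources` keys
    table = lookups.get_table("lemma_lookup")
    print(table.get("spun", "spun"))     # expected "spin" when the English table is present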
@@ -1,31 +0,0 @@ (file deleted)
WordNet Release 3.0

This software and database is being provided to you, the LICENSEE, by Princeton University under the following license. By obtaining, using and/or copying this software and database, you agree that you have read, understood, and will comply with these terms and conditions.:

Permission to use, copy, modify and distribute this software and database and its documentation for any purpose and without fee or royalty is hereby granted, provided that you agree to comply with the following copyright notice and statements, including the disclaimer, and that the same appear on ALL copies of the software, database and documentation, including modifications that you make for internal use or for distribution.

WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.

THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.

The name of Princeton University or Princeton may not be used in advertising or publicity pertaining to distribution of the software and/or database. Title to copyright in this software, database and any associated documentation shall at all times remain with Princeton University and LICENSEE agrees to preserve same.

File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -1,35 +0,0 @@ (file deleted)
{
    "adj": [["er", ""], ["est", ""], ["er", "e"], ["est", "e"]],
    "noun": [["s", ""], ["ses", "s"], ["ves", "f"], ["xes", "x"], ["zes", "z"], ["ches", "ch"], ["shes", "sh"], ["men", "man"], ["ies", "y"]],
    "verb": [["s", ""], ["ies", "y"], ["es", "e"], ["es", ""], ["ed", "e"], ["ed", ""], ["ing", "e"], ["ing", ""]],
    "punct": [["“", "\""], ["”", "\""], ["‘", "'"], ["’", "'"]]
}
@@ -25,7 +25,6 @@ class SpanishDefaults(Language.Defaults):
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Spanish(Language):

File diff suppressed because it is too large

@@ -24,12 +24,6 @@ class PersianDefaults(Language.Defaults):
     tag_map = TAG_MAP
     suffixes = TOKENIZER_SUFFIXES
     writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
-    # extracted from Mojgan Seraji's Persian Universal Dependencies Corpus
-    resources = {
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-    }


 class Persian(Language):

File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -1,41 +0,0 @@ (file deleted)
{
    "adj": [["ین", ""], ["ترین", ""], ["ترین", ""], ["تر", ""], ["تر", ""], ["ای", ""]],
    "noun": [
        ["ایان", "ا"], ["ویان", "و"], ["ایانی", "ا"], ["ویانی", "و"], ["گان", "ه"], ["گانی", "ه"], ["گان", ""], ["گانی", ""], ["ان", ""], ["انی", ""],
        ["ات", ""], ["ات", "ه"], ["ات", "ت"], ["اتی", ""], ["اتی", "ه"], ["اتی", "ت"], ["ها", ""], ["ها", ""], ["های", ""], ["های", ""], ["هایی", ""], ["هایی", ""]
    ],
    "verb": [],
    "punct": [["“", "\""], ["”", "\""], ["‘", "'"], ["’", "'"]]
}
@@ -30,12 +30,6 @@ class FrenchDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-    }

     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):

@@ -1,9 +1,9 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
-from ....symbols import SCONJ, CCONJ
-from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
+from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
+from ...symbols import SCONJ, CCONJ
+from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos


 class FrenchLemmatizer(object):

File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -1,126 +0,0 @@ (file deleted)
{
    "adj": [
        ["a", "a"], ["aux", "al"], ["c", "c"], ["d", "d"], ["e", ""], ["é", "é"], ["eux", "eux"], ["f", "f"], ["i", "i"], ["ï", "ï"],
        ["l", "l"], ["m", "m"], ["n", "n"], ["o", "o"], ["p", "p"], ["r", "r"], ["s", ""], ["t", "t"], ["u", "u"], ["y", "y"]
    ],
    "noun": [
        ["a", "a"], ["à", "à"], ["â", "â"], ["b", "b"], ["c", "c"], ["ç", "ç"], ["d", "d"], ["e", "e"], ["é", "é"], ["è", "è"], ["ê", "ê"], ["ë", "ë"],
        ["f", "f"], ["g", "g"], ["h", "h"], ["i", "i"], ["î", "î"], ["ï", "ï"], ["j", "j"], ["k", "k"], ["l", "l"], ["m", "m"], ["n", "n"], ["o", "o"],
        ["ô", "ö"], ["ö", "ö"], ["p", "p"], ["q", "q"], ["r", "r"], ["t", "t"], ["u", "u"], ["û", "û"], ["v", "v"], ["w", "w"], ["y", "y"], ["z", "z"],
        ["s", ""], ["x", ""], ["nt(e", "nt"], ["nt(e)", "nt"], ["al(e", "ale"], ["é(", "é"], ["é(e", "é"], ["é.e", "é"], ["el(le", "el"],
        ["eurs(rices", "eur"], ["eur(rice", "eur"], ["eux(se", "eux"], ["ial(e", "ial"], ["er(ère", "er"], ["eur(se", "eur"], ["teur(trice", "teur"], ["teurs(trices", "teur"]
    ],
    "verb": [
        ["é", "er"], ["és", "er"], ["ée", "er"], ["ées", "er"], ["é", "er"], ["es", "er"], ["ons", "er"], ["ez", "er"], ["ent", "er"],
        ["ais", "er"], ["ait", "er"], ["ions", "er"], ["iez", "er"], ["aient", "er"], ["ai", "er"], ["as", "er"], ["a", "er"],
        ["âmes", "er"], ["âtes", "er"], ["èrent", "er"], ["erai", "er"], ["eras", "er"], ["era", "er"], ["erons", "er"], ["erez", "er"], ["eront", "er"],
        ["erais", "er"], ["erait", "er"], ["erions", "er"], ["eriez", "er"], ["eraient", "er"], ["asse", "er"], ["asses", "er"], ["ât", "er"],
        ["assions", "er"], ["assiez", "er"], ["assent", "er"], ["ant", "er"], ["ante", "er"], ["ants", "er"], ["antes", "er"],
        ["u(er", "u"], ["és(ées", "er"], ["é()e", "er"], ["é()", "er"]
    ]
}
@@ -18,7 +18,6 @@ class CroatianDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Croatian(Language):

spacy/lang/hr/lemma_lookup.json (1313609)
File diff suppressed because it is too large

@@ -24,7 +24,6 @@ class HungarianDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     token_match = TOKEN_MATCH
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Hungarian(Language):

File diff suppressed because it is too large

@@ -30,7 +30,6 @@ class IndonesianDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
     tag_map = TAG_MAP
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Indonesian(Language):

File diff suppressed because it is too large

@@ -23,7 +23,6 @@ class ItalianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     infixes = TOKENIZER_INFIXES
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Italian(Language):

File diff suppressed because it is too large

@@ -30,7 +30,6 @@ class LithuanianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     morph_rules = MORPH_RULES
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Lithuanian(Language):

File diff suppressed because it is too large
@@ -25,11 +25,6 @@ class NorwegianDefaults(Language.Defaults):
     morph_rules = MORPH_RULES
     tag_map = TAG_MAP
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-    }


 class Norwegian(Language):

@@ -1,7 +0,0 @@ (file deleted)
Note on noun wordforms / lemmas:
All wordforms are extracted from Norsk Ordbank in Norwegian Bokmål 2005, updated 20180627
(CLARINO NB - Språkbanken), Nasjonalbiblioteket, Norway:
https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en

License:
Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)

File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,24 +0,0 @@ (file deleted)
{
    "adj": [["e", ""], ["ere", ""], ["est", ""], ["este", ""]],
    "noun": [["en", "e"], ["a", "e"], ["et", ""], ["er", "e"], ["ene", "e"]],
    "verb": [["er", "e"], ["et", "e"], ["a", "e"], ["es", "e"], ["te", "e"], ["år", "å"]],
    "punct": []
}
@@ -26,12 +26,6 @@ class DutchDefaults(Language.Defaults):
     tag_map = TAG_MAP
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    resources = {
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-    }

     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ....symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
+from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV


 class DutchLemmatizer(object):

File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -1,55 +0,0 @@ (file deleted)
{
    "adj": [["sten", ""], ["ende", "end"], ["ste", ""], ["st", ""], ["er", ""], ["en", ""], ["e", ""]],
    "noun": [
        ["heden", "heid"], ["elen", "eel"], ["ezen", "ees"], ["even", "eef"], ["ssen", "s"], ["rren", "r"], ["kken", "k"], ["bben", "b"],
        ["'er", ""], ["tje", ""], ["kje", ""], ["ici", "icus"], ["en", ""], ["ën", ""], ["'s", ""], ["s", ""]
    ],
    "verb": [["dden", "den"], ["tten", "ten"], ["dde", "den"], ["tte", "ten"], ["end", "en"], ["dt", "den"], ["de", "en"], ["te", "en"]],
    "num": [["sten", ""], ["tjes", ""], ["ste", ""], ["ën", ""], ["en", ""], ["de", ""], ["er", ""], ["ër", ""]],
    "punct": [["“", "\""], ["”", "\""], ["‘", "'"], ["’", "'"]]
}
@@ -27,7 +27,6 @@ class PortugueseDefaults(Language.Defaults):
     tag_map = TAG_MAP
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Portuguese(Language):

File diff suppressed because it is too large

@@ -24,7 +24,6 @@ class RomanianDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
     tag_map = TAG_MAP


File diff suppressed because it is too large

@@ -21,7 +21,6 @@ class SerbianDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Serbian(Language):

File diff suppressed because it is too large

@@ -29,10 +29,6 @@ class SwedishDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS
     morph_rules = MORPH_RULES
-    resources = {
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-    }


 class Swedish(Language):

File diff suppressed because it is too large
@@ -1,103 +0,0 @@ (file deleted)
{
    "noun": [
        ["t", ""], ["n", ""], ["na", ""], ["na", "e"], ["or", "a"], ["orna", "a"], ["et", ""], ["en", ""], ["en", "e"],
        ["er", ""], ["erna", ""], ["ar", "e"], ["ar", ""], ["lar", "el"], ["arna", "e"], ["arna", ""], ["larna", "el"]
    ],
    "verb": [
        ["r", ""], ["de", ""], ["t", ""], ["er", ""], ["te", ""], ["a", ""], ["e", ""], ["t", "d"], ["tt", "d"], ["tt", ""],
        ["ev", "iv"], ["ack", "ick"], ["ög", "yg"], ["it", ""], ["uckit", "ick"], ["ugit", "yg"], ["it", "et"], ["id", "ed"], ["ip", "ep"], ["iv", "ev"],
        ["in", "en"], ["ik", "ek"], ["ig", "eg"], ["ind", ""], ["inn", "ann"], ["nder", "nd"], ["inner", "inn"], ["and", "ind"], ["ann", "inn"], ["s", ""],
        ["anns", "inn"], ["undit", "ind"], ["unnit", "inn"], ["unnits", "inn"], ["uppit", "ipp"], ["ungit", "ing"], ["öd", "ud"], ["öt", "jut"], ["öt", "ut"],
        ["ög", "ug"], ["ögg", "ugg"], ["öng", "ung"], ["önk", "unk"], ["öt", "yt"], ["utit", "yt"], ["ös", "ys"], ["öv", "yv"], ["uvit", "yv"],
        ["öp", "yp"], ["upit", "yp"], ["ök", "yk"], ["ukit", "yk"], ["or", "ar"], ["öll", "all"], ["ät", "åt"], ["öll", "åll"], ["or", "är"],
        ["urit", "är"], ["åt", "ät"], ["ar", "är"], ["alt", "ält"], ["ultit", "ält"]
    ],
    "adj": [
        ["are", ""], ["ast", ""], ["re", ""], ["st", ""], ["ägre", "åg"], ["ägst", "åg"], ["ängre", "ång"], ["ängst", "ång"], ["örre", "or"], ["örst", "or"]
    ],
    "punct": [["“", "\""], ["”", "\""], ["‘", "'"], ["’", "'"]]
}
@@ -24,7 +24,6 @@ class TagalogDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Tagalog(Language):

@@ -1,9 +0,0 @@ (file deleted)
{
    "kaugnayan": "ugnay",
    "sangkatauhan": "tao",
    "kanayunan": "nayon",
    "pandaigdigan": "daigdig",
    "kasaysayan": "saysay",
    "kabayanihan": "bayani",
    "karuwagan": "duwag"
}
@@ -10,9 +10,6 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

-# Lemma data source:
-# http://st2.zargan.com/duyuru/Zargan_Linguistic_Resources_for_Turkish.html - Bilgin, O. (2016). Biçimbilimsel Bakımdan Karmaşık Türkçe Kelimelerin İşlenmesinde Frekans Etkileri (yayınlanmamış yüksek lisans tezi). Boğaziçi Üniversitesi, İstanbul. Erişim: http://st2.zargan.com/public/resources/turkish/frequency_effects_in_turkish.pdf
-

 class TurkishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

@@ -22,7 +19,6 @@ class TurkishDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Turkish(Language):

spacy/lang/tr/lemma_lookup.json (1333973)
File diff suppressed because it is too large
@@ -21,7 +21,6 @@ class UrduDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES
     writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
-    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Urdu(Language):

File diff suppressed because it is too large
@@ -25,7 +25,7 @@ from .compat import izip, basestring_
 from .gold import GoldParse
 from .scorer import Scorer
 from ._ml import link_vectors_to_models, create_default_optimizer
-from .attrs import IS_STOP
+from .attrs import IS_STOP, LANG
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
 from .lang.tokenizer_exceptions import TOKEN_MATCH

@@ -46,10 +46,15 @@ class BaseDefaults(object):

     @classmethod
     def create_lookups(cls, nlp=None):
-        root_path = util.get_module_path(cls)
+        root = util.get_module_path(cls)
+        filenames = {name: root / filename for name, filename in cls.resources}
+        if LANG in cls.lex_attr_getters:
+            lang = cls.lex_attr_getters[LANG](None)
+            user_lookups = util.get_entry_point(util.ENTRY_POINTS.lookups, lang, {})
+            filenames.update(user_lookups)
         lookups = Lookups()
-        for name, filename in cls.resources.items():
-            data = util.load_language_data(root_path / filename)
+        for name, filename in filenames.items():
+            data = util.load_language_data(filename)
             lookups.add_table(name, data)
         return lookups

@@ -168,7 +173,7 @@ class Language(object):
         100,000 characters in one text.
         RETURNS (Language): The newly constructed object.
         """
-        user_factories = util.get_entry_points("spacy_factories")
+        user_factories = util.get_entry_points(util.ENTRY_POINTS.factories)
         self.factories.update(user_factories)
         self._meta = dict(meta)
         self._path = None
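create_lookups() now merges any tables registered for the language code under the spacy_lookups entry point into the default filenames before loading. A sketch of what a data package would have to expose for that to work; the package and module names are hypothetical, only the entry point group and the expected name-to-path mapping follow from the code above:

# setup.py of a hypothetical data package shipping English tables.
# The entry point name must be the language code, because create_lookups() calls
# util.get_entry_point(util.ENTRY_POINTS.lookups, lang, {}) and merges the result
# into `filenames`, which maps table names to paths for util.load_language_data().
from setuptools import setup

setup(
    name="my-lookups-data",
    packages=["my_lookups_data"],
    entry_points={"spacy_lookups": ["en = my_lookups_data:en_tables"]},
)

# my_lookups_data/__init__.py would then define something like:
#     from pathlib import Path
#     en_tables = {"lemma_lookup": Path(__file__).parent / "en_lemma_lookup.json"}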
@@ -140,13 +140,6 @@ def lt_tokenizer():
     return get_lang_class("lt").Defaults.create_tokenizer()


-@pytest.fixture(scope="session")
-def lt_lemmatizer():
-    lang_cls = get_lang_class("lt")
-    lookups = lang_cls.Defaults.create_lookups()
-    return lang_cls.Defaults.create_lemmatizer(lookups=lookups)
-
-
 @pytest.fixture(scope="session")
 def nb_tokenizer():
     return get_lang_class("nb").Defaults.create_tokenizer()

@@ -157,13 +150,6 @@ def nl_tokenizer():
     return get_lang_class("nl").Defaults.create_tokenizer()


-@pytest.fixture
-def nl_lemmatizer(scope="session"):
-    lang_cls = get_lang_class("nl")
-    lookups = lang_cls.Defaults.create_lookups()
-    return lang_cls.Defaults.create_lemmatizer(lookups=lookups)
-
-
 @pytest.fixture(scope="session")
 def pl_tokenizer():
     return get_lang_class("pl").Defaults.create_tokenizer()
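Projects that still want the removed lemmatizer fixtures can recreate them on top of the external data. A sketch of the deleted lt_lemmatizer fixture, gated on the tables actually being installed (the has_table check is an assumption about the Lookups API; the rest mirrors the removed code above):

# conftest.py sketch for a downstream test suite.
import pytest
from spacy.util import get_lang_class


@pytest.fixture(scope="session")
def lt_lemmatizer():
    lang_cls = get_lang_class("lt")
    lookups = lang_cls.Defaults.create_lookups()
    if not lookups.has_table("lemma_lookup"):
        pytest.skip("Lithuanian lookup tables not installed")
    return lang_cls.Defaults.create_lemmatizer(lookups=lookups)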
@@ -1,18 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [("affaldsgruppernes", "affaldsgruppe"), ("detailhandelsstrukturernes", "detailhandelsstruktur"), ("kolesterols", "kolesterol"), ("åsyns", "åsyn")],
)
def test_da_lemmatizer_lookup_assigns(da_tokenizer, string, lemma):
    tokens = da_tokenizer(string)
    assert tokens[0].lemma_ == lemma

@@ -1,20 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [("Abgehängten", "Abgehängte"), ("engagierte", "engagieren"), ("schließt", "schließen"), ("vorgebenden", "vorgebend"), ("die", "der"), ("Die", "der")],
)
def test_de_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
    tokens = de_tokenizer(string)
    assert tokens[0].lemma_ == lemma

@@ -124,9 +124,3 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
 def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
     tokens = en_tokenizer(text)
     assert tokens[0].norm_ == norm
-
-
-@pytest.mark.parametrize("text", ["faster", "fastest", "better", "best"])
-def test_en_lemmatizer_handles_irreg_adverbs(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert tokens[0].lemma_ in ["fast", "well"]
@@ -1,30 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest


def test_fr_lemmatizer_verb(fr_tokenizer):
    tokens = fr_tokenizer("Qu'est-ce que tu fais?")
    assert tokens[0].lemma_ == "que"
    assert tokens[1].lemma_ == "être"
    assert tokens[5].lemma_ == "faire"


def test_fr_lemmatizer_noun_verb_2(fr_tokenizer):
    tokens = fr_tokenizer("Les abaissements de température sont gênants.")
    assert tokens[4].lemma_ == "être"


@pytest.mark.xfail(
    reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN"
)
def test_fr_lemmatizer_noun(fr_tokenizer):
    tokens = fr_tokenizer("il y a des Costaricienne.")
    assert tokens[4].lemma_ == "Costaricain"


def test_fr_lemmatizer_noun_2(fr_tokenizer):
    tokens = fr_tokenizer("Les abaissements de température sont gênants.")
    assert tokens[1].lemma_ == "abaissement"
    assert tokens[5].lemma_ == "gênant"

@@ -1,20 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [("trčao", "trčati"), ("adekvatnim", "adekvatan"), ("dekontaminacijama", "dekontaminacija"), ("filologovih", "filologov"), ("je", "biti"), ("se", "sebe")],
)
def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma):
    tokens = hr_tokenizer(string)
    assert tokens[0].lemma_ == lemma

@@ -1,20 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest

# fmt: off
TEST_CASES = [
    (["Galime", "vadinti", "gerovės", "valstybe", ",", "turime", "išvystytą", "socialinę", "apsaugą", ",", "sveikatos", "apsaugą", "ir", "prieinamą", "švietimą", "."],
     ["galėti", "vadintas", "gerovė", "valstybė", ",", "turėti", "išvystytas", "socialinis", "apsauga", ",", "sveikata", "apsauga", "ir", "prieinamas", "švietimas", "."]),
    (["taip", ",", "uoliai", "tyrinėjau", "ir", "pasirinkau", "geriausią", "variantą", "."],
     ["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."]),
]
# fmt: on


@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
    assert lemmas == [lt_lemmatizer.lookup_table.get(token, token) for token in tokens]
@@ -1,143 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest


# Calling the Lemmatizer directly
# Imitates behavior of:
#   Tagger.set_annotations()
#     -> vocab.morphology.assign_tag_id()
#     -> vocab.morphology.assign_tag_id()
#     -> Token.tag.__set__
#     -> vocab.morphology.assign_tag(...)
#     -> ... -> Morphology.assign_tag(...)
#     -> self.lemmatize(analysis.tag.pos, token.lex.orth,

noun_irreg_lemmatization_cases = [
    ("volkeren", "volk"), ("vaatje", "vat"), ("verboden", "verbod"), ("ijsje", "ijsje"), ("slagen", "slag"), ("verdragen", "verdrag"), ("verloven", "verlof"), ("gebeden", "gebed"),
    ("gaten", "gat"), ("staven", "staf"), ("aquariums", "aquarium"), ("podia", "podium"), ("holen", "hol"), ("lammeren", "lam"), ("bevelen", "bevel"), ("wegen", "weg"),
    ("moeilijkheden", "moeilijkheid"), ("aanwezigheden", "aanwezigheid"), ("goden", "god"), ("loten", "lot"), ("kaarsen", "kaars"), ("leden", "lid"), ("glaasje", "glas"),
    ("eieren", "ei"), ("vatten", "vat"), ("kalveren", "kalf"), ("padden", "pad"), ("smeden", "smid"), ("genen", "gen"), ("beenderen", "been"),
]

verb_irreg_lemmatization_cases = [
    ("liep", "lopen"), ("hief", "heffen"), ("begon", "beginnen"), ("sla", "slaan"), ("aangekomen", "aankomen"), ("sproot", "spruiten"), ("waart", "zijn"), ("snoof", "snuiven"),
    ("spoot", "spuiten"), ("ontbeet", "ontbijten"), ("gehouwen", "houwen"), ("afgewassen", "afwassen"), ("deed", "doen"), ("schoven", "schuiven"), ("gelogen", "liegen"),
    ("woog", "wegen"), ("gebraden", "braden"), ("smolten", "smelten"), ("riep", "roepen"), ("aangedaan", "aandoen"), ("vermeden", "vermijden"), ("stootten", "stoten"),
    ("ging", "gaan"), ("geschoren", "scheren"), ("gesponnen", "spinnen"), ("reden", "rijden"), ("zochten", "zoeken"), ("leed", "lijden"), ("verzonnen", "verzinnen"),
]


@pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
def test_nl_lemmatizer_noun_lemmas_irreg(nl_lemmatizer, text, lemma):
    pos = "noun"
    lemmas_pred = nl_lemmatizer(text, pos)
    assert lemma == sorted(lemmas_pred)[0]


@pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
def test_nl_lemmatizer_verb_lemmas_irreg(nl_lemmatizer, text, lemma):
    pos = "verb"
    lemmas_pred = nl_lemmatizer(text, pos)
    assert lemma == sorted(lemmas_pred)[0]


@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_verb_lemmas_reg(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_adjective_lemmas(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_determiner_lemmas(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_adverb_lemmas(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_pronoun_lemmas(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


# Using the lemma lookup table only
@pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma):
    lemma_pred = nl_lemmatizer.lookup(text)
    assert lemma_pred in (lemma, text)


@pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma):
    lemma_pred = nl_lemmatizer.lookup(text)
    assert lemma_pred in (lemma, text)
@@ -1,18 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [("câini", "câine"), ("expedițiilor", "expediție"), ("pensete", "pensetă"), ("erau", "fi")],
)
def test_ro_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
    tokens = ro_tokenizer(string)
    assert tokens[0].lemma_ == lemma

@@ -1,20 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [("најадекватнији", "адекватан"), ("матурирао", "матурирати"), ("планираћемо", "планирати"), ("певају", "певати"), ("нама", "ми"), ("се", "себе")],
)
def test_sr_lemmatizer_lookup_assigns(sr_tokenizer, string, lemma):
    tokens = sr_tokenizer(string)
    assert tokens[0].lemma_ == lemma

@@ -1,20 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [("DNA-profilernas", "DNA-profil"), ("Elfenbenskustens", "Elfenbenskusten"), ("abortmotståndarens", "abortmotståndare"), ("kolesterols", "kolesterol"), ("portionssnusernas", "portionssnus"), ("åsyns", "åsyn")],
)
def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma):
    tokens = sv_tokenizer(string)
    assert tokens[0].lemma_ == lemma

@@ -1,21 +0,0 @@ (file deleted)
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [("evlerimizdeki", "ev"), ("işlerimizi", "iş"), ("biran", "biran"), ("bitirmeliyiz", "bitir"), ("isteklerimizi", "istek"), ("karşılaştırmamızın", "karşılaştır"), ("çoğulculuktan", "çoğulcu")],
)
def test_tr_lemmatizer_lookup_assigns(tr_tokenizer, string, lemma):
    tokens = tr_tokenizer(string)
    assert tokens[0].lemma_ == lemma

@@ -1,15 +0,0 @@ (file deleted)
# coding: utf8
from __future__ import unicode_literals

from ..util import get_doc


def test_issue4104(en_vocab):
    """Test that English lookup lemmatization of spun & dry are correct
    expected mapping = {'dry': 'dry', 'spun': 'spin', 'spun-dry': 'spin-dry'}
    """
    text = "dry spun spun-dry"
    doc = get_doc(en_vocab, [t for t in text.split(" ")])
    # using a simple list to preserve order
    expected = ["dry", "spin", "spin-dry"]
    assert [token.lemma_ for token in doc] == expected
@@ -37,6 +37,15 @@ _data_path = Path(__file__).parent / "data"
 _PRINT_ENV = False


+class ENTRY_POINTS(object):
+    """Available entry points to register extensions."""
+
+    factories = "spacy_factories"
+    languages = "spacy_languages"
+    displacy_colors = "spacy_displacy_colors"
+    lookups = "spacy_lookups"
+
+
 def set_env_log(value):
     global _PRINT_ENV
     _PRINT_ENV = value

@@ -62,7 +71,7 @@ def get_lang_class(lang):
     """
     global LANGUAGES
     # Check if an entry point is exposed for the language code
-    entry_point = get_entry_point("spacy_languages", lang)
+    entry_point = get_entry_point(ENTRY_POINTS.languages, lang)
     if entry_point is not None:
         LANGUAGES[lang] = entry_point
         return entry_point

@@ -278,17 +287,19 @@ def get_entry_points(key):
     return result


-def get_entry_point(key, value):
+def get_entry_point(key, value, default=None):
     """Check if registered entry point is available for a given name and
     load it. Otherwise, return None.

     key (unicode): Entry point name.
     value (unicode): Name of entry point to load.
+    default: Optional default value to return.
     RETURNS: The loaded entry point or None.
     """
     for entry_point in pkg_resources.iter_entry_points(key):
         if entry_point.name == value:
             return entry_point.load()
+    return default


 def is_in_jupyter():
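For reference, the new default argument lets callers avoid None checks when no package registers a matching entry point; this is exactly how Language.Defaults.create_lookups() uses it earlier in this diff:

# Returns {} unless some installed package registers a "spacy_lookups" entry point named "en".
from spacy import util

user_lookups = util.get_entry_point(util.ENTRY_POINTS.lookups, "en", {})
print(user_lookups)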